#!/mnt/storage/Development/dictation-service/.venv/bin/python
import os
import sys
import queue
import json
import time
import subprocess
import threading
import sounddevice as sd
from vosk import Model, KaldiRecognizer
from pynput.keyboard import Controller
import logging

# Setup logging
logging.basicConfig(
    filename='/home/universal/.gemini/tmp/428d098e581799ff7817b2001dd545f7b891975897338dd78498cc16582e004f/debug.log',
    level=logging.DEBUG,
)

# Configuration
MODEL_NAME = "vosk-model-en-us-0.22"
SAMPLE_RATE = 16000
BLOCK_SIZE = 8000
LOCK_FILE = "listening.lock"

# Global State
is_listening = False
keyboard = Controller()
q = queue.Queue()
last_partial_text = ""
typing_thread = None
should_type = False


def send_notification(title, message, duration=2000):
    """Send a desktop notification via notify-send; ignore errors if it is unavailable."""
    try:
        subprocess.run(
            ["notify-send", "-t", str(duration), "-u", "low", title, message],
            capture_output=True,
            check=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        pass


def download_model_if_needed():
    """Download and unpack the Vosk model if it is not present (requires wget and unzip)."""
    if not os.path.exists(MODEL_NAME):
        logging.info(f"Model '{MODEL_NAME}' not found. Downloading...")
        try:
            subprocess.check_call(["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"])
            subprocess.check_call(["unzip", f"{MODEL_NAME}.zip"])
            logging.info("Download complete.")
        except Exception as e:
            logging.error(f"Error downloading model: {e}")
            sys.exit(1)


def audio_callback(indata, frames, time, status):
    """sounddevice callback: queue raw audio blocks while dictation is active."""
    if status:
        logging.warning(status)
    if is_listening:
        q.put(bytes(indata))


def process_partial_text(text):
    """Process and display partial results with real-time feedback"""
    global last_partial_text
    if text and text != last_partial_text:
        last_partial_text = text
        logging.info(f"💭 {text}")
        # Show brief notification for longer partial text
        if len(text) > 3:
            send_notification(
                "🎤 Speaking",
                text[:50] + "..." if len(text) > 50 else text,
                1000,
            )


def process_final_text(text):
    """Process and type final results immediately"""
    global last_partial_text, should_type
    if not text.strip():
        return

    # Format and clean text
    formatted = text.strip()

    # Filter out spurious single words that are likely false positives
    if len(formatted.split()) == 1 and formatted.lower() in ['the', 'a', 'an', 'uh', 'huh', 'um', 'hmm']:
        logging.info(f"⏭️ Filtered out spurious word: {formatted}")
        return

    # Filter out very short results that are likely noise
    if len(formatted) < 2:
        logging.info(f"⏭️ Filtered out too short: {formatted}")
        return

    formatted = formatted[0].upper() + formatted[1:] if formatted else formatted
    logging.info(f"✅ {formatted}")

    # Show final result notification briefly
    send_notification("✅ Said", formatted, 1500)

    # Type the text immediately
    try:
        keyboard.type(formatted + " ")
        logging.info(f"📝 Typed: {formatted}")
    except Exception as e:
        logging.error(f"Error typing: {e}")

    # Clear partial text
    last_partial_text = ""


def continuous_audio_processor():
    """Background thread for continuous audio processing"""
    recognizer = None
    while True:
        if is_listening and recognizer is None:
            # Initialize recognizer when we start listening
            try:
                model = Model(MODEL_NAME)
                recognizer = KaldiRecognizer(model, SAMPLE_RATE)
                logging.info("Audio processor initialized")
            except Exception as e:
                logging.error(f"Failed to initialize recognizer: {e}")
                time.sleep(1)
                continue
        elif not is_listening and recognizer is not None:
            # Clean up when we stop listening
            recognizer = None
            logging.info("Audio processor cleaned up")
            time.sleep(0.1)
            continue

        if not is_listening:
            time.sleep(0.1)
            continue

        # Process audio when listening
        try:
            data = q.get(timeout=0.1)
            if recognizer:
                if recognizer.AcceptWaveform(data):
                    # Final result for a completed utterance
                    result = json.loads(recognizer.Result())
                    final_text = result.get("text", "")
                    if final_text:
                        process_final_text(final_text)
                else:
                    # Partial (in-progress) result for real-time streaming feedback
                    partial = json.loads(recognizer.PartialResult())
                    partial_text = partial.get("partial", "")
                    if partial_text:
                        process_partial_text(partial_text)
        except queue.Empty:
            continue
        except Exception as e:
            logging.error(f"Audio processing error: {e}")
            time.sleep(0.1)


def show_streaming_feedback():
    """Show visual feedback when dictation starts"""
    # Initial notification
    send_notification("🎤 Dictation Active", "Speak now - text will appear live!", 3000)

    # Brief progress notifications
    def progress_notification():
        time.sleep(2)
        if is_listening:
            send_notification("🎤 Still Listening", "Continue speaking...", 2000)

    threading.Thread(target=progress_notification, daemon=True).start()


def main():
    global is_listening
    try:
        logging.info("Starting enhanced streaming dictation")

        # Model Setup
        download_model_if_needed()
        logging.info("Model ready")

        # Start audio processing thread
        audio_thread = threading.Thread(target=continuous_audio_processor, daemon=True)
        audio_thread.start()
        logging.info("Audio processor thread started")

        logging.info("=== Enhanced Dictation Ready ===")
        logging.info("Features: Real-time streaming + instant typing + visual feedback")

        # Open audio stream
        with sd.RawInputStream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE,
                               dtype='int16', channels=1, callback=audio_callback):
            logging.info("Audio stream opened")
            while True:
                # Check lock file for state changes
                lock_exists = os.path.exists(LOCK_FILE)
                if lock_exists and not is_listening:
                    is_listening = True
                    logging.info("[Dictation] STARTED - Enhanced streaming mode")
                    show_streaming_feedback()
                elif not lock_exists and is_listening:
                    is_listening = False
                    logging.info("[Dictation] STOPPED")
                    send_notification("🛑 Dictation Stopped", "Press Alt+D to resume", 2000)

                # Sleep to prevent busy waiting
                time.sleep(0.05)
    except KeyboardInterrupt:
        logging.info("Exiting...")
    except Exception as e:
        logging.error(f"Fatal error: {e}")


if __name__ == "__main__":
    main()
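
# Usage note (a sketch under assumptions, not defined by this script): the
# service only watches for LOCK_FILE in its working directory, so the "Alt+D"
# hotkey mentioned in the stop notification presumably points at an external
# toggle. A minimal toggle helper could look like the following, where the
# absolute lock path is an assumption:
#
#     import os
#     LOCK = "/mnt/storage/Development/dictation-service/listening.lock"
#     if os.path.exists(LOCK):
#         os.remove(LOCK)              # stop dictation
#     else:
#         open(LOCK, "a").close()      # start dictation
#
# Because LOCK_FILE is a relative path, the toggle helper and this service
# must agree on the working directory for the lock file to be seen.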