#!/mnt/storage/Development/dictation-service/.venv/bin/python
"""Lock-file-controlled dictation service.

Microphone audio is streamed through a Vosk recognizer and every recognized
utterance is typed into the focused window with pynput.  Dictation is active
for as long as the file named by LOCK_FILE exists in the working directory.
"""
import json
import logging
import os
import queue
import subprocess
import sys
import threading
import time

import sounddevice as sd
from pynput.keyboard import Controller
from vosk import Model, KaldiRecognizer

logging.basicConfig(filename='/home/universal/.gemini/tmp/428d098e581799ff7817b2001dd545f7b891975897338dd78498cc16582e004f/debug.log', level=logging.DEBUG)

# Configuration
MODEL_NAME = "vosk-model-small-en-us-0.15"  # Small model (fast)
# MODEL_NAME = "vosk-model-en-us-0.22"  # Larger model (more accurate, higher RAM)
SAMPLE_RATE = 16000
BLOCK_SIZE = 8000
LOCK_FILE = "listening.lock"

# Global State
is_listening = False  # Written by main(), read by audio_callback().
keyboard = Controller()
q = queue.Queue()  # Raw audio blocks handed from the audio callback to main().


def send_notification(title, message):
    """Show a short desktop notification announcing a state change.

    Best effort: if ``notify-send`` is not installed the call is a no-op.
    """
    try:
        subprocess.run(["notify-send", "-t", "2000", title, message], check=False)
    except FileNotFoundError:
        pass  # notify-send might not be installed


def download_model_if_needed():
    """Download and unpack the Vosk model when MODEL_NAME is not on disk.

    The archive is fetched with ``wget`` and extracted with ``unzip`` into the
    current working directory.  The process exits with status 1 if either
    command is missing or fails.
    """
    if os.path.exists(MODEL_NAME):
        return

    logging.info(f"Model '{MODEL_NAME}' not found.")
    logging.info("Downloading default model (approx 40MB)...")
    try:
        # System tools are used instead of requests/zipfile to avoid extra
        # Python dependencies.
        subprocess.check_call(["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"])
        subprocess.check_call(["unzip", f"{MODEL_NAME}.zip"])
        logging.info("Download complete.")
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the tool ran but failed; OSError: tool missing.
        logging.exception(f"Error downloading model: {e}")
        sys.exit(1)


def audio_callback(indata, frames, time, status):
    """Queue one captured audio block while dictation is active.

    Passed as the ``callback`` of the input stream.  The ``time`` parameter
    shadows the ``time`` module inside this function only; the name is kept
    so the signature stays unchanged.
    """
    if status:
        logging.warning(status)
    if is_listening:
        q.put(bytes(indata))


def process_text(text):
    """Return *text* with its first character upper-cased and a trailing space.

    An empty or falsy *text* yields an empty string.
    """
    if not text:
        return ""
    # Basic Sentence Case
    formatted = text[0].upper() + text[1:]
    return formatted + " "


def _type_result(result_json):
    """Type the ``text`` field of a recognizer JSON result, if it is non-empty."""
    text = json.loads(result_json).get("text", "")
    if text:
        typed_text = process_text(text)
        logging.info(f"Typing: {text}")
        keyboard.type(typed_text)


def _discard_queued_audio():
    """Drop every audio block currently waiting in the queue."""
    while True:
        try:
            q.get_nowait()
        except queue.Empty:
            return


def _flush_pending(recognizer):
    """Finish the current utterance after listening has been switched off.

    Audio that was captured but not yet decoded is fed to the recognizer, then
    the final result is typed.  Without this the last words of a session are
    lost and the leftover recognizer state leaks into the next session.
    """
    while True:
        try:
            data = q.get_nowait()
        except queue.Empty:
            break
        if recognizer.AcceptWaveform(data):
            _type_result(recognizer.Result())
    _type_result(recognizer.FinalResult())


def main():
    """Load the model, open the microphone and run the dictation loop.

    The loop polls for LOCK_FILE: dictation starts when the file appears and
    stops when it is removed.  Runs until interrupted; exits with status 1 if
    the model cannot be downloaded or loaded.
    """
    global is_listening

    try:
        logging.info("Starting main function")

        # 2. Model Setup
        download_model_if_needed()
        logging.info("Model check complete")

        logging.info("Loading model... (this may take a moment)")
        try:
            model = Model(MODEL_NAME)
            logging.info("Model loaded successfully")
        except Exception as e:
            logging.exception(f"Failed to load model: {e}")
            sys.exit(1)

        recognizer = KaldiRecognizer(model, SAMPLE_RATE)
        logging.info("Recognizer created")

        logging.info("\n=== Ready ===")
        logging.info("Waiting for lock file to start dictation...")

        # 3. Main Audio Loop
        # We use raw input stream to keep latency low
        try:
            with sd.RawInputStream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE,
                                   dtype='int16', channels=1,
                                   callback=audio_callback):
                logging.info("Audio stream opened")

                while True:
                    # Check once per iteration so both branches see the same state.
                    lock_present = os.path.exists(LOCK_FILE)

                    # If lock file exists, start listening
                    if lock_present and not is_listening:
                        # A block may have slipped in after the previous stop.
                        _discard_queued_audio()
                        is_listening = True
                        logging.info("\n[Dictation] STARTED listening...")
                        send_notification("Dictation", "🎤 Listening...")

                    # If lock file does not exist, stop listening
                    elif not lock_present and is_listening:
                        # Stop the callback from queueing first, then flush.
                        is_listening = False
                        _flush_pending(recognizer)
                        logging.info("\n[Dictation] STOPPED listening.")
                        send_notification("Dictation", "🛑 Stopped.")

                    # If not listening, just sleep to save CPU
                    if not is_listening:
                        time.sleep(0.1)
                        continue

                    # If listening, process the queue
                    try:
                        data = q.get(timeout=0.1)
                    except queue.Empty:
                        continue

                    if recognizer.AcceptWaveform(data):
                        _type_result(recognizer.Result())

        except KeyboardInterrupt:
            logging.info("\nExiting...")
        except Exception as e:
            logging.exception(f"\nError in audio loop: {e}")

    except Exception as e:
        logging.exception(f"Error in main function: {e}")


if __name__ == "__main__":
    main()