This is a comprehensive refactoring that transforms the dictation service from a complex multi-mode application into two clean, focused features: 1. Voice dictation with system tray icon 2. On-demand read-aloud via Ctrl+middle-click ## Key Changes ### Dictation Service Enhancements - Add GTK/AppIndicator3 system tray icon for visual status - Remove all notification spam (dictation start/stop/status) - Icon states: microphone-muted (OFF) → microphone-high (ON) - Click tray icon to toggle dictation (same as Alt+D) - Simplify ai_dictation_simple.py by removing conversation mode ### Read-Aloud Service Redesign - Replace automatic clipboard reader with on-demand Ctrl+middle-click - New middle_click_reader.py service - Works anywhere: highlight text, Ctrl+middle-click to read - Uses Edge-TTS (Christopher voice) with mpv playback - Lock file prevents feedback with dictation service ### Conversation Mode Removed - Delete all VLLM/conversation code (VLLMClient, ConversationManager, TTS) - Archive 5 old implementations to archive/old_implementations/ - Remove conversation-related scripts and services - Clean separation of concerns for future reintegration if needed ### Dependencies Cleanup - Remove: openai, aiohttp, pyttsx3, requests (conversation deps) - Keep: PyGObject, pynput, sounddevice, vosk, numpy, edge-tts - Net reduction: 4 packages removed, 6 core packages retained ### Testing Improvements - Add test_dictation_service.py (8 tests) ✅ - Add test_middle_click.py (11 tests) ✅ - Fix test_run.py to use correct model path - Total: 19 unit tests passing - Delete obsolete test files (test_suite, test_vllm_integration, etc.) 
### Documentation - Add CHANGES.md with complete changelog - Add docs/MIGRATION_GUIDE.md for upgrading - Add README.md with quick start guide - Update docs/README.md with current features only - Add justfile for common tasks ### New Services & Scripts - Add middle-click-reader.service (systemd) - Add scripts/setup-middle-click-reader.sh - Add desktop files for autostart - Remove toggle-conversation.sh (obsolete) ## Impact **Code Quality** - Net change: -6,007 lines (596 added, 6,603 deleted) - Simpler architecture, easier maintenance - Better test coverage (19 tests vs mixed before) - Cleaner separation of concerns **User Experience** - No notification spam during dictation - Clean visual status via tray icon - Full control over read-aloud (no unwanted readings) - Better performance (fewer background processes) **Privacy** - No conversation data stored - No VLLM connection needed - All processing local except Edge-TTS text ## Migration Notes Users upgrading should: 1. Run `uv sync` to update dependencies 2. Restart dictation.service to get tray icon 3. Run scripts/setup-middle-click-reader.sh for new read-aloud 4. Remove old read-aloud.service if present See docs/MIGRATION_GUIDE.md for details. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
131 lines
4.9 KiB
Python
Executable File
131 lines
4.9 KiB
Python
Executable File
#!/mnt/storage/Development/dictation-service/.venv/bin/python
|
|
import os
|
|
import sys
|
|
import queue
|
|
import json
|
|
import time
|
|
import subprocess
|
|
import threading
|
|
import sounddevice as sd
|
|
from vosk import Model, KaldiRecognizer
|
|
from pynput.keyboard import Controller
|
|
import logging
|
|
|
|
# All diagnostics go to a fixed debug log file at DEBUG level.
# NOTE(review): the path is machine-specific and absolute — consider
# deriving it from an env var or XDG directory; confirm it exists at startup.
logging.basicConfig(filename='/home/universal/.gemini/tmp/428d098e581799ff7817b2001dd545f7b891975897338dd78498cc16582e004f/debug.log', level=logging.DEBUG)

# Configuration
MODEL_NAME = "vosk-model-small-en-us-0.15"  # Small model (fast)
# MODEL_NAME = "vosk-model-en-us-0.22"  # Larger model (more accurate, higher RAM)
SAMPLE_RATE = 16000  # Hz; must match the rate KaldiRecognizer is built with
BLOCK_SIZE = 8000    # frames per audio callback block
# Presence of this file toggles dictation on; removal toggles it off.
# NOTE(review): relative path — resolved against the process CWD; verify the
# toggle script (Alt+D / tray icon) creates it in the same directory.
LOCK_FILE = "listening.lock"

# Global State
is_listening = False     # flipped by the lock-file watcher loop in main()
keyboard = Controller()  # pynput controller used to type recognized text
q = queue.Queue()        # raw audio blocks handed from audio_callback to main()
|
def send_notification(title, message):
    """Show a transient (2 s) desktop notification via notify-send.

    Best-effort: silently does nothing when notify-send is not installed.
    """
    cmd = ["notify-send", "-t", "2000", title, message]
    try:
        subprocess.run(cmd)
    except FileNotFoundError:
        # notify-send is optional; skip quietly when absent.
        pass
|
def download_model_if_needed():
    """Ensure the Vosk model directory exists, fetching it when missing.

    Downloads and unzips the model named by MODEL_NAME from alphacephei.com
    using the system wget/unzip tools. Exits the process with status 1 if
    any step of the download or extraction fails.
    """
    if os.path.exists(MODEL_NAME):
        return

    logging.info(f"Model '{MODEL_NAME}' not found.")
    logging.info("Downloading default model (approx 40MB)...")
    try:
        # Shelling out to wget/unzip avoids extra Python dependencies.
        archive = f"{MODEL_NAME}.zip"
        subprocess.check_call(["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"])
        subprocess.check_call(["unzip", archive])
        logging.info("Download complete.")
    except Exception as e:
        # The service is useless without a model; bail out entirely.
        logging.error(f"Error downloading model: {e}")
        sys.exit(1)
|
def audio_callback(indata, frames, time, status):
    """Sounddevice stream callback: enqueue raw audio while dictation is on.

    Runs on sounddevice's own thread; any stream status is logged, and audio
    blocks are copied into the module-level queue only when is_listening.
    """
    if status:
        logging.warning(status)
    if not is_listening:
        return
    q.put(bytes(indata))
|
def process_text(text):
    """Return *text* with its first character upper-cased and a trailing space.

    An empty string comes back unchanged (no trailing space added).
    """
    if not text:
        return ""
    # Basic sentence case: only the leading character is touched.
    first, rest = text[0], text[1:]
    return f"{first.upper()}{rest} "
|
def main():
    """Run the dictation service: load the Vosk model, then poll LOCK_FILE.

    Creating LOCK_FILE turns listening on; removing it turns listening off.
    While listening, audio blocks queued by audio_callback are fed to the
    recognizer and any recognized text is typed into the focused window
    via pynput. Exits with status 1 if the model cannot be loaded.
    """
    try:
        logging.info("Starting main function")
        # main() flips this flag; audio_callback reads it from another thread.
        global is_listening

        # 2. Model Setup
        download_model_if_needed()
        logging.info("Model check complete")
        logging.info("Loading model... (this may take a moment)")
        try:
            model = Model(MODEL_NAME)
            logging.info("Model loaded successfully")
        except Exception as e:
            # No model means nothing to recognize with — abort the service.
            logging.error(f"Failed to load model: {e}")
            sys.exit(1)

        recognizer = KaldiRecognizer(model, SAMPLE_RATE)
        logging.info("Recognizer created")

        logging.info("\n=== Ready ===")
        logging.info("Waiting for lock file to start dictation...")

        # 3. Main Audio Loop
        # We use raw input stream to keep latency low
        try:
            # The stream stays open for the process lifetime; gating happens
            # via is_listening inside audio_callback, not by closing the stream.
            with sd.RawInputStream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE, dtype='int16',
                                   channels=1, callback=audio_callback):
                logging.info("Audio stream opened")
                while True:
                    # If lock file exists, start listening
                    if os.path.exists(LOCK_FILE) and not is_listening:
                        is_listening = True
                        logging.info("\n[Dictation] STARTED listening...")
                        send_notification("Dictation", "🎤 Listening...")

                    # If lock file does not exist, stop listening
                    elif not os.path.exists(LOCK_FILE) and is_listening:
                        is_listening = False
                        logging.info("\n[Dictation] STOPPED listening.")
                        send_notification("Dictation", "🛑 Stopped.")

                    # If not listening, just sleep to save CPU
                    if not is_listening:
                        time.sleep(0.1)
                        continue

                    # If listening, process the queue
                    try:
                        # Short timeout so lock-file changes are noticed
                        # promptly even when no audio is arriving.
                        data = q.get(timeout=0.1)
                        if recognizer.AcceptWaveform(data):
                            # AcceptWaveform returned a finalized utterance.
                            result = json.loads(recognizer.Result())
                            text = result.get("text", "")
                            if text:
                                typed_text = process_text(text)
                                logging.info(f"Typing: {text}")
                                keyboard.type(typed_text)
                    except queue.Empty:
                        # No audio this tick — loop back to re-check the lock file.
                        pass

        except KeyboardInterrupt:
            logging.info("\nExiting...")
        except Exception as e:
            logging.error(f"\nError in audio loop: {e}")
    except Exception as e:
        # Last-resort guard so startup failures land in the debug log.
        logging.error(f"Error in main function: {e}")
|
# Script entry point: run the service only when executed directly.
if __name__ == "__main__":
    main()