This is a comprehensive refactoring that transforms the dictation service from a complex multi-mode application into two clean, focused features: 1. Voice dictation with system tray icon 2. On-demand read-aloud via Ctrl+middle-click ## Key Changes ### Dictation Service Enhancements - Add GTK/AppIndicator3 system tray icon for visual status - Remove all notification spam (dictation start/stop/status) - Icon states: microphone-muted (OFF) → microphone-high (ON) - Click tray icon to toggle dictation (same as Alt+D) - Simplify ai_dictation_simple.py by removing conversation mode ### Read-Aloud Service Redesign - Replace automatic clipboard reader with on-demand Ctrl+middle-click - New middle_click_reader.py service - Works anywhere: highlight text, Ctrl+middle-click to read - Uses Edge-TTS (Christopher voice) with mpv playback - Lock file prevents feedback with dictation service ### Conversation Mode Removed - Delete all VLLM/conversation code (VLLMClient, ConversationManager, TTS) - Archive 5 old implementations to archive/old_implementations/ - Remove conversation-related scripts and services - Clean separation of concerns for future reintegration if needed ### Dependencies Cleanup - Remove: openai, aiohttp, pyttsx3, requests (conversation deps) - Keep: PyGObject, pynput, sounddevice, vosk, numpy, edge-tts - Net reduction: 4 packages removed, 6 core packages retained ### Testing Improvements - Add test_dictation_service.py (8 tests) ✅ - Add test_middle_click.py (11 tests) ✅ - Fix test_run.py to use correct model path - Total: 19 unit tests passing - Delete obsolete test files (test_suite, test_vllm_integration, etc.) 
### Documentation - Add CHANGES.md with complete changelog - Add docs/MIGRATION_GUIDE.md for upgrading - Add README.md with quick start guide - Update docs/README.md with current features only - Add justfile for common tasks ### New Services & Scripts - Add middle-click-reader.service (systemd) - Add scripts/setup-middle-click-reader.sh - Add desktop files for autostart - Remove toggle-conversation.sh (obsolete) ## Impact **Code Quality** - Net change: -6,007 lines (596 added, 6,603 deleted) - Simpler architecture, easier maintenance - Better test coverage (19 tests vs mixed before) - Cleaner separation of concerns **User Experience** - No notification spam during dictation - Clean visual status via tray icon - Full control over read-aloud (no unwanted readings) - Better performance (fewer background processes) **Privacy** - No conversation data stored - No VLLM connection needed - All processing local except Edge-TTS text ## Migration Notes Users upgrading should: 1. Run `uv sync` to update dependencies 2. Restart dictation.service to get tray icon 3. Run scripts/setup-middle-click-reader.sh for new read-aloud 4. Remove old read-aloud.service if present See docs/MIGRATION_GUIDE.md for details. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
217 lines · 7.1 KiB · Python
#!/mnt/storage/Development/dictation-service/.venv/bin/python
|
|
import os
|
|
import sys
|
|
import queue
|
|
import json
|
|
import time
|
|
import subprocess
|
|
import threading
|
|
import sounddevice as sd
|
|
from vosk import Model, KaldiRecognizer
|
|
from pynput.keyboard import Controller
|
|
import logging
|
|
|
|
# Setup logging
# All diagnostics go to a debug log file; nothing is printed to stdout.
logging.basicConfig(filename='/home/universal/.gemini/tmp/428d098e581799ff7817b2001dd545f7b891975897338dd78498cc16582e004f/debug.log', level=logging.DEBUG)

# Configuration
MODEL_NAME = "vosk-model-en-us-0.22"  # Vosk acoustic model directory (downloaded on first run)
SAMPLE_RATE = 16000  # Hz; must match the rate the Vosk model was trained at
BLOCK_SIZE = 8000  # frames per audio callback (~0.5 s at 16 kHz)
LOCK_FILE = "listening.lock"  # presence of this file toggles dictation on/off
# NOTE(review): LOCK_FILE is a relative path — behavior depends on the service's CWD; confirm intended.

# Global State
is_listening = False  # mirrors the lock file; gates audio capture and processing
keyboard = Controller()  # pynput controller used to type recognized text into the focused window
q = queue.Queue()  # raw audio chunks flowing from the callback to the worker thread
last_partial_text = ""  # last partial hypothesis shown, used to suppress repeats
typing_thread = None  # NOTE(review): never assigned or read elsewhere in this file
should_type = False  # NOTE(review): declared global in process_final_text but never used
|
def send_notification(title, message, duration=2000):
    """Best-effort desktop notification via notify-send.

    Failures (missing binary, non-zero exit) are swallowed on purpose:
    a lost notification must never take down the dictation service.
    """
    cmd = ["notify-send", "-t", str(duration), "-u", "low", title, message]
    try:
        subprocess.run(cmd, capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        pass  # notifications are cosmetic; ignore delivery problems
|
def download_model_if_needed():
    """Fetch and unpack the Vosk model into the working directory if absent.

    Uses the external `wget` and `unzip` tools (as the deployment does) and
    exits the process on failure, since the service cannot run without a
    model.

    FIX: remove the downloaded archive after a successful extract — the
    original left the multi-hundred-MB zip on disk — and catch the specific
    failure types instead of a blanket Exception.
    """
    if os.path.exists(MODEL_NAME):
        return  # model already present; nothing to do

    archive = f"{MODEL_NAME}.zip"
    logging.info(f"Model '{MODEL_NAME}' not found. Downloading...")
    try:
        subprocess.check_call(["wget", f"https://alphacephei.com/vosk/models/{archive}"])
        subprocess.check_call(["unzip", archive])
        # Clean up the archive; best-effort so a locked file can't abort startup.
        try:
            os.remove(archive)
        except OSError:
            pass
        logging.info("Download complete.")
    except (subprocess.CalledProcessError, OSError) as e:
        logging.error(f"Error downloading model: {e}")
        sys.exit(1)
|
def audio_callback(indata, frames, time, status):
    """sounddevice callback: forward raw audio to the work queue while active."""
    if status:
        logging.warning(status)
    if not is_listening:
        return  # dictation is off: drop the audio instead of queueing it
    q.put(bytes(indata))
|
def process_partial_text(text):
    """Log an in-progress recognition hypothesis and surface it to the user."""
    global last_partial_text

    # Skip empty hypotheses and exact repeats of the one already shown.
    if not text or text == last_partial_text:
        return

    last_partial_text = text
    logging.info(f"💭 {text}")

    # Only notify for hypotheses long enough to be meaningful speech.
    if len(text) > 3:
        preview = text if len(text) <= 50 else text[:50] + "..."
        send_notification("🎤 Speaking", preview, 1000)
|
def process_final_text(text):
    """Clean, filter, announce, and type a final recognition result.

    Filters likely false positives (lone filler words, sub-2-character
    results), sentence-cases the first letter, shows a brief notification,
    and types the text followed by a space into the focused window.

    FIX: dropped 'should_type' from the global declaration — it was
    declared but never read or written here.
    """
    global last_partial_text

    if not text.strip():
        return

    # Format and clean text
    formatted = text.strip()

    # Lone filler words are almost always recognition noise, not dictation.
    if len(formatted.split()) == 1 and formatted.lower() in ['the', 'a', 'an', 'uh', 'huh', 'um', 'hmm']:
        logging.info(f"⏭️ Filtered out spurious word: {formatted}")
        return

    # Anything under two characters is very likely noise as well.
    if len(formatted) < 2:
        logging.info(f"⏭️ Filtered out too short: {formatted}")
        return

    # Sentence-case the first character (emptiness guard retained for safety).
    formatted = formatted[0].upper() + formatted[1:] if formatted else formatted

    logging.info(f"✅ {formatted}")

    # Brief confirmation so the user sees what was committed.
    send_notification("✅ Said", formatted, 1500)

    # Inject the text via synthetic keystrokes; pynput can fail on some
    # displays (e.g. Wayland), so log and keep the service alive.
    try:
        keyboard.type(formatted + " ")
        logging.info(f"📝 Typed: {formatted}")
    except Exception as e:
        logging.error(f"Error typing: {e}")

    # Reset partial-result tracking for the next utterance.
    last_partial_text = ""
|
def continuous_audio_processor():
    """Background worker: pull raw audio off the queue and run recognition.

    Lazily builds the Vosk recognizer when listening starts and discards it
    when listening stops.

    FIX 1: the original called recognizer.PartialResult() *before* feeding
    the chunk with AcceptWaveform(), and serialized PartialResult() twice
    per chunk — so partial hypotheses lagged one audio block. The Vosk
    contract is: AcceptWaveform(data) first; if it returns True read
    Result(), otherwise read PartialResult().

    FIX 2: the expensive Model is now loaded once and reused across listen
    sessions instead of being reloaded on every toggle.
    """
    model = None
    recognizer = None

    while True:
        if is_listening and recognizer is None:
            # Initialize recognizer when we start listening
            try:
                if model is None:
                    model = Model(MODEL_NAME)  # expensive: load only once
                recognizer = KaldiRecognizer(model, SAMPLE_RATE)
                logging.info("Audio processor initialized")
            except Exception as e:
                logging.error(f"Failed to initialize recognizer: {e}")
                time.sleep(1)
                continue

        elif not is_listening and recognizer is not None:
            # Clean up when we stop listening
            recognizer = None
            logging.info("Audio processor cleaned up")
            time.sleep(0.1)
            continue

        if not is_listening:
            time.sleep(0.1)
            continue

        # Process audio when listening
        try:
            data = q.get(timeout=0.1)

            if recognizer is None:
                continue

            if recognizer.AcceptWaveform(data):
                # Utterance boundary: a final result is available.
                result = json.loads(recognizer.Result())
                final_text = result.get("text", "")
                if final_text:
                    process_final_text(final_text)
            else:
                # Mid-utterance: surface the current partial hypothesis.
                partial = json.loads(recognizer.PartialResult())
                partial_text = partial.get("partial", "")
                if partial_text:
                    process_partial_text(partial_text)

        except queue.Empty:
            continue
        except Exception as e:
            logging.error(f"Audio processing error: {e}")
            time.sleep(0.1)
|
def show_streaming_feedback():
    """Notify the user that dictation is live, with one delayed follow-up."""
    # Announce immediately.
    send_notification("🎤 Dictation Active", "Speak now - text will appear live!", 3000)

    def _remind_later():
        # After a short pause, confirm we are still capturing audio —
        # but only if the user has not already toggled dictation off.
        time.sleep(2)
        if is_listening:
            send_notification("🎤 Still Listening", "Continue speaking...", 2000)

    threading.Thread(target=_remind_later, daemon=True).start()
|
def main():
    """Entry point: prepare the model, start the worker, and poll the lock file.

    The lock file is the on/off switch: an external hotkey script creates
    and removes it, and this loop mirrors its presence into the global
    is_listening flag, emitting feedback on each transition.
    """
    global is_listening
    try:
        logging.info("Starting enhanced streaming dictation")

        # Model Setup
        download_model_if_needed()
        logging.info("Model ready")

        # Background recognition worker.
        worker = threading.Thread(target=continuous_audio_processor, daemon=True)
        worker.start()
        logging.info("Audio processor thread started")

        logging.info("=== Enhanced Dictation Ready ===")
        logging.info("Features: Real-time streaming + instant typing + visual feedback")

        # Open audio stream
        stream = sd.RawInputStream(
            samplerate=SAMPLE_RATE,
            blocksize=BLOCK_SIZE,
            dtype='int16',
            channels=1,
            callback=audio_callback,
        )
        with stream:
            logging.info("Audio stream opened")

            while True:
                # Check lock file for state changes
                lock_exists = os.path.exists(LOCK_FILE)

                if lock_exists != is_listening:
                    is_listening = lock_exists
                    if lock_exists:
                        logging.info("[Dictation] STARTED - Enhanced streaming mode")
                        show_streaming_feedback()
                    else:
                        logging.info("[Dictation] STOPPED")
                        send_notification("🛑 Dictation Stopped", "Press Alt+D to resume", 2000)

                # Sleep to prevent busy waiting
                time.sleep(0.05)

    except KeyboardInterrupt:
        logging.info("\nExiting...")
    except Exception as e:
        logging.error(f"Fatal error: {e}")
|
# Run the service only when executed directly (not on import).
if __name__ == "__main__":
    main()