Kade Heyborne 71c305a201
Major refactoring: v0.2.0 - Simplify to core dictation & read-aloud features
This is a comprehensive refactoring that transforms the dictation service from a
complex multi-mode application into two clean, focused features:
1. Voice dictation with system tray icon
2. On-demand read-aloud via Ctrl+middle-click

## Key Changes

### Dictation Service Enhancements
- Add GTK/AppIndicator3 system tray icon for visual status
- Remove all notification spam (dictation start/stop/status)
- Icon states: microphone-muted (OFF) → microphone-high (ON)
- Click tray icon to toggle dictation (same as Alt+D); a sketch follows this list
- Simplify ai_dictation_simple.py by removing conversation mode
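
A minimal sketch of how the tray toggle can be wired up with AppIndicator3, assuming the toggle works by creating and removing the same `listening.lock` file the dictation loop polls. The icon names come from the list above; the menu item, helper names, and lock-file path are illustrative, not the actual service code:

```python
import os
import gi
gi.require_version('Gtk', '3.0')
gi.require_version('AppIndicator3', '0.1')
from gi.repository import Gtk, AppIndicator3

LOCK_FILE = "listening.lock"  # assumed: same lock file the dictation loop polls

def toggle_dictation(_menu_item):
    """Create/remove the lock file and swap the tray icon accordingly."""
    if os.path.exists(LOCK_FILE):
        os.remove(LOCK_FILE)
        indicator.set_icon_full("microphone-muted", "Dictation off")
    else:
        open(LOCK_FILE, "w").close()
        indicator.set_icon_full("microphone-high", "Dictation on")

indicator = AppIndicator3.Indicator.new(
    "dictation", "microphone-muted",
    AppIndicator3.IndicatorCategory.APPLICATION_STATUS)
indicator.set_status(AppIndicator3.IndicatorStatus.ACTIVE)

# AppIndicator icons are activated through a menu, so the toggle lives in a menu item.
menu = Gtk.Menu()
item = Gtk.MenuItem(label="Toggle dictation (Alt+D)")
item.connect("activate", toggle_dictation)
menu.append(item)
menu.show_all()
indicator.set_menu(menu)

Gtk.main()
```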

### Read-Aloud Service Redesign
- Replace automatic clipboard reader with on-demand Ctrl+middle-click
- New middle_click_reader.py service
- Works anywhere: highlight text, then Ctrl+middle-click to read it aloud (sketched after this list)
- Uses Edge-TTS (Christopher voice) with mpv playback
- Lock file prevents feedback with dictation service
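
A minimal sketch of how such an on-demand reader can be wired together, assuming pynput listeners, `xclip` for the X11 primary selection, and the shared `listening.lock` file; the helper names and paths are illustrative, not the actual middle_click_reader.py:

```python
import os
import asyncio
import subprocess
import tempfile
import edge_tts
from pynput import keyboard, mouse

DICTATION_LOCK = "listening.lock"   # assumed: lock file shared with the dictation service
VOICE = "en-US-ChristopherNeural"   # Edge-TTS "Christopher" voice
ctrl_held = False

def track_ctrl(key, pressed):
    global ctrl_held
    if key in (keyboard.Key.ctrl, keyboard.Key.ctrl_l, keyboard.Key.ctrl_r):
        ctrl_held = pressed

async def speak(text):
    """Synthesise the text with Edge-TTS and play it back with mpv."""
    out = os.path.join(tempfile.gettempdir(), "read_aloud.mp3")
    await edge_tts.Communicate(text, VOICE).save(out)
    subprocess.run(["mpv", "--no-video", out], check=False)

def on_click(x, y, button, pressed):
    if not (pressed and button == mouse.Button.middle and ctrl_held):
        return
    if os.path.exists(DICTATION_LOCK):
        return  # dictation is live; skip to avoid reading back what is being typed
    # The highlighted text is the X11 primary selection (xclip assumed installed).
    text = subprocess.run(["xclip", "-o", "-selection", "primary"],
                          capture_output=True, text=True).stdout.strip()
    if text:
        asyncio.run(speak(text))  # the real service would likely do this off-thread

keyboard.Listener(on_press=lambda k: track_ctrl(k, True),
                  on_release=lambda k: track_ctrl(k, False)).start()
with mouse.Listener(on_click=on_click) as listener:
    listener.join()
```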

### Conversation Mode Removed
- Delete all VLLM/conversation code (VLLMClient, ConversationManager, TTS)
- Archive 5 old implementations to archive/old_implementations/
- Remove conversation-related scripts and services
- Clean separation of concerns for future reintegration if needed

### Dependencies Cleanup
- Remove: openai, aiohttp, pyttsx3, requests (conversation deps)
- Keep: PyGObject, pynput, sounddevice, vosk, numpy, edge-tts
- Net reduction: 4 packages removed, 6 core packages retained

### Testing Improvements
- Add test_dictation_service.py (8 tests; an illustrative example follows this list)
- Add test_middle_click.py (11 tests)
- Fix test_run.py to use correct model path
- Total: 19 unit tests passing
- Delete obsolete test files (test_suite, test_vllm_integration, etc.)
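
The new test files themselves are not shown here; as an illustration of the kind of behaviour they can pin down, here is a hypothetical check of the lock-file toggle (names and structure are assumptions, not the actual tests):

```python
import os
import tempfile
import unittest

class LockFileToggleTest(unittest.TestCase):
    """Checks that toggling dictation flips the lock file the services key off."""

    def setUp(self):
        self.lock = os.path.join(tempfile.mkdtemp(), "listening.lock")

    def toggle(self):
        # Mirrors what the tray icon / Alt+D toggle is described as doing.
        if os.path.exists(self.lock):
            os.remove(self.lock)
        else:
            open(self.lock, "w").close()

    def test_toggle_creates_then_removes_lock(self):
        self.assertFalse(os.path.exists(self.lock))
        self.toggle()
        self.assertTrue(os.path.exists(self.lock))
        self.toggle()
        self.assertFalse(os.path.exists(self.lock))

if __name__ == "__main__":
    unittest.main()
```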

### Documentation
- Add CHANGES.md with complete changelog
- Add docs/MIGRATION_GUIDE.md for upgrading
- Add README.md with quick start guide
- Update docs/README.md with current features only
- Add justfile for common tasks

### New Services & Scripts
- Add middle-click-reader.service (systemd)
- Add scripts/setup-middle-click-reader.sh
- Add desktop files for autostart
- Remove toggle-conversation.sh (obsolete)

## Impact

**Code Quality**
- Net change: -6,007 lines (596 added, 6,603 deleted)
- Simpler architecture, easier maintenance
- Better test coverage: 19 focused unit tests replace the previous mix of current and obsolete tests
- Cleaner separation of concerns

**User Experience**
- No notification spam during dictation
- Clean visual status via tray icon
- Full control over read-aloud (no unwanted readings)
- Better performance (fewer background processes)

**Privacy**
- No conversation data stored
- No VLLM connection needed
- All processing stays local, except the text sent to Edge-TTS for read-aloud

## Migration Notes

Users upgrading should:
1. Run `uv sync` to update dependencies
2. Restart `dictation.service` to get the tray icon
3. Run `scripts/setup-middle-click-reader.sh` for the new read-aloud
4. Remove the old `read-aloud.service` if present

See docs/MIGRATION_GUIDE.md for details.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-10 19:11:06 -07:00

#!/mnt/storage/Development/dictation-service/.venv/bin/python
import os
import sys
import queue
import json
import time
import subprocess
import threading
import sounddevice as sd
from vosk import Model, KaldiRecognizer
from pynput.keyboard import Controller
import logging
import gi
gi.require_version('Gtk', '3.0')
gi.require_version('Gdk', '3.0')
from gi.repository import Gtk, GLib, Gdk  # Gdk provides the RGBA colors used by StreamingWindow

# Setup logging
logging.basicConfig(filename='/home/universal/.gemini/tmp/428d098e581799ff7817b2001dd545f7b891975897338dd78498cc16582e004f/debug.log', level=logging.DEBUG)
# Configuration
MODEL_NAME = "vosk-model-small-en-us-0.15" # Small model (fast)
SAMPLE_RATE = 16000
BLOCK_SIZE = 8000
LOCK_FILE = "listening.lock"
# Global State
is_listening = False
keyboard = Controller()
q = queue.Queue()
streaming_window = None
last_partial_text = ""
typing_buffer = ""


class StreamingWindow(Gtk.Window):
    """Small floating window that shows real-time transcription"""

    def __init__(self):
        super().__init__(title="Live Dictation")
        self.set_title("Live Dictation")
        self.set_default_size(400, 150)
        self.set_keep_above(True)
        self.set_decorated(True)
        self.set_resizable(True)
        self.set_position(Gtk.WindowPosition.MOUSE)
        # Set styling
        self.set_border_width(10)
        self.override_background_color(Gtk.StateFlags.NORMAL, Gdk.RGBA(0.2, 0.2, 0.2, 0.9))
        # Create label for showing text
        self.label = Gtk.Label()
        self.label.set_text("🎤 Listening...")
        self.label.set_justify(Gtk.Justification.LEFT)
        self.label.set_line_wrap(True)
        self.label.set_max_width_chars(50)
        # Style the label
        self.label.override_color(Gtk.StateFlags.NORMAL, Gdk.RGBA(1, 1, 1, 1))
        # Add to window
        self.add(self.label)
        self.show_all()
        logging.info("Streaming window created")

    def update_text(self, text, is_partial=False):
        """Update the window with new text (safe to call from any thread)"""
        GLib.idle_add(self._update_text_glib, text, is_partial)

    def _update_text_glib(self, text, is_partial):
        """Update text in main thread"""
        if is_partial:
            display_text = f"💭 {text}"
        else:
            display_text = f"{text}"
        self.label.set_text(display_text)
        # Auto-hide after 3 seconds of final text
        if not is_partial and text:
            threading.Timer(3.0, self.hide_window).start()

    def hide_window(self):
        """Hide the window"""
        GLib.idle_add(self.hide)

    def close_window(self):
        """Close the window"""
        GLib.idle_add(self.destroy)


def send_notification(title, message):
    """Sends a system notification"""
    try:
        subprocess.run(["notify-send", "-t", "2000", title, message], capture_output=True)
    except FileNotFoundError:
        pass


def download_model_if_needed():
    """Checks if model exists, otherwise downloads it"""
    if not os.path.exists(MODEL_NAME):
        logging.info(f"Model '{MODEL_NAME}' not found. Downloading...")
        try:
            subprocess.check_call(["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"])
            subprocess.check_call(["unzip", f"{MODEL_NAME}.zip"])
            logging.info("Download complete.")
        except Exception as e:
            logging.error(f"Error downloading model: {e}")
            sys.exit(1)


def audio_callback(indata, frames, time, status):
    """Audio callback for processing sound"""
    if status:
        logging.warning(status)
    if is_listening:
        q.put(bytes(indata))


def process_partial_text(text):
    """Process and display partial results (streaming)"""
    global last_partial_text
    if text != last_partial_text:
        last_partial_text = text
        logging.info(f"Partial: {text}")
        # Update streaming window
        if streaming_window:
            streaming_window.update_text(text, is_partial=True)


def process_final_text(text):
    """Process and type final results"""
    global typing_buffer, last_partial_text
    if not text:
        return
    # Format text
    formatted = text.strip()
    if not formatted:
        return
    # Capitalize first letter
    formatted = formatted[0].upper() + formatted[1:]
    logging.info(f"Final: {formatted}")
    # Update streaming window
    if streaming_window:
        streaming_window.update_text(formatted, is_partial=False)
    # Type the text
    try:
        keyboard.type(formatted + " ")
        logging.info(f"Typed: {formatted}")
    except Exception as e:
        logging.error(f"Error typing: {e}")
    # Clear partial text
    last_partial_text = ""


def show_streaming_window():
    """Create and show the streaming window"""
    global streaming_window
    try:
        # Create the window on the GTK main loop
        def create_window():
            global streaming_window
            streaming_window = StreamingWindow()

        GLib.idle_add(create_window)

        # Start the GTK main loop in a background thread if it is not already running
        if Gtk.main_level() == 0:
            threading.Thread(target=Gtk.main, daemon=True).start()
        time.sleep(0.5)  # Give window time to appear
    except Exception as e:
        logging.error(f"Could not create streaming window: {e}")
        # Fallback to just notifications
        send_notification("Dictation", "🎤 Listening...")


def hide_streaming_window():
    """Hide the streaming window"""
    global streaming_window
    if streaming_window:
        streaming_window.close_window()
        streaming_window = None


def main():
    global is_listening
    try:
        logging.info("Starting enhanced streaming dictation")

        # Model Setup
        download_model_if_needed()
        logging.info("Loading model...")
        model = Model(MODEL_NAME)
        recognizer = KaldiRecognizer(model, SAMPLE_RATE)
        logging.info("Model loaded successfully")
        logging.info("=== Enhanced Dictation Ready ===")
        logging.info("Features: Real-time streaming + visual feedback")

        with sd.RawInputStream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE, dtype='int16',
                               channels=1, callback=audio_callback):
            logging.info("Audio stream opened")
            while True:
                # Check lock file for state changes
                lock_exists = os.path.exists(LOCK_FILE)
                if lock_exists and not is_listening:
                    is_listening = True
                    logging.info("[Dictation] STARTED listening...")
                    send_notification("Dictation", "🎤 Streaming enabled")
                    show_streaming_window()
                elif not lock_exists and is_listening:
                    is_listening = False
                    logging.info("[Dictation] STOPPED listening.")
                    send_notification("Dictation", "🛑 Stopped")
                    hide_streaming_window()

                # If not listening, save CPU
                if not is_listening:
                    time.sleep(0.1)
                    continue

                # Process audio when listening
                try:
                    data = q.get(timeout=0.1)
                    if recognizer.AcceptWaveform(data):
                        # Utterance finished: type the final text
                        result = json.loads(recognizer.Result())
                        final_text = result.get("text", "")
                        if final_text:
                            process_final_text(final_text)
                    else:
                        # Utterance in progress: show the partial text
                        partial = json.loads(recognizer.PartialResult())
                        partial_text = partial.get("partial", "")
                        if partial_text:
                            process_partial_text(partial_text)
                except queue.Empty:
                    pass
                except Exception as e:
                    logging.error(f"Audio processing error: {e}")
    except KeyboardInterrupt:
        logging.info("Exiting...")
        hide_streaming_window()
    except Exception as e:
        logging.error(f"Fatal error: {e}")


if __name__ == "__main__":
    main()