#!/mnt/storage/Development/dictation-service/.venv/bin/python
"""Streaming dictation service: Vosk speech recognition with a live GTK
feedback window. Recognized text is typed into the focused application
via pynput. Listening is toggled externally by creating/removing
LOCK_FILE."""
import os
import sys
import queue
import json
import time
import subprocess
import threading
import sounddevice as sd
from vosk import Model, KaldiRecognizer
from pynput.keyboard import Controller
import logging
import gi
gi.require_version('Gtk', '3.0')
gi.require_version('Gdk', '3.0')
# BUG FIX: Gdk is used below (Gdk.RGBA in StreamingWindow.__init__) but was
# never imported at module level — only locally inside show_streaming_window —
# so constructing the window raised NameError. Import it here with Gtk/GLib.
from gi.repository import Gtk, Gdk, GLib

# Setup logging
logging.basicConfig(filename='/home/universal/.gemini/tmp/428d098e581799ff7817b2001dd545f7b891975897338dd78498cc16582e004f/debug.log', level=logging.DEBUG)

# Configuration
MODEL_NAME = "vosk-model-small-en-us-0.15"  # Small model (fast)
SAMPLE_RATE = 16000   # Hz; must match the rate given to KaldiRecognizer
BLOCK_SIZE = 8000     # frames per audio callback
LOCK_FILE = "listening.lock"  # presence of this file enables listening
                              # NOTE(review): relative path — depends on CWD; confirm intended

# Global State
is_listening = False
keyboard = Controller()       # pynput keyboard used to type recognized text
q = queue.Queue()             # raw audio blocks, producer: audio_callback
streaming_window = None       # StreamingWindow instance while listening, else None
last_partial_text = ""        # last partial shown, to suppress duplicate updates
typing_buffer = ""


class StreamingWindow(Gtk.Window):
    """Small floating window that shows real-time transcription."""

    def __init__(self):
        super().__init__(title="Live Dictation")
        self.set_title("Live Dictation")
        self.set_default_size(400, 150)
        self.set_keep_above(True)   # stay on top of the app being dictated into
        self.set_decorated(True)
        self.set_resizable(True)
        self.set_position(Gtk.WindowPosition.MOUSE)

        # Set styling: dark translucent background
        self.set_border_width(10)
        self.override_background_color(Gtk.StateFlags.NORMAL, Gdk.RGBA(0.2, 0.2, 0.2, 0.9))

        # Create label for showing text
        self.label = Gtk.Label()
        self.label.set_text("🎤 Listening...")
        self.label.set_justify(Gtk.Justification.LEFT)
        self.label.set_line_wrap(True)
        self.label.set_max_width_chars(50)

        # Style the label: white text on the dark background
        self.label.override_color(Gtk.StateFlags.NORMAL, Gdk.RGBA(1, 1, 1, 1))

        # Add to window
        self.add(self.label)
        self.show_all()
        logging.info("Streaming window created")

    def update_text(self, text, is_partial=False):
        """Update the window with new text.

        Safe to call from any thread: the actual widget update is
        marshalled onto the GTK main loop via GLib.idle_add.
        """
        GLib.idle_add(self._update_text_glib, text, is_partial)

    def _update_text_glib(self, text, is_partial):
        """Update text in main thread. Partial results are shown with a
        thought bubble, final results with a check mark."""
        if is_partial:
            display_text = f"💭 {text}"
        else:
            display_text = f"✅ {text}"
        self.label.set_text(display_text)

        # Auto-hide after 3 seconds of final text
        if not is_partial and text:
            threading.Timer(3.0, self.hide_window).start()

    def hide_window(self):
        """Hide the window (thread-safe via idle_add)."""
        GLib.idle_add(self.hide)

    def close_window(self):
        """Close the window (thread-safe via idle_add)."""
        GLib.idle_add(self.destroy)


def send_notification(title, message):
    """Sends a system notification via notify-send (best effort)."""
    try:
        subprocess.run(["notify-send", "-t", "2000", title, message], capture_output=True)
    except FileNotFoundError:
        # notify-send not installed; notifications are optional
        pass


def download_model_if_needed():
    """Checks if model exists, otherwise downloads and unzips it.

    Exits the process if the download fails, since the service cannot
    run without a model.
    """
    if not os.path.exists(MODEL_NAME):
        logging.info(f"Model '{MODEL_NAME}' not found. Downloading...")
        try:
            subprocess.check_call(["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"])
            subprocess.check_call(["unzip", f"{MODEL_NAME}.zip"])
            logging.info("Download complete.")
        except Exception as e:
            logging.error(f"Error downloading model: {e}")
            sys.exit(1)


def audio_callback(indata, frames, time, status):
    """sounddevice callback: enqueue raw audio while listening.

    Runs on the audio thread; keep it minimal. Blocks are dropped when
    not listening so the queue does not grow unbounded.
    """
    if status:
        logging.warning(status)
    if is_listening:
        q.put(bytes(indata))


def process_partial_text(text):
    """Process and display partial (in-progress) recognition results.

    Only updates the window when the text actually changed, to avoid
    redundant GTK updates.
    """
    global last_partial_text
    if text != last_partial_text:
        last_partial_text = text
        logging.info(f"Partial: {text}")
        # Update streaming window
        if streaming_window:
            streaming_window.update_text(text, is_partial=True)


def process_final_text(text):
    """Process a final recognition result: show it and type it.

    Strips whitespace, capitalizes the first letter, updates the
    feedback window, then types the text (plus a trailing space) into
    the focused application.
    """
    global typing_buffer, last_partial_text
    if not text:
        return
    # Format text
    formatted = text.strip()
    if not formatted:
        return
    # Capitalize first letter
    formatted = formatted[0].upper() + formatted[1:]
    logging.info(f"Final: {formatted}")
    # Update streaming window
    if streaming_window:
        streaming_window.update_text(formatted, is_partial=False)
    # Type the text
    try:
        keyboard.type(formatted + " ")
        logging.info(f"Typed: {formatted}")
    except Exception as e:
        logging.error(f"Error typing: {e}")
    # Clear partial text
    last_partial_text = ""
def show_streaming_window():
    """Create and show the streaming window and start the GTK main loop.

    Falls back to a plain desktop notification if the window cannot be
    created (e.g. no display).
    """
    global streaming_window
    try:
        from gi.repository import Gdk
        Gdk.init([])

        # Run in main thread
        def create_window():
            global streaming_window
            streaming_window = StreamingWindow()

        # Use idle_add to run in main thread
        GLib.idle_add(create_window)

        # Start GTK main loop in separate thread.
        # BUG FIX: original did `import gtk; gtk.main()` — lowercase `gtk`
        # is the Python-2-era PyGTK module, which is not available under
        # GObject Introspection / Python 3, so the loop thread died with
        # ImportError. Use the already-imported GI Gtk instead.
        threading.Thread(target=Gtk.main, daemon=True).start()
        time.sleep(0.5)  # Give window time to appear
    except Exception as e:
        logging.error(f"Could not create streaming window: {e}")
        # Fallback to just notifications
        send_notification("Dictation", "🎤 Listening...")


def hide_streaming_window():
    """Hide and drop the streaming window, if one exists."""
    global streaming_window
    if streaming_window:
        streaming_window.close_window()
        streaming_window = None


def main():
    """Service entry point: load the model, open the microphone stream,
    and poll LOCK_FILE to toggle listening. While listening, feed audio
    to the recognizer and dispatch partial/final results."""
    global is_listening
    try:
        logging.info("Starting enhanced streaming dictation")

        # Model Setup
        download_model_if_needed()
        logging.info("Loading model...")
        model = Model(MODEL_NAME)
        recognizer = KaldiRecognizer(model, SAMPLE_RATE)
        logging.info("Model loaded successfully")
        logging.info("=== Enhanced Dictation Ready ===")
        logging.info("Features: Real-time streaming + visual feedback")

        with sd.RawInputStream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE,
                               dtype='int16', channels=1,
                               callback=audio_callback):
            logging.info("Audio stream opened")
            while True:
                # Check lock file for state changes
                lock_exists = os.path.exists(LOCK_FILE)
                if lock_exists and not is_listening:
                    is_listening = True
                    logging.info("\n[Dictation] STARTED listening...")
                    send_notification("Dictation", "🎤 Streaming enabled")
                    show_streaming_window()
                elif not lock_exists and is_listening:
                    is_listening = False
                    logging.info("\n[Dictation] STOPPED listening.")
                    send_notification("Dictation", "🛑 Stopped")
                    hide_streaming_window()

                # If not listening, save CPU
                if not is_listening:
                    time.sleep(0.1)
                    continue

                # Process audio when listening
                try:
                    data = q.get(timeout=0.1)
                    # BUG FIX: feed audio to the recognizer FIRST. The
                    # original read PartialResult() before calling
                    # AcceptWaveform(data), so partials lagged one block
                    # behind, and its truthiness check
                    # `if recognizer.PartialResult():` was always true
                    # (PartialResult returns a non-empty JSON string).
                    # Correct Vosk protocol: AcceptWaveform(data) -> if
                    # True read Result(), else read PartialResult().
                    if recognizer.AcceptWaveform(data):
                        result = json.loads(recognizer.Result())
                        final_text = result.get("text", "")
                        if final_text:
                            process_final_text(final_text)
                    else:
                        partial = json.loads(recognizer.PartialResult())
                        partial_text = partial.get("partial", "")
                        if partial_text:
                            process_partial_text(partial_text)
                except queue.Empty:
                    # No audio this tick; loop back to re-check the lock file
                    pass
                except Exception as e:
                    logging.error(f"Audio processing error: {e}")
    except KeyboardInterrupt:
        logging.info("\nExiting...")
        hide_streaming_window()
    except Exception as e:
        logging.error(f"Fatal error: {e}")


if __name__ == "__main__":
    main()