#!/mnt/storage/Development/dictation-service/.venv/bin/python
"""
Dictation Service with System Tray Icon
Provides voice-to-text transcription with visual tray icon feedback
"""
import os
import sys
import queue
import json
import time
import subprocess
import threading
import sounddevice as sd
from vosk import Model, KaldiRecognizer
import logging
import numpy as np
import gi

gi.require_version('Gtk', '3.0')
gi.require_version('AyatanaAppIndicator3', '0.1')
from gi.repository import Gtk, GLib
from gi.repository import AyatanaAppIndicator3 as AppIndicator3

# Setup logging.  Create ~/.cache first: logging.basicConfig raises
# FileNotFoundError if the directory does not exist yet (fresh systems).
os.makedirs(os.path.expanduser("~/.cache"), exist_ok=True)
logging.basicConfig(
    filename=os.path.expanduser("~/.cache/dictation_service.log"),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Configuration
SHARED_MODELS_DIR = os.path.expanduser("~/.shared/models/vosk-models")
MODEL_NAME = "vosk-model-en-us-0.22-lgraph"  # Faster model with good accuracy
MODEL_PATH = os.path.join(SHARED_MODELS_DIR, MODEL_NAME)
SAMPLE_RATE = 16000
BLOCK_SIZE = 4000  # Smaller blocks for lower latency
# NOTE(review): relative path, resolved against the service's CWD.  External
# hotkey scripts must create/remove the same file — confirm they agree on CWD.
DICTATION_LOCK_FILE = "listening.lock"

# Global state shared between the GTK main loop, the lock-file watcher
# thread, and the audio processor thread.
is_dictating = False      # True while the lock file exists
q = queue.Queue()         # raw int16 audio chunks from the mic callback
last_partial_text = ""    # last partial hypothesis, used for de-duplication


def download_model_if_needed():
    """Ensure the Vosk model exists at MODEL_PATH, downloading it if missing.

    Exits the process with status 1 if the download or unzip fails, since
    the service cannot run without a model.
    """
    if os.path.exists(MODEL_PATH):
        logging.info(f"Using model at: {MODEL_PATH}")
        return

    # MODEL_PATH already points inside SHARED_MODELS_DIR, so if it is
    # missing there is nowhere else to look — fetch it.  (The original code
    # re-checked os.path.join(SHARED_MODELS_DIR, MODEL_NAME), which is the
    # same path as MODEL_PATH and therefore could never succeed here.)
    logging.info(f"Model '{MODEL_NAME}' not found. Downloading...")
    try:
        os.makedirs(SHARED_MODELS_DIR, exist_ok=True)
        subprocess.check_call(
            ["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"],
            cwd=SHARED_MODELS_DIR,
        )
        subprocess.check_call(["unzip", f"{MODEL_NAME}.zip"], cwd=SHARED_MODELS_DIR)
        logging.info(f"Download complete. Model installed at: {MODEL_PATH}")
    except Exception as e:
        logging.error(f"Error downloading model: {e}")
        sys.exit(1)


def audio_callback(indata, frames, time_info, status):
    """sounddevice callback: enqueue raw mic audio while dictation is active.

    Runs on the PortAudio thread, so it only copies bytes into the queue;
    all recognition happens in continuous_audio_processor().
    """
    if status:
        logging.warning(status)
    # If the read-aloud (TTS) service is speaking, drop the audio so we do
    # not transcribe our own synthesized speech.
    if os.path.exists("/tmp/dictation_speaking.lock"):
        return
    if is_dictating:
        q.put(bytes(indata))


def process_partial_text(text):
    """Log an in-progress (partial) hypothesis, skipping repeats."""
    global last_partial_text
    if text and text != last_partial_text:
        last_partial_text = text
        logging.info(f"💭 {text}")


def process_final_text(text):
    """Clean up a final transcription and type it via ydotool.

    Filters obvious false positives (lone filler words, sub-2-character
    results, leading/trailing articles), capitalizes the first letter, then
    types the text followed by a space to separate utterances.
    """
    global last_partial_text

    if not text.strip():
        return

    formatted = text.strip()

    # A lone filler/article is almost always a recognition false positive.
    if len(formatted.split()) == 1 and formatted.lower() in {
        "the", "a", "an", "uh", "huh", "um", "hmm",
    }:
        logging.info(f"⏭️ Filtered out spurious word: {formatted}")
        return

    # Single characters are likely noise as well.
    if len(formatted) < 2:
        logging.info(f"⏭️ Filtered out too short: {formatted}")
        return

    # Vosk tends to hallucinate articles at utterance boundaries — strip
    # "the"/"a"/"an" from both ends.
    words = formatted.split()
    spurious_words = {"the", "a", "an"}
    while words and words[0].lower() in spurious_words:
        removed = words.pop(0)
        logging.info(f"⏭️ Removed spurious word from start: {removed}")
    while words and words[-1].lower() in spurious_words:
        removed = words.pop()
        logging.info(f"⏭️ Removed spurious word from end: {removed}")

    if not words:
        logging.info(f"⏭️ Filtered out - only spurious words: {formatted}")
        return

    formatted = " ".join(words)
    # words is non-empty here, so formatted is non-empty too.
    formatted = formatted[0].upper() + formatted[1:]
    logging.info(f"✅ {formatted}")

    # Type the text immediately; check=False because a failed keystroke
    # should not kill the service.
    try:
        subprocess.run(["ydotool", "type", formatted + " "], check=False)
        logging.info(f"📝 Typed: {formatted}")
    except Exception as e:
        logging.error(f"Error typing: {e}")

    # Reset partial-dedup state for the next utterance.
    last_partial_text = ""


def continuous_audio_processor():
    """Background thread: feed queued audio to Vosk and emit transcriptions.

    The Model (expensive to load) is cached across dictation sessions; only
    the KaldiRecognizer is recreated per session so recognition state does
    not leak from one session into the next.
    """
    model = None
    recognizer = None
    while True:
        if is_dictating and recognizer is None:
            # Dictation just turned on: (lazily) load model, fresh recognizer.
            try:
                if model is None:
                    model = Model(MODEL_PATH)  # load once, reuse afterwards
                recognizer = KaldiRecognizer(model, SAMPLE_RATE)
                logging.info("Audio processor initialized")
            except Exception as e:
                logging.error(f"Failed to initialize recognizer: {e}")
                time.sleep(1)
                continue
        elif not is_dictating and recognizer is not None:
            # Dictation just turned off: discard session state.
            recognizer = None
            logging.info("Audio processor cleaned up")
            time.sleep(0.1)
            continue

        if not is_dictating:
            time.sleep(0.1)
            continue

        # Process audio while active.
        try:
            data = q.get(timeout=0.05)
            if recognizer:
                if recognizer.AcceptWaveform(data):
                    # Final result available for a complete utterance.
                    result = json.loads(recognizer.Result())
                    final_text = result.get("text", "")
                    if final_text:
                        logging.info(f"🎯 Final result received: {final_text}")
                        process_final_text(final_text)
                else:
                    # Only a partial hypothesis so far.
                    partial_result = recognizer.PartialResult()
                    if partial_result:
                        partial_text = json.loads(partial_result).get("partial", "")
                        if partial_text:
                            process_partial_text(partial_text)

                # Drain any backlog in one go so latency does not build up.
                try:
                    while True:
                        additional_data = q.get_nowait()
                        if recognizer.AcceptWaveform(additional_data):
                            result = json.loads(recognizer.Result())
                            final_text = result.get("text", "")
                            if final_text:
                                logging.info(f"🎯 Final result received (batch): {final_text}")
                                process_final_text(final_text)
                except queue.Empty:
                    pass  # No more data available
        except queue.Empty:
            continue
        except Exception as e:
            logging.error(f"Audio processing error: {e}")
            time.sleep(0.1)


class DictationTrayIcon:
    """System tray icon for dictation control."""

    def __init__(self):
        self.indicator = AppIndicator3.Indicator.new(
            "dictation-service",
            "microphone-sensitivity-muted",  # default icon (OFF state)
            AppIndicator3.IndicatorCategory.APPLICATION_STATUS,
        )
        self.indicator.set_status(AppIndicator3.IndicatorStatus.ACTIVE)
        # Last state rendered to the tray; lets update_status() skip
        # redundant GTK calls on its 100 ms timer.
        self._shown_state = None

        # Build the tray menu.
        self.menu = Gtk.Menu()

        # Status item (non-clickable).
        self.status_item = Gtk.MenuItem(label="Dictation: OFF")
        self.status_item.set_sensitive(False)
        self.menu.append(self.status_item)

        self.menu.append(Gtk.SeparatorMenuItem())

        # Toggle dictation item.
        self.toggle_item = Gtk.MenuItem(label="Toggle Dictation (Alt+D)")
        self.toggle_item.connect("activate", self.toggle_dictation)
        self.menu.append(self.toggle_item)

        self.menu.append(Gtk.SeparatorMenuItem())

        # Quit item.
        quit_item = Gtk.MenuItem(label="Quit Service")
        quit_item.connect("activate", self.quit)
        self.menu.append(quit_item)

        self.menu.show_all()
        self.indicator.set_menu(self.menu)

        # Poll the global state on the GTK main loop.
        GLib.timeout_add(100, self.update_status)

    def update_status(self):
        """Refresh icon/label from is_dictating (GLib 100 ms timer callback).

        Only touches the indicator when the state actually changed.
        """
        if is_dictating != self._shown_state:
            self._shown_state = is_dictating
            if is_dictating:
                self.indicator.set_icon("microphone-sensitivity-high")  # ON
                self.status_item.set_label("Dictation: ON")
            else:
                self.indicator.set_icon("microphone-sensitivity-muted")  # OFF
                self.status_item.set_label("Dictation: OFF")
        return True  # keep the periodic timeout alive

    def toggle_dictation(self, widget):
        """Toggle dictation mode by creating/removing the lock file.

        The lock file is the single source of truth; audio_and_state_loop()
        observes it and flips is_dictating accordingly.
        """
        if os.path.exists(DICTATION_LOCK_FILE):
            try:
                os.remove(DICTATION_LOCK_FILE)
                logging.info("Tray: Dictation toggled OFF")
            except Exception as e:
                logging.error(f"Error removing lock file: {e}")
        else:
            try:
                with open(DICTATION_LOCK_FILE, 'w') as f:
                    pass  # empty file; existence alone carries the state
                logging.info("Tray: Dictation toggled ON")
            except Exception as e:
                logging.error(f"Error creating lock file: {e}")

    def quit(self, widget):
        """Quit the application from the tray menu."""
        logging.info("Quitting from tray icon")
        Gtk.main_quit()
        sys.exit(0)


def audio_and_state_loop():
    """Main audio and state management loop (runs in a separate thread).

    Downloads/verifies the model, starts the recognizer thread, opens the
    microphone stream, then watches DICTATION_LOCK_FILE and mirrors its
    existence into the global is_dictating flag.
    """
    global is_dictating

    # Model setup.
    download_model_if_needed()
    logging.info("Model ready")

    # Start audio processing thread.
    audio_thread = threading.Thread(target=continuous_audio_processor, daemon=True)
    audio_thread.start()
    logging.info("Audio processor thread started")

    logging.info("=== Dictation Service Ready ===")

    try:
        # Open audio stream; the context manager closes it on exit.
        with sd.RawInputStream(
            samplerate=SAMPLE_RATE,
            blocksize=BLOCK_SIZE,
            dtype="int16",
            channels=1,
            callback=audio_callback,
        ):
            logging.info("Audio stream opened")

            while True:
                # Check lock file for state changes.
                dictation_lock_exists = os.path.exists(DICTATION_LOCK_FILE)

                # Handle state transitions (log only on edges).
                if dictation_lock_exists and not is_dictating:
                    is_dictating = True
                    logging.info("[Dictation] STARTED")
                elif not dictation_lock_exists and is_dictating:
                    is_dictating = False
                    logging.info("[Dictation] STOPPED")

                # Sleep to prevent busy waiting.
                time.sleep(0.05)
    except Exception as e:
        logging.error(f"Fatal error in audio loop: {e}")


def main():
    """Entry point: start the tray icon, the worker thread, and the GTK loop."""
    try:
        logging.info("Starting dictation service with system tray")

        # Initialize system tray icon.
        tray_icon = DictationTrayIcon()

        # Start audio and state management in a separate thread so the GTK
        # main loop stays responsive.
        audio_state_thread = threading.Thread(target=audio_and_state_loop, daemon=True)
        audio_state_thread.start()

        # Run GTK main loop (this blocks until quit).
        logging.info("Starting GTK main loop")
        Gtk.main()
    except KeyboardInterrupt:
        logging.info("\nExiting...")
        Gtk.main_quit()
    except Exception as e:
        logging.error(f"Fatal error: {e}")
        Gtk.main_quit()


if __name__ == "__main__":
    main()