dictation-service/src/dictation_service/ai_dictation_simple.py
Kade Heyborne 71c305a201
Major refactoring: v0.2.0 - Simplify to core dictation & read-aloud features
This is a comprehensive refactoring that transforms the dictation service from a
complex multi-mode application into two clean, focused features:
1. Voice dictation with system tray icon
2. On-demand read-aloud via Ctrl+middle-click

## Key Changes

### Dictation Service Enhancements
- Add GTK/AppIndicator3 system tray icon for visual status
- Remove all notification spam (dictation start/stop/status)
- Icon states: microphone-sensitivity-muted (OFF) → microphone-sensitivity-high (ON)
- Click tray icon to toggle dictation (same as Alt+D)
- Simplify ai_dictation_simple.py by removing conversation mode

### Read-Aloud Service Redesign
- Replace automatic clipboard reader with on-demand Ctrl+middle-click
- New middle_click_reader.py service
- Works anywhere: highlight text, Ctrl+middle-click to read
- Uses Edge-TTS (Christopher voice) with mpv playback
- Lock file prevents feedback with dictation service
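
As a hedged sketch of how these pieces can fit together (the `build_commands`/`read_aloud` helpers and the exact voice ID `en-US-ChristopherNeural` are illustrative assumptions, not the shipped `middle_click_reader.py`):

```python
import os
import subprocess
import tempfile

SPEAKING_LOCK = "/tmp/dictation_speaking.lock"  # same path the dictation service checks
VOICE = "en-US-ChristopherNeural"  # assumed ID for the "Christopher" Edge-TTS voice

def build_commands(text, audio_path, voice=VOICE):
    """Build the edge-tts synthesis and mpv playback command lines."""
    tts_cmd = ["edge-tts", "--voice", voice, "--text", text, "--write-media", audio_path]
    play_cmd = ["mpv", "--no-video", audio_path]
    return tts_cmd, play_cmd

def read_aloud(text):
    """Synthesize and play `text`, holding a lock file so the dictation
    service drops microphone input while audio plays (prevents feedback)."""
    if not text.strip():
        return
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        audio_path = f.name
    tts_cmd, play_cmd = build_commands(text, audio_path)
    try:
        open(SPEAKING_LOCK, "w").close()  # signal: TTS is speaking
        subprocess.run(tts_cmd, check=True)
        subprocess.run(play_cmd, check=True)
    finally:
        if os.path.exists(SPEAKING_LOCK):
            os.remove(SPEAKING_LOCK)
        os.remove(audio_path)
```

The lock file mirrors the check in `ai_dictation_simple.py`'s audio callback, which ignores microphone input while the file exists.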

### Conversation Mode Removed
- Delete all VLLM/conversation code (VLLMClient, ConversationManager, TTS)
- Archive 5 old implementations to archive/old_implementations/
- Remove conversation-related scripts and services
- Clean separation of concerns for future reintegration if needed

### Dependencies Cleanup
- Remove: openai, aiohttp, pyttsx3, requests (conversation deps)
- Keep: PyGObject, pynput, sounddevice, vosk, numpy, edge-tts
- Net reduction: 4 packages removed, 6 core packages retained

### Testing Improvements
- Add test_dictation_service.py (8 tests) 
- Add test_middle_click.py (11 tests) 
- Fix test_run.py to use correct model path
- Total: 19 unit tests passing
- Delete obsolete test files (test_suite, test_vllm_integration, etc.)

### Documentation
- Add CHANGES.md with complete changelog
- Add docs/MIGRATION_GUIDE.md for upgrading
- Add README.md with quick start guide
- Update docs/README.md with current features only
- Add justfile for common tasks

### New Services & Scripts
- Add middle-click-reader.service (systemd)
- Add scripts/setup-middle-click-reader.sh
- Add desktop files for autostart
- Remove toggle-conversation.sh (obsolete)
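
A minimal sketch of what `middle-click-reader.service` might look like as a user unit (the `ExecStart` path, module name, and install target are assumptions based on the repo layout, not the shipped file):

```ini
[Unit]
Description=On-demand read-aloud (Ctrl+middle-click)
After=graphical-session.target

[Service]
ExecStart=/mnt/storage/Development/dictation-service/.venv/bin/python -m dictation_service.middle_click_reader
Restart=on-failure

[Install]
WantedBy=default.target
```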

## Impact

**Code Quality**
- Net change: -6,007 lines (596 added, 6,603 deleted)
- Simpler architecture, easier maintenance
- Better test coverage (19 focused unit tests, replacing a mixed legacy suite)
- Cleaner separation of concerns

**User Experience**
- No notification spam during dictation
- Clean visual status via tray icon
- Full control over read-aloud (no unwanted readings)
- Better performance (fewer background processes)

**Privacy**
- No conversation data stored
- No VLLM connection needed
- All processing is local, except text sent to Edge-TTS for synthesis

## Migration Notes

Users upgrading should:
1. Run `uv sync` to update dependencies
2. Restart dictation.service to get tray icon
3. Run scripts/setup-middle-click-reader.sh for new read-aloud
4. Remove old read-aloud.service if present
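
The steps above can be sketched as a shell runbook (user-level systemd units and the `read-aloud.service` unit name are assumptions):

```shell
#!/bin/sh
set -eu

uv sync                                      # 1. update dependencies
systemctl --user restart dictation.service   # 2. restart to get the tray icon
sh scripts/setup-middle-click-reader.sh      # 3. install the new read-aloud service
# 4. retire the old read-aloud unit if it is still installed
systemctl --user disable --now read-aloud.service 2>/dev/null || true
```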

See docs/MIGRATION_GUIDE.md for details.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-10 19:11:06 -07:00


#!/mnt/storage/Development/dictation-service/.venv/bin/python
"""
Dictation Service with System Tray Icon
Provides voice-to-text transcription with visual tray icon feedback
"""
import os
import sys
import queue
import json
import time
import subprocess
import threading
import sounddevice as sd
from vosk import Model, KaldiRecognizer
import logging
import numpy as np
import gi
gi.require_version('Gtk', '3.0')
gi.require_version('AyatanaAppIndicator3', '0.1')
from gi.repository import Gtk, GLib
from gi.repository import AyatanaAppIndicator3 as AppIndicator3
# Setup logging
logging.basicConfig(
    filename=os.path.expanduser("~/.cache/dictation_service.log"),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Configuration
SHARED_MODELS_DIR = os.path.expanduser("~/.shared/models/vosk-models")
MODEL_NAME = "vosk-model-en-us-0.22-lgraph" # Faster model with good accuracy
MODEL_PATH = os.path.join(SHARED_MODELS_DIR, MODEL_NAME)
SAMPLE_RATE = 16000
BLOCK_SIZE = 4000 # Smaller blocks for lower latency
DICTATION_LOCK_FILE = "listening.lock"
# Global State
is_dictating = False
q = queue.Queue()
last_partial_text = ""

def download_model_if_needed():
    """Download the Vosk model if it is not already installed"""
    if not os.path.exists(MODEL_PATH):
        logging.info(f"Model '{MODEL_PATH}' not found. Looking in shared directory...")
        # Check if model exists in shared models directory
        shared_model_path = os.path.join(SHARED_MODELS_DIR, MODEL_NAME)
        if os.path.exists(shared_model_path):
            logging.info(f"Found model in shared directory: {shared_model_path}")
            return
        logging.info(f"Model '{MODEL_NAME}' not found anywhere. Downloading...")
        try:
            # Download to shared models directory
            os.makedirs(SHARED_MODELS_DIR, exist_ok=True)
            subprocess.check_call(
                ["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"],
                cwd=SHARED_MODELS_DIR,
            )
            subprocess.check_call(["unzip", f"{MODEL_NAME}.zip"], cwd=SHARED_MODELS_DIR)
            logging.info(f"Download complete. Model installed at: {MODEL_PATH}")
        except Exception as e:
            logging.error(f"Error downloading model: {e}")
            sys.exit(1)
    else:
        logging.info(f"Using model at: {MODEL_PATH}")

def audio_callback(indata, frames, time_info, status):
    """Audio callback for capturing microphone input"""
    if status:
        logging.warning(status)
    # Check if TTS is speaking (read-aloud service).
    # If so, ignore audio to prevent self-transcription.
    if os.path.exists("/tmp/dictation_speaking.lock"):
        return
    if is_dictating:
        q.put(bytes(indata))

def process_partial_text(text):
    """Process partial text during dictation"""
    global last_partial_text
    if text and text != last_partial_text:
        last_partial_text = text
        logging.info(f"💭 {text}")

def process_final_text(text):
    """Process final transcribed text and type it"""
    global last_partial_text
    if not text.strip():
        return
    formatted = text.strip()
    # Filter out spurious single words that are likely false positives
    if len(formatted.split()) == 1 and formatted.lower() in [
        "the",
        "a",
        "an",
        "uh",
        "huh",
        "um",
        "hmm",
    ]:
        logging.info(f"⏭️ Filtered out spurious word: {formatted}")
        return
    # Filter out very short results that are likely noise
    if len(formatted) < 2:
        logging.info(f"⏭️ Filtered out too short: {formatted}")
        return
    # Remove spurious articles ("the", "a", "an") from the start and end
    # of transcriptions (common Vosk false positives)
    words = formatted.split()
    spurious_words = {"the", "a", "an"}
    # Remove from start
    while words and words[0].lower() in spurious_words:
        removed = words.pop(0)
        logging.info(f"⏭️ Removed spurious word from start: {removed}")
    # Remove from end
    while words and words[-1].lower() in spurious_words:
        removed = words.pop()
        logging.info(f"⏭️ Removed spurious word from end: {removed}")
    if not words:
        logging.info(f"⏭️ Filtered out - only spurious words: {formatted}")
        return
    formatted = " ".join(words)
    # Capitalize the first letter
    formatted = formatted[0].upper() + formatted[1:] if formatted else formatted
    logging.info(f"{formatted}")
    # Type the text immediately
    try:
        subprocess.run(["ydotool", "type", formatted + " "], check=False)
        logging.info(f"📝 Typed: {formatted}")
    except Exception as e:
        logging.error(f"Error typing: {e}")
    # Clear partial text
    last_partial_text = ""

def continuous_audio_processor():
    """Background thread for processing audio"""
    recognizer = None
    while True:
        if is_dictating and recognizer is None:
            # Initialize recognizer when we start listening
            try:
                model = Model(MODEL_PATH)
                recognizer = KaldiRecognizer(model, SAMPLE_RATE)
                logging.info("Audio processor initialized")
            except Exception as e:
                logging.error(f"Failed to initialize recognizer: {e}")
                time.sleep(1)
                continue
        elif not is_dictating and recognizer is not None:
            # Clean up when we stop
            recognizer = None
            logging.info("Audio processor cleaned up")
            time.sleep(0.1)
            continue
        if not is_dictating:
            time.sleep(0.1)
            continue
        # Process audio when active
        try:
            data = q.get(timeout=0.05)
            if recognizer:
                # Feed audio data to recognizer
                if recognizer.AcceptWaveform(data):
                    # Final result available
                    result = json.loads(recognizer.Result())
                    final_text = result.get("text", "")
                    if final_text:
                        logging.info(f"🎯 Final result received: {final_text}")
                        process_final_text(final_text)
                else:
                    # Check for partial results
                    partial_result = recognizer.PartialResult()
                    if partial_result:
                        partial = json.loads(partial_result)
                        partial_text = partial.get("partial", "")
                        if partial_text:
                            process_partial_text(partial_text)
                # Process additional queued audio chunks if available (batch processing)
                try:
                    while True:
                        additional_data = q.get_nowait()
                        if recognizer.AcceptWaveform(additional_data):
                            result = json.loads(recognizer.Result())
                            final_text = result.get("text", "")
                            if final_text:
                                logging.info(f"🎯 Final result received (batch): {final_text}")
                                process_final_text(final_text)
                except queue.Empty:
                    pass  # No more data available
        except queue.Empty:
            continue
        except Exception as e:
            logging.error(f"Audio processing error: {e}")
            time.sleep(0.1)

class DictationTrayIcon:
    """System tray icon for dictation control"""

    def __init__(self):
        self.indicator = AppIndicator3.Indicator.new(
            "dictation-service",
            "microphone-sensitivity-muted",  # Default icon (OFF state)
            AppIndicator3.IndicatorCategory.APPLICATION_STATUS
        )
        self.indicator.set_status(AppIndicator3.IndicatorStatus.ACTIVE)
        # Create menu
        self.menu = Gtk.Menu()
        # Status item (non-clickable)
        self.status_item = Gtk.MenuItem(label="Dictation: OFF")
        self.status_item.set_sensitive(False)
        self.menu.append(self.status_item)
        self.menu.append(Gtk.SeparatorMenuItem())
        # Toggle dictation item
        self.toggle_item = Gtk.MenuItem(label="Toggle Dictation (Alt+D)")
        self.toggle_item.connect("activate", self.toggle_dictation)
        self.menu.append(self.toggle_item)
        self.menu.append(Gtk.SeparatorMenuItem())
        # Quit item
        quit_item = Gtk.MenuItem(label="Quit Service")
        quit_item.connect("activate", self.quit)
        self.menu.append(quit_item)
        self.menu.show_all()
        self.indicator.set_menu(self.menu)
        # Start periodic status update
        GLib.timeout_add(100, self.update_status)

    def update_status(self):
        """Update tray icon based on current state"""
        if is_dictating:
            self.indicator.set_icon("microphone-sensitivity-high")  # ON state
            self.status_item.set_label("Dictation: ON")
        else:
            self.indicator.set_icon("microphone-sensitivity-muted")  # OFF state
            self.status_item.set_label("Dictation: OFF")
        return True  # Continue periodic updates

    def toggle_dictation(self, widget):
        """Toggle dictation mode by creating/removing the lock file"""
        if os.path.exists(DICTATION_LOCK_FILE):
            try:
                os.remove(DICTATION_LOCK_FILE)
                logging.info("Tray: Dictation toggled OFF")
            except Exception as e:
                logging.error(f"Error removing lock file: {e}")
        else:
            try:
                with open(DICTATION_LOCK_FILE, 'w'):
                    pass
                logging.info("Tray: Dictation toggled ON")
            except Exception as e:
                logging.error(f"Error creating lock file: {e}")

    def quit(self, widget):
        """Quit the application"""
        logging.info("Quitting from tray icon")
        Gtk.main_quit()
        sys.exit(0)

def audio_and_state_loop():
    """Main audio and state management loop (runs in a separate thread)"""
    global is_dictating
    # Model setup
    download_model_if_needed()
    logging.info("Model ready")
    # Start audio processing thread
    audio_thread = threading.Thread(target=continuous_audio_processor, daemon=True)
    audio_thread.start()
    logging.info("Audio processor thread started")
    logging.info("=== Dictation Service Ready ===")
    try:
        # Open audio stream
        with sd.RawInputStream(
            samplerate=SAMPLE_RATE,
            blocksize=BLOCK_SIZE,
            dtype="int16",
            channels=1,
            callback=audio_callback,
        ):
            logging.info("Audio stream opened")
            while True:
                # Check lock file for state changes
                dictation_lock_exists = os.path.exists(DICTATION_LOCK_FILE)
                # Handle state transitions
                if dictation_lock_exists and not is_dictating:
                    is_dictating = True
                    logging.info("[Dictation] STARTED")
                elif not dictation_lock_exists and is_dictating:
                    is_dictating = False
                    logging.info("[Dictation] STOPPED")
                # Sleep to prevent busy waiting
                time.sleep(0.05)
    except Exception as e:
        logging.error(f"Fatal error in audio loop: {e}")

def main():
    try:
        logging.info("Starting dictation service with system tray")
        # Initialize system tray icon (keep a reference so it is not collected)
        tray_icon = DictationTrayIcon()
        # Start audio and state management in a separate thread
        audio_state_thread = threading.Thread(target=audio_and_state_loop, daemon=True)
        audio_state_thread.start()
        # Run GTK main loop (this blocks)
        logging.info("Starting GTK main loop")
        Gtk.main()
    except KeyboardInterrupt:
        logging.info("Exiting...")
        Gtk.main_quit()
    except Exception as e:
        logging.error(f"Fatal error: {e}")
        Gtk.main_quit()


if __name__ == "__main__":
    main()