This is a comprehensive refactoring that transforms the dictation service from a complex multi-mode application into two clean, focused features: 1. Voice dictation with system tray icon 2. On-demand read-aloud via Ctrl+middle-click ## Key Changes ### Dictation Service Enhancements - Add GTK/AppIndicator3 system tray icon for visual status - Remove all notification spam (dictation start/stop/status) - Icon states: microphone-muted (OFF) → microphone-high (ON) - Click tray icon to toggle dictation (same as Alt+D) - Simplify ai_dictation_simple.py by removing conversation mode ### Read-Aloud Service Redesign - Replace automatic clipboard reader with on-demand Ctrl+middle-click - New middle_click_reader.py service - Works anywhere: highlight text, Ctrl+middle-click to read - Uses Edge-TTS (Christopher voice) with mpv playback - Lock file prevents feedback with dictation service ### Conversation Mode Removed - Delete all VLLM/conversation code (VLLMClient, ConversationManager, TTS) - Archive 5 old implementations to archive/old_implementations/ - Remove conversation-related scripts and services - Clean separation of concerns for future reintegration if needed ### Dependencies Cleanup - Remove: openai, aiohttp, pyttsx3, requests (conversation deps) - Keep: PyGObject, pynput, sounddevice, vosk, numpy, edge-tts - Net reduction: 4 packages removed, 6 core packages retained ### Testing Improvements - Add test_dictation_service.py (8 tests) ✅ - Add test_middle_click.py (11 tests) ✅ - Fix test_run.py to use correct model path - Total: 19 unit tests passing - Delete obsolete test files (test_suite, test_vllm_integration, etc.) 
### Documentation - Add CHANGES.md with complete changelog - Add docs/MIGRATION_GUIDE.md for upgrading - Add README.md with quick start guide - Update docs/README.md with current features only - Add justfile for common tasks ### New Services & Scripts - Add middle-click-reader.service (systemd) - Add scripts/setup-middle-click-reader.sh - Add desktop files for autostart - Remove toggle-conversation.sh (obsolete) ## Impact **Code Quality** - Net change: -6,007 lines (596 added, 6,603 deleted) - Simpler architecture, easier maintenance - Better test coverage (19 tests vs mixed before) - Cleaner separation of concerns **User Experience** - No notification spam during dictation - Clean visual status via tray icon - Full control over read-aloud (no unwanted readings) - Better performance (fewer background processes) **Privacy** - No conversation data stored - No VLLM connection needed - All processing local except Edge-TTS text ## Migration Notes Users upgrading should: 1. Run `uv sync` to update dependencies 2. Restart dictation.service to get tray icon 3. Run scripts/setup-middle-click-reader.sh for new read-aloud 4. Remove old read-aloud.service if present See docs/MIGRATION_GUIDE.md for details. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
369 lines · 12 KiB · Python
#!/mnt/storage/Development/dictation-service/.venv/bin/python
|
|
"""
|
|
Dictation Service with System Tray Icon
|
|
Provides voice-to-text transcription with visual tray icon feedback
|
|
"""
|
|
import os
|
|
import sys
|
|
import queue
|
|
import json
|
|
import time
|
|
import subprocess
|
|
import threading
|
|
import sounddevice as sd
|
|
from vosk import Model, KaldiRecognizer
|
|
import logging
|
|
import numpy as np
|
|
import gi
|
|
gi.require_version('Gtk', '3.0')
|
|
gi.require_version('AyatanaAppIndicator3', '0.1')
|
|
from gi.repository import Gtk, GLib
|
|
from gi.repository import AyatanaAppIndicator3 as AppIndicator3
|
|
|
|
# Setup logging
# The service runs headless (systemd / autostart), so all diagnostics go to a
# log file in the user's cache directory rather than stdout.
logging.basicConfig(
    filename=os.path.expanduser("~/.cache/dictation_service.log"),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Configuration
SHARED_MODELS_DIR = os.path.expanduser("~/.shared/models/vosk-models")  # where Vosk models are installed
MODEL_NAME = "vosk-model-en-us-0.22-lgraph"  # Faster model with good accuracy
MODEL_PATH = os.path.join(SHARED_MODELS_DIR, MODEL_NAME)  # full path handed to vosk.Model()
SAMPLE_RATE = 16000  # Hz; same rate is passed to KaldiRecognizer below
BLOCK_SIZE = 4000  # Smaller blocks for lower latency
# NOTE(review): this is a *relative* path, resolved against the process's
# current working directory — the Alt+D toggle script must run with the same
# CWD for the tray and hotkey to agree. TODO confirm against the service unit.
DICTATION_LOCK_FILE = "listening.lock"

# Global State
is_dictating = False  # mirrors lock-file presence; read by the audio callback and threads
q = queue.Queue()  # raw int16 audio chunks from the sounddevice callback
last_partial_text = ""  # last partial result logged, used to suppress duplicates
|
|
|
|
|
|
def download_model_if_needed():
    """Ensure the Vosk model exists at MODEL_PATH, downloading it if missing.

    When the model directory is absent, downloads the zip from alphacephei.com
    into SHARED_MODELS_DIR, unzips it in place, and removes the archive.
    Exits the process on any download/extract failure, because the service
    cannot operate without a model.
    """
    if os.path.exists(MODEL_PATH):
        logging.info(f"Using model at: {MODEL_PATH}")
        return

    # Fixed: the original re-checked os.path.join(SHARED_MODELS_DIR, MODEL_NAME),
    # which is exactly MODEL_PATH — that branch could never succeed after the
    # check above failed, so it was dead code and has been removed.
    logging.info(f"Model '{MODEL_NAME}' not found anywhere. Downloading...")
    try:
        os.makedirs(SHARED_MODELS_DIR, exist_ok=True)
        subprocess.check_call(
            ["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"],
            cwd=SHARED_MODELS_DIR,
        )
        subprocess.check_call(["unzip", f"{MODEL_NAME}.zip"], cwd=SHARED_MODELS_DIR)
        # The archive is no longer needed once extracted; don't leave it behind.
        os.remove(os.path.join(SHARED_MODELS_DIR, f"{MODEL_NAME}.zip"))
        logging.info(f"Download complete. Model installed at: {MODEL_PATH}")
    except Exception as e:
        logging.error(f"Error downloading model: {e}")
        sys.exit(1)
|
|
|
|
|
|
def audio_callback(indata, frames, time_info, status):
    """sounddevice callback: enqueue raw microphone audio while dictating.

    Audio is dropped entirely while the read-aloud service holds its
    "speaking" lock, so the recognizer never transcribes our own TTS output.
    """
    if status:
        logging.warning(status)

    tts_speaking = os.path.exists("/tmp/dictation_speaking.lock")
    if tts_speaking or not is_dictating:
        return

    q.put(bytes(indata))
|
|
|
|
|
|
def process_partial_text(text):
    """Log an in-progress recognition result, skipping empty or repeated text."""
    global last_partial_text

    # Guard clause: nothing new to report.
    if not text or text == last_partial_text:
        return

    last_partial_text = text
    logging.info(f"💭 {text}")
|
|
|
|
|
|
def process_final_text(text):
    """Clean a final recognition result and type it via ydotool.

    Applies noise filters (lone filler words, sub-2-character results,
    leading/trailing articles that Vosk tends to hallucinate), capitalizes
    the first letter, then injects the text as keystrokes followed by a
    trailing space.  Resets the partial-text tracker afterwards.
    """
    global last_partial_text

    formatted = text.strip()
    if not formatted:
        return

    # A lone filler word is almost always a false positive.
    filler_words = {"the", "a", "an", "uh", "huh", "um", "hmm"}
    if len(formatted.split()) == 1 and formatted.lower() in filler_words:
        logging.info(f"⏭️ Filtered out spurious word: {formatted}")
        return

    # Results shorter than two characters are treated as noise.
    if len(formatted) < 2:
        logging.info(f"⏭️ Filtered out too short: {formatted}")
        return

    # Vosk often tacks articles onto the start/end; strip them off.
    words = formatted.split()
    spurious_words = {"the", "a", "an"}

    while words and words[0].lower() in spurious_words:
        dropped = words.pop(0)
        logging.info(f"⏭️ Removed spurious word from start: {dropped}")

    while words and words[-1].lower() in spurious_words:
        dropped = words.pop()
        logging.info(f"⏭️ Removed spurious word from end: {dropped}")

    if not words:
        logging.info(f"⏭️ Filtered out - only spurious words: {formatted}")
        return

    formatted = " ".join(words)
    formatted = formatted[0].upper() + formatted[1:] if formatted else formatted

    logging.info(f"✅ {formatted}")

    # Inject the text as keystrokes.  check=False: a typing failure must not
    # propagate into the recognizer loop.
    try:
        subprocess.run(["ydotool", "type", formatted + " "], check=False)
        logging.info(f"📝 Typed: {formatted}")
    except Exception as e:
        logging.error(f"Error typing: {e}")

    # Start fresh for the next utterance.
    last_partial_text = ""
|
|
|
|
|
|
def continuous_audio_processor() -> None:
    """Background thread: pull audio chunks off the queue and run recognition.

    Lazily builds a Vosk recognizer when dictation turns on and discards it
    when dictation turns off.  Final results are typed via process_final_text;
    partial results are logged via process_partial_text.  Runs forever.
    """
    recognizer = None

    while True:
        if is_dictating and recognizer is None:
            # Initialize recognizer when we start listening.
            # NOTE(review): Model(MODEL_PATH) is reloaded from disk on every
            # toggle, which can take seconds for large models — caching the
            # Model across toggles may be worthwhile; confirm memory budget.
            try:
                model = Model(MODEL_PATH)
                recognizer = KaldiRecognizer(model, SAMPLE_RATE)
                logging.info("Audio processor initialized")
            except Exception as e:
                logging.error(f"Failed to initialize recognizer: {e}")
                time.sleep(1)
                continue

        elif not is_dictating and recognizer is not None:
            # Clean up when we stop (recognizer + model become garbage).
            recognizer = None
            logging.info("Audio processor cleaned up")
            time.sleep(0.1)
            continue

        if not is_dictating:
            # Idle: avoid busy-waiting while dictation is off.
            time.sleep(0.1)
            continue

        # Process audio when active.
        try:
            # Short timeout so state changes are noticed promptly.
            data = q.get(timeout=0.05)

            if recognizer:
                # Feed audio data to recognizer.
                if recognizer.AcceptWaveform(data):
                    # Final result available (utterance boundary detected).
                    result = json.loads(recognizer.Result())
                    final_text = result.get("text", "")
                    if final_text:
                        logging.info(f"🎯 Final result received: {final_text}")
                        process_final_text(final_text)
                else:
                    # No final yet — check for partial (in-progress) results.
                    partial_result = recognizer.PartialResult()
                    if partial_result:
                        partial = json.loads(partial_result)
                        partial_text = partial.get("partial", "")
                        if partial_text:
                            process_partial_text(partial_text)

                # Drain any additional queued audio chunks without blocking,
                # so the recognizer keeps up when chunks arrive in bursts.
                try:
                    while True:
                        additional_data = q.get_nowait()
                        if recognizer.AcceptWaveform(additional_data):
                            result = json.loads(recognizer.Result())
                            final_text = result.get("text", "")
                            if final_text:
                                logging.info(f"🎯 Final result received (batch): {final_text}")
                                process_final_text(final_text)
                except queue.Empty:
                    pass  # No more data available

        except queue.Empty:
            # Queue was empty for the whole timeout — just loop again.
            continue
        except Exception as e:
            logging.error(f"Audio processing error: {e}")
            time.sleep(0.1)
|
|
|
|
|
|
class DictationTrayIcon:
    """System tray icon for dictation control.

    Shows a microphone icon reflecting the module-level ``is_dictating``
    flag (polled every 100 ms) and offers a menu to toggle dictation or
    quit the service.  Toggling works by creating/removing the same lock
    file the Alt+D hotkey uses.
    """

    # Icon names for the two states (stock freedesktop microphone icons).
    ICON_ON = "microphone-sensitivity-high"
    ICON_OFF = "microphone-sensitivity-muted"

    def __init__(self):
        self.indicator = AppIndicator3.Indicator.new(
            "dictation-service",
            self.ICON_OFF,  # Default icon (OFF state)
            AppIndicator3.IndicatorCategory.APPLICATION_STATUS
        )
        self.indicator.set_status(AppIndicator3.IndicatorStatus.ACTIVE)

        # Last state actually rendered.  Fixed: update_status previously
        # rewrote the icon and label on every 100 ms tick even when nothing
        # changed, causing needless D-Bus traffic/redraws; now it only
        # touches the indicator on a real transition.
        self._shown_state = False

        # Create menu
        self.menu = Gtk.Menu()

        # Status item (non-clickable, purely informational)
        self.status_item = Gtk.MenuItem(label="Dictation: OFF")
        self.status_item.set_sensitive(False)
        self.menu.append(self.status_item)

        self.menu.append(Gtk.SeparatorMenuItem())

        # Toggle dictation item
        self.toggle_item = Gtk.MenuItem(label="Toggle Dictation (Alt+D)")
        self.toggle_item.connect("activate", self.toggle_dictation)
        self.menu.append(self.toggle_item)

        self.menu.append(Gtk.SeparatorMenuItem())

        # Quit item
        quit_item = Gtk.MenuItem(label="Quit Service")
        quit_item.connect("activate", self.quit)
        self.menu.append(quit_item)

        self.menu.show_all()
        self.indicator.set_menu(self.menu)

        # Poll the dictation state; GLib invokes update_status every 100 ms
        # for as long as it returns True.
        GLib.timeout_add(100, self.update_status)

    def update_status(self):
        """Refresh icon and label, but only when the state actually changed."""
        if is_dictating == self._shown_state:
            return True  # No transition; keep the timer alive.
        self._shown_state = is_dictating

        if is_dictating:
            self.indicator.set_icon(self.ICON_ON)  # ON state
            self.status_item.set_label("Dictation: ON")
        else:
            self.indicator.set_icon(self.ICON_OFF)  # OFF state
            self.status_item.set_label("Dictation: OFF")
        return True  # Continue periodic updates

    def toggle_dictation(self, widget):
        """Toggle dictation mode by creating/removing the lock file."""
        if os.path.exists(DICTATION_LOCK_FILE):
            try:
                os.remove(DICTATION_LOCK_FILE)
                logging.info("Tray: Dictation toggled OFF")
            except Exception as e:
                logging.error(f"Error removing lock file: {e}")
        else:
            try:
                # An empty file; its mere existence signals "dictating".
                with open(DICTATION_LOCK_FILE, 'w'):
                    pass
                logging.info("Tray: Dictation toggled ON")
            except Exception as e:
                logging.error(f"Error creating lock file: {e}")

    def quit(self, widget):
        """Quit the application from the tray menu."""
        logging.info("Quitting from tray icon")
        Gtk.main_quit()
        sys.exit(0)
|
|
|
|
|
|
def audio_and_state_loop():
    """Own the microphone stream and poll the lock file for state changes.

    Runs on a worker thread so the GTK main loop can keep the main thread.
    Flips the module-level ``is_dictating`` flag whenever the dictation lock
    file appears or disappears, and logs each transition.
    """
    global is_dictating

    # Make sure the speech model is on disk before we touch audio.
    download_model_if_needed()
    logging.info("Model ready")

    # Recognition runs on its own daemon thread, fed through the queue.
    threading.Thread(target=continuous_audio_processor, daemon=True).start()
    logging.info("Audio processor thread started")

    logging.info("=== Dictation Service Ready ===")

    try:
        # The callback pushes raw int16 mono chunks into the queue.
        with sd.RawInputStream(
            samplerate=SAMPLE_RATE,
            blocksize=BLOCK_SIZE,
            dtype="int16",
            channels=1,
            callback=audio_callback,
        ):
            logging.info("Audio stream opened")

            while True:
                locked = os.path.exists(DICTATION_LOCK_FILE)
                # Only act on an actual transition of the lock file.
                if locked != is_dictating:
                    is_dictating = locked
                    if locked:
                        logging.info("[Dictation] STARTED")
                    else:
                        logging.info("[Dictation] STOPPED")
                # Sleep to prevent busy waiting.
                time.sleep(0.05)
    except Exception as e:
        logging.error(f"Fatal error in audio loop: {e}")
|
|
|
|
|
|
def main():
    """Entry point: start the tray icon, the audio thread, and the GTK loop."""
    try:
        logging.info("Starting dictation service with system tray")

        # Initialize the system tray icon on the main (GTK) thread.  Keep a
        # reference for the lifetime of main() so it is not garbage-collected.
        tray_icon = DictationTrayIcon()  # noqa: F841 — held alive on purpose

        # Audio capture and lock-file polling run on a daemon thread so
        # Gtk.main() below can own the main thread.
        audio_state_thread = threading.Thread(target=audio_and_state_loop, daemon=True)
        audio_state_thread.start()

        # Run GTK main loop (this blocks until quit).
        logging.info("Starting GTK main loop")
        Gtk.main()

    except KeyboardInterrupt:
        # Fixed: dropped the stray leading "\n" (leftover from a print());
        # embedded newlines do not belong inside a single log record.
        logging.info("Exiting...")
        Gtk.main_quit()
    except Exception as e:
        logging.error(f"Fatal error: {e}")
        Gtk.main_quit()
|
|
|
|
|
|
# Run the service only when executed directly (not when imported).
if __name__ == "__main__":
    main()
|