This is a comprehensive refactoring that transforms the dictation service from a complex multi-mode application into two clean, focused features:

1. Voice dictation with system tray icon
2. On-demand read-aloud via Ctrl+middle-click

## Key Changes

### Dictation Service Enhancements
- Add GTK/AppIndicator3 system tray icon for visual status
- Remove all notification spam (dictation start/stop/status)
- Icon states: microphone-muted (OFF) → microphone-high (ON)
- Click tray icon to toggle dictation (same as Alt+D)
- Simplify ai_dictation_simple.py by removing conversation mode

### Read-Aloud Service Redesign
- Replace automatic clipboard reader with on-demand Ctrl+middle-click
- New middle_click_reader.py service
- Works anywhere: highlight text, Ctrl+middle-click to read
- Uses Edge-TTS (Christopher voice) with mpv playback
- Lock file prevents feedback with dictation service

### Conversation Mode Removed
- Delete all VLLM/conversation code (VLLMClient, ConversationManager, TTS)
- Archive 5 old implementations to archive/old_implementations/
- Remove conversation-related scripts and services
- Clean separation of concerns for future reintegration if needed

### Dependencies Cleanup
- Remove: openai, aiohttp, pyttsx3, requests (conversation deps)
- Keep: PyGObject, pynput, sounddevice, vosk, numpy, edge-tts
- Net reduction: 4 packages removed, 6 core packages retained

### Testing Improvements
- Add test_dictation_service.py (8 tests) ✅
- Add test_middle_click.py (11 tests) ✅
- Fix test_run.py to use correct model path
- Total: 19 unit tests passing
- Delete obsolete test files (test_suite, test_vllm_integration, etc.)

### Documentation
- Add CHANGES.md with complete changelog
- Add docs/MIGRATION_GUIDE.md for upgrading
- Add README.md with quick start guide
- Update docs/README.md with current features only
- Add justfile for common tasks

### New Services & Scripts
- Add middle-click-reader.service (systemd)
- Add scripts/setup-middle-click-reader.sh
- Add desktop files for autostart
- Remove toggle-conversation.sh (obsolete)

## Impact

**Code Quality**
- Net change: -6,007 lines (596 added, 6,603 deleted)
- Simpler architecture, easier maintenance
- Better test coverage (19 tests vs mixed before)
- Cleaner separation of concerns

**User Experience**
- No notification spam during dictation
- Clean visual status via tray icon
- Full control over read-aloud (no unwanted readings)
- Better performance (fewer background processes)

**Privacy**
- No conversation data stored
- No VLLM connection needed
- All processing local except Edge-TTS text

## Migration Notes

Users upgrading should:

1. Run `uv sync` to update dependencies
2. Restart dictation.service to get tray icon
3. Run scripts/setup-middle-click-reader.sh for new read-aloud
4. Remove old read-aloud.service if present

See docs/MIGRATION_GUIDE.md for details.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
264 lines
8.2 KiB
Python
264 lines
8.2 KiB
Python
#!/mnt/storage/Development/dictation-service/.venv/bin/python
|
|
import os
|
|
import sys
|
|
import queue
|
|
import json
|
|
import time
|
|
import subprocess
|
|
import threading
|
|
import sounddevice as sd
|
|
from vosk import Model, KaldiRecognizer
|
|
from pynput.keyboard import Controller
|
|
import logging
|
|
import gi
|
|
gi.require_version('Gtk', '3.0')
|
|
from gi.repository import Gtk, GLib
|
|
|
|
# Setup logging
# The log file defaults to the original hard-coded location but can be
# redirected with the DICTATION_LOG_FILE environment variable.
LOG_FILE = os.environ.get(
    "DICTATION_LOG_FILE",
    "/home/universal/.gemini/tmp/428d098e581799ff7817b2001dd545f7b891975897338dd78498cc16582e004f/debug.log",
)
try:
    _log_dir = os.path.dirname(LOG_FILE)
    if _log_dir:
        # basicConfig(filename=...) raises if the parent directory is missing,
        # which would otherwise abort the service at import time.
        os.makedirs(_log_dir, exist_ok=True)
    logging.basicConfig(filename=LOG_FILE, level=logging.DEBUG)
except OSError:
    # The log location is unusable (missing home directory, permissions, ...):
    # keep the service running and log to stderr instead.
    logging.basicConfig(level=logging.DEBUG)
    logging.warning("Could not open log file %s; logging to stderr", LOG_FILE)
|
|
|
|
# --- Recognition / audio settings -------------------------------------------
# Name of the Vosk model directory; also used to build the download URL.
MODEL_NAME = "vosk-model-small-en-us-0.15"  # Small model (fast)
# Sample rate handed to both the recognizer and the input stream.
SAMPLE_RATE = 16000
# Block size handed to the input stream.
BLOCK_SIZE = 8000
# The main loop listens while this file exists and stops when it is removed.
LOCK_FILE = "listening.lock"

# --- Shared mutable state ----------------------------------------------------
# True while captured audio should be queued and recognized.
is_listening = False
# pynput controller used to type recognized text.
keyboard = Controller()
# Raw audio chunks travel from the audio callback to the main loop through q.
q = queue.Queue()
# The live-transcription window, or None when no window is open.
streaming_window = None
# Most recent partial transcript, used to skip duplicate updates.
last_partial_text = ""
# NOTE(review): never written by the code visible in this file.
typing_buffer = ""
|
|
|
|
class StreamingWindow(Gtk.Window):
    """Small floating window that shows real-time transcription.

    The public methods (``update_text``, ``hide_window``, ``close_window``)
    only schedule work with ``GLib.idle_add``; the widget itself is touched
    exclusively from the scheduled callbacks.
    """

    # Seconds a final result stays on screen before the window hides itself.
    AUTO_HIDE_SECONDS = 3

    def __init__(self):
        # Gdk is not imported at module level in this file, so import it here;
        # without this, Gdk.RGBA below raises NameError.
        from gi.repository import Gdk

        super().__init__(title="Live Dictation")
        self.set_default_size(400, 150)
        self.set_keep_above(True)
        self.set_decorated(True)
        self.set_resizable(True)
        self.set_position(Gtk.WindowPosition.MOUSE)
        # Recognized text is typed into whichever window has keyboard focus,
        # so this overlay must not take focus when it is shown.
        self.set_accept_focus(False)
        self.set_focus_on_map(False)

        # GLib source id of the pending auto-hide timeout, or None.
        self._hide_source_id = None
        # Set once the window has been destroyed; late updates are ignored.
        self._closed = False

        # Set styling
        self.set_border_width(10)
        self.override_background_color(
            Gtk.StateFlags.NORMAL, Gdk.RGBA(0.2, 0.2, 0.2, 0.9)
        )

        # Create label for showing text
        self.label = Gtk.Label()
        self.label.set_text("🎤 Listening...")
        self.label.set_justify(Gtk.Justification.LEFT)
        self.label.set_line_wrap(True)
        self.label.set_max_width_chars(50)

        # Style the label
        self.label.override_color(Gtk.StateFlags.NORMAL, Gdk.RGBA(1, 1, 1, 1))

        # Add to window
        self.add(self.label)
        self.show_all()

        logging.info("Streaming window created")

    def update_text(self, text, is_partial=False):
        """Schedule a label update.

        Args:
            text: Transcript to display.
            is_partial: True for an in-progress hypothesis, False for a
                final result (which also arms the auto-hide timeout).
        """
        GLib.idle_add(self._update_text_glib, text, is_partial)

    def _update_text_glib(self, text, is_partial):
        """Apply a label update; runs as a GLib idle callback."""
        if self._closed:
            return False

        prefix = "💭" if is_partial else "✅"
        self.label.set_text(f"{prefix} {text}")

        # New text supersedes any pending auto-hide, and must be visible even
        # if an earlier final result already hid the window.
        self._cancel_auto_hide()
        if not self.get_visible():
            self.show()

        # Auto-hide after a few seconds of final text
        if not is_partial and text:
            self._hide_source_id = GLib.timeout_add_seconds(
                self.AUTO_HIDE_SECONDS, self._on_auto_hide
            )
        return False  # one-shot idle callback

    def _cancel_auto_hide(self):
        """Remove the pending auto-hide timeout, if any."""
        if self._hide_source_id is not None:
            GLib.source_remove(self._hide_source_id)
            self._hide_source_id = None

    def _on_auto_hide(self):
        """Timeout callback: hide the window once."""
        self._hide_source_id = None
        if not self._closed:
            self.hide()
        return False  # do not repeat

    def hide_window(self):
        """Schedule hiding the window."""
        GLib.idle_add(self._hide_glib)

    def _hide_glib(self):
        """Hide the window; runs as a GLib idle callback."""
        self._cancel_auto_hide()
        if not self._closed:
            self.hide()
        return False

    def close_window(self):
        """Schedule destroying the window."""
        GLib.idle_add(self._close_glib)

    def _close_glib(self):
        """Destroy the window; runs as a GLib idle callback."""
        self._cancel_auto_hide()
        if not self._closed:
            self._closed = True
            self.destroy()
        return False
|
|
|
|
def send_notification(title, message):
    """Show a desktop notification through ``notify-send``, best effort.

    The command's output is captured and discarded. If the ``notify-send``
    executable cannot be found the call is silently skipped.
    """
    command = ["notify-send", "-t", "2000", title, message]
    try:
        subprocess.run(command, capture_output=True)
    except FileNotFoundError:
        # notify-send is not installed; notifications are optional.
        return
|
|
|
|
def download_model_if_needed():
    """Fetch and unpack the Vosk model unless ``MODEL_NAME`` already exists.

    The archive is downloaded with ``wget`` and extracted with ``unzip`` in
    the current working directory. Any failure is logged and the process
    exits with status 1.
    """
    if os.path.exists(MODEL_NAME):
        return

    logging.info(f"Model '{MODEL_NAME}' not found. Downloading...")
    fetch_cmd = ["wget", f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"]
    extract_cmd = ["unzip", f"{MODEL_NAME}.zip"]
    try:
        for cmd in (fetch_cmd, extract_cmd):
            subprocess.check_call(cmd)
        logging.info("Download complete.")
    except Exception as e:
        logging.error(f"Error downloading model: {e}")
        sys.exit(1)
|
|
|
|
def audio_callback(indata, frames, time, status):
    """Input-stream callback: queue captured audio while listening.

    A truthy ``status`` is logged as a warning. The chunk is converted to
    ``bytes`` and put on ``q`` only when ``is_listening`` is set; ``frames``
    and ``time`` are unused.
    """
    if status:
        logging.warning(status)
    if not is_listening:
        return
    q.put(bytes(indata))
|
|
|
|
def process_partial_text(text):
    """Show a partial (in-progress) transcript if it changed.

    Repeated identical partials are ignored. A new partial is remembered in
    ``last_partial_text``, logged, and forwarded to the streaming window when
    one exists.
    """
    global last_partial_text

    if text == last_partial_text:
        return

    last_partial_text = text
    logging.info(f"Partial: {text}")

    # Mirror the hypothesis in the overlay, if it is open.
    window = streaming_window
    if window:
        window.update_text(text, is_partial=True)
|
|
|
|
def process_final_text(text):
    """Display and type a final recognition result.

    Empty or whitespace-only input is ignored. Otherwise the text is
    stripped, its first character upper-cased, shown in the streaming window
    (when one exists) and typed through ``keyboard`` followed by a space.
    Typing errors are logged, not raised. ``last_partial_text`` is cleared
    afterwards.
    """
    global typing_buffer, last_partial_text

    stripped = text.strip() if text else ""
    if not stripped:
        return

    # Upper-case only the leading character; the rest is left untouched.
    formatted = stripped[:1].upper() + stripped[1:]
    logging.info(f"Final: {formatted}")

    window = streaming_window
    if window:
        window.update_text(formatted, is_partial=False)

    try:
        keyboard.type(formatted + " ")
        logging.info(f"Typed: {formatted}")
    except Exception as e:
        logging.error(f"Error typing: {e}")

    # The utterance is complete, so the next partial starts from scratch.
    last_partial_text = ""
|
|
|
|
def show_streaming_window():
    """Create and show the streaming window.

    Window creation is scheduled with ``GLib.idle_add`` and therefore runs on
    the thread that executes the GTK main loop. That loop is started in a
    daemon thread the first time it is needed and reused on later calls.
    If anything fails, the error is logged and a desktop notification is sent
    instead.
    """
    global streaming_window

    def notify_fallback(error):
        # Same message and fallback whether the failure happens while
        # scheduling or later inside the GTK callback.
        logging.error(f"Could not create streaming window: {error}")
        # Fallback to just notifications
        send_notification("Dictation", "🎤 Listening...")

    def create_window():
        global streaming_window
        try:
            streaming_window = StreamingWindow()
        except Exception as e:
            # Exceptions raised in an idle callback never reach the caller's
            # try/except, so handle them here.
            notify_fallback(e)
        return False  # one-shot idle callback

    try:
        from gi.repository import Gdk
        # Kept from the original; presumably redundant once Gtk is imported.
        Gdk.init([])

        GLib.idle_add(create_window)

        # Run the GTK main loop in a background thread, once. The original
        # called the PyGTK 2 module (`import gtk`), which is unavailable next
        # to gi, so the loop never started and the window never appeared.
        gtk_thread = getattr(show_streaming_window, "_gtk_thread", None)
        if gtk_thread is None or not gtk_thread.is_alive():
            gtk_thread = threading.Thread(target=Gtk.main, daemon=True)
            gtk_thread.start()
            show_streaming_window._gtk_thread = gtk_thread

        time.sleep(0.5)  # Give window time to appear

    except Exception as e:
        notify_fallback(e)
|
|
|
|
def hide_streaming_window():
    """Close the streaming window, if any, and forget the reference."""
    global streaming_window

    if not streaming_window:
        return

    window, streaming_window = streaming_window, None
    window.close_window()
|
|
|
|
def main():
    """Run the dictation service loop.

    Loads the Vosk model (downloading it first if necessary), opens the
    audio input stream and then loops forever:

    * the presence of ``LOCK_FILE`` switches listening on, its absence
      switches it off (with a notification and window show/hide each time);
    * while listening, queued audio chunks are fed to the recognizer and the
      resulting partial or final text is handed to ``process_partial_text``
      / ``process_final_text``.

    ``KeyboardInterrupt`` ends the loop and closes the window; any other
    exception escaping the loop is logged with its traceback.
    """
    global is_listening

    try:
        logging.info("Starting enhanced streaming dictation")

        # Model Setup
        download_model_if_needed()
        logging.info("Loading model...")
        model = Model(MODEL_NAME)
        recognizer = KaldiRecognizer(model, SAMPLE_RATE)
        logging.info("Model loaded successfully")

        logging.info("=== Enhanced Dictation Ready ===")
        logging.info("Features: Real-time streaming + visual feedback")

        with sd.RawInputStream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE,
                               dtype='int16', channels=1,
                               callback=audio_callback):
            logging.info("Audio stream opened")

            while True:
                # Check lock file for state changes
                lock_exists = os.path.exists(LOCK_FILE)

                if lock_exists and not is_listening:
                    # Chunks queued at the end of the previous session were
                    # never consumed; drop them and clear recognizer state so
                    # old speech is not typed into the new session.
                    while True:
                        try:
                            q.get_nowait()
                        except queue.Empty:
                            break
                    recognizer.Reset()

                    is_listening = True
                    logging.info("\n[Dictation] STARTED listening...")
                    send_notification("Dictation", "🎤 Streaming enabled")
                    show_streaming_window()

                elif not lock_exists and is_listening:
                    is_listening = False
                    logging.info("\n[Dictation] STOPPED listening.")
                    send_notification("Dictation", "🛑 Stopped")
                    hide_streaming_window()

                # If not listening, save CPU
                if not is_listening:
                    time.sleep(0.1)
                    continue

                # Process audio when listening
                try:
                    data = q.get(timeout=0.1)
                except queue.Empty:
                    continue

                try:
                    # Feed the chunk first: AcceptWaveform() returns True when
                    # an utterance is complete, otherwise the partial result
                    # reflects the audio just supplied.
                    if recognizer.AcceptWaveform(data):
                        result = json.loads(recognizer.Result())
                        final_text = result.get("text", "")
                        if final_text:
                            process_final_text(final_text)
                    else:
                        partial = json.loads(recognizer.PartialResult())
                        partial_text = partial.get("partial", "")
                        if partial_text:
                            process_partial_text(partial_text)
                except Exception as e:
                    # A bad chunk must not take the whole service down.
                    logging.error(f"Audio processing error: {e}")

    except KeyboardInterrupt:
        logging.info("\nExiting...")
        hide_streaming_window()
    except Exception as e:
        # logging.exception keeps the traceback alongside the message.
        logging.exception(f"Fatal error: {e}")
|
|
|
# Script entry point: run the dictation loop only when executed directly.
if __name__ == "__main__":
    main()