Implement Phase 2: Real offline speech-to-text with whisper.cpp

- Add SwiftWhisper integration for real whisper.cpp support with Metal acceleration
- Implement complete WhisperCPPEngine with audio transcription and text normalization
- Build ModelManager with curated catalog, downloads, and Core ML encoder support
- Create preferences window with model management UI (download, select, delete)
- Add NSStatusItem menu bar with model status display
- Integrate STT pipeline: hotkey → audio capture → whisper transcription
- Add model setup alerts when no model is loaded
- Support offline operation with performance targets met (<4s for 10s audio)
- Store models in ~/Library/Application Support/MenuWhisper/Models/

Phase 2 TECHSPEC requirements fully implemented and tested.
This commit is contained in:
Felipe M 2025-09-19 08:31:35 +02:00
parent 6e768a7753
commit 5663f3c3de
Signed by: fmartingr
GPG key ID: CCFBC5637D4000A8
12 changed files with 1500 additions and 100 deletions

View file

@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>en</string>
<key>CFBundleDisplayName</key>
<string>Menu-Whisper</string>
<key>CFBundleExecutable</key>
<string>MenuWhisper</string>
<key>CFBundleIdentifier</key>
<string>com.menuwhisper.app</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>
<string>Menu-Whisper</string>
<key>CFBundlePackageType</key>
<string>APPL</string>
<key>CFBundleShortVersionString</key>
<string>1.0.0</string>
<key>CFBundleVersion</key>
<string>1</string>
<key>LSMinimumSystemVersion</key>
<string>13.0</string>
<key>LSUIElement</key>
<true/>
<key>NSHumanReadableCopyright</key>
<string>Copyright © 2025. All rights reserved.</string>
<key>NSMicrophoneUsageDescription</key>
<string>Menu-Whisper needs access to your microphone to capture speech for offline transcription. Your audio data never leaves your device.</string>
<key>NSSupportsAutomaticTermination</key>
<true/>
<key>NSSupportsSuddenTermination</key>
<false/>
</dict>
</plist>

View file

@ -0,0 +1,77 @@
/* Menu-Whisper - English Localization */
/* General */
"app.name" = "Menu-Whisper";
"general.ok" = "OK";
"general.cancel" = "Cancel";
"general.continue" = "Continue";
"general.settings" = "Settings";
"general.quit" = "Quit";
/* Menu Bar */
"menubar.idle" = "Idle";
"menubar.listening" = "Listening";
"menubar.processing" = "Processing";
"menubar.preferences" = "Preferences...";
"menubar.quit" = "Quit Menu-Whisper";
/* HUD States */
"hud.listening" = "Listening...";
"hud.processing" = "Transcribing...";
"hud.cancel" = "Press Esc to cancel";
/* Permissions */
"permissions.microphone.title" = "Microphone Access Required";
"permissions.microphone.message" = "Menu-Whisper needs access to your microphone to perform speech-to-text transcription.";
"permissions.accessibility.title" = "Accessibility Access Required";
"permissions.accessibility.message" = "Menu-Whisper needs Accessibility access to insert transcribed text into applications.";
"permissions.input_monitoring.title" = "Input Monitoring Required";
"permissions.input_monitoring.message" = "Menu-Whisper needs Input Monitoring access to register global hotkeys.";
"permissions.open_settings" = "Open System Settings";
/* Preferences Window */
"preferences.title" = "Menu-Whisper Preferences";
"preferences.general" = "General";
"preferences.models" = "Models";
"preferences.hotkeys" = "Hotkeys";
"preferences.insertion" = "Text Insertion";
"preferences.advanced" = "Advanced";
/* General Preferences */
"preferences.general.hotkey" = "Global Hotkey:";
"preferences.general.mode" = "Activation Mode:";
"preferences.general.mode.push_to_talk" = "Push-to-talk";
"preferences.general.mode.toggle" = "Toggle";
"preferences.general.sounds" = "Play sounds for start/stop";
"preferences.general.limit" = "Dictation time limit (minutes):";
/* Model Preferences */
"preferences.models.title" = "Speech Recognition Models";
"preferences.models.active" = "Active Model:";
"preferences.models.language" = "Language:";
"preferences.models.language.auto" = "Auto-detect";
"preferences.models.download" = "Download";
"preferences.models.delete" = "Delete";
"preferences.models.size" = "Size:";
"preferences.models.languages" = "Languages:";
/* Insertion Preferences */
"preferences.insertion.method" = "Insertion Method:";
"preferences.insertion.method.paste" = "Paste (⌘V)";
"preferences.insertion.method.type" = "Type characters";
"preferences.insertion.preview" = "Show preview before inserting";
"preferences.insertion.secure_input" = "Secure Input Detected";
"preferences.insertion.secure_input.message" = "Text insertion is disabled in secure contexts. Text has been copied to clipboard.";
/* Errors */
"error.audio.failed" = "Failed to access microphone";
"error.model.not_found" = "Speech recognition model not found";
"error.model.load_failed" = "Failed to load speech recognition model";
"error.transcription.failed" = "Speech transcription failed";
"error.download.failed" = "Model download failed";
"error.download.verification_failed" = "Model verification failed";
/* Success Messages */
"success.model.downloaded" = "Model downloaded successfully";
"success.settings.exported" = "Settings exported successfully";
"success.settings.imported" = "Settings imported successfully";

View file

@ -0,0 +1,77 @@
/* Menu-Whisper - Spanish Localization */
/* General */
"app.name" = "Menu-Whisper";
"general.ok" = "Aceptar";
"general.cancel" = "Cancelar";
"general.continue" = "Continuar";
"general.settings" = "Configuración";
"general.quit" = "Salir";
/* Menu Bar */
"menubar.idle" = "Inactivo";
"menubar.listening" = "Escuchando";
"menubar.processing" = "Procesando";
"menubar.preferences" = "Preferencias...";
"menubar.quit" = "Salir de Menu-Whisper";
/* HUD States */
"hud.listening" = "Escuchando...";
"hud.processing" = "Transcribiendo...";
"hud.cancel" = "Presiona Esc para cancelar";
/* Permissions */
"permissions.microphone.title" = "Acceso al Micrófono Requerido";
"permissions.microphone.message" = "Menu-Whisper necesita acceso a tu micrófono para realizar la transcripción de voz a texto.";
"permissions.accessibility.title" = "Acceso de Accesibilidad Requerido";
"permissions.accessibility.message" = "Menu-Whisper necesita acceso de Accesibilidad para insertar texto transcrito en aplicaciones.";
"permissions.input_monitoring.title" = "Monitoreo de Entrada Requerido";
"permissions.input_monitoring.message" = "Menu-Whisper necesita acceso de Monitoreo de Entrada para registrar atajos de teclado globales.";
"permissions.open_settings" = "Abrir Configuración del Sistema";
/* Preferences Window */
"preferences.title" = "Preferencias de Menu-Whisper";
"preferences.general" = "General";
"preferences.models" = "Modelos";
"preferences.hotkeys" = "Atajos";
"preferences.insertion" = "Inserción de Texto";
"preferences.advanced" = "Avanzado";
/* General Preferences */
"preferences.general.hotkey" = "Atajo Global:";
"preferences.general.mode" = "Modo de Activación:";
"preferences.general.mode.push_to_talk" = "Presionar para hablar";
"preferences.general.mode.toggle" = "Alternar";
"preferences.general.sounds" = "Reproducir sonidos al iniciar/detener";
"preferences.general.limit" = "Límite de tiempo de dictado (minutos):";
/* Model Preferences */
"preferences.models.title" = "Modelos de Reconocimiento de Voz";
"preferences.models.active" = "Modelo Activo:";
"preferences.models.language" = "Idioma:";
"preferences.models.language.auto" = "Detección automática";
"preferences.models.download" = "Descargar";
"preferences.models.delete" = "Eliminar";
"preferences.models.size" = "Tamaño:";
"preferences.models.languages" = "Idiomas:";
/* Insertion Preferences */
"preferences.insertion.method" = "Método de Inserción:";
"preferences.insertion.method.paste" = "Pegar (⌘V)";
"preferences.insertion.method.type" = "Escribir caracteres";
"preferences.insertion.preview" = "Mostrar vista previa antes de insertar";
"preferences.insertion.secure_input" = "Entrada Segura Detectada";
"preferences.insertion.secure_input.message" = "La inserción de texto está deshabilitada en contextos seguros. El texto se ha copiado al portapapeles.";
/* Errors */
"error.audio.failed" = "Error al acceder al micrófono";
"error.model.not_found" = "Modelo de reconocimiento de voz no encontrado";
"error.model.load_failed" = "Error al cargar el modelo de reconocimiento de voz";
"error.transcription.failed" = "Error en la transcripción de voz";
"error.download.failed" = "Error en la descarga del modelo";
"error.download.verification_failed" = "Error en la verificación del modelo";
/* Success Messages */
"success.model.downloaded" = "Modelo descargado exitosamente";
"success.settings.exported" = "Configuración exportada exitosamente";
"success.settings.imported" = "Configuración importada exitosamente";

View file

@ -0,0 +1,160 @@
{
"models": [
{
"name": "whisper-tiny",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 39,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "tiny",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin",
"notes": "Fastest model, suitable for real-time applications with basic accuracy."
},
{
"name": "whisper-tiny.en",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 39,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "tiny",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin",
"notes": "English-only tiny model, slightly more accurate for English than multilingual tiny."
},
{
"name": "whisper-base",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 142,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "base",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin",
"notes": "Good balance of speed and accuracy, recommended for most use cases."
},
{
"name": "whisper-base.en",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 142,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "base",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin",
"notes": "English-only base model, optimal for English-only applications."
},
{
"name": "whisper-small",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 466,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "small",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
"notes": "Excellent balance of speed and accuracy for M1/M2/M3 machines."
},
{
"name": "whisper-small.en",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 466,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "small",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin",
"notes": "English-only small model, high accuracy for English-only use."
},
{
"name": "whisper-medium",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 1540,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "medium",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin",
"notes": "Higher accuracy but slower, requires more RAM (2-3GB)."
},
{
"name": "whisper-medium.en",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 1540,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "medium",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin",
"notes": "English-only medium model, very high accuracy for English."
},
{
"name": "whisper-large-v2",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 3090,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "large",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2.bin",
"notes": "Highest accuracy but slowest, requires significant RAM (4-5GB)."
},
{
"name": "whisper-large-v3",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 3090,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "large",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin",
"notes": "Latest large model with improved accuracy, requires significant RAM (4-5GB)."
},
{
"name": "distil-whisper-large-v2",
"family": "Distil-Whisper",
"format": "bin",
"size_mb": 1540,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "large",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/ggml-distil-large-v2.bin",
"notes": "Distilled large model (English-only), 2x faster than large-v2 with similar accuracy."
},
{
"name": "distil-whisper-large-v3",
"family": "Distil-Whisper",
"format": "bin",
"size_mb": 1540,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "large",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/distil-whisper/distil-large-v3/resolve/main/ggml-distil-large-v3.bin",
"notes": "Latest distilled model (English-only), excellent balance of speed and accuracy."
}
]
}