Implement Phase 2: Real offline speech-to-text with whisper.cpp
- Add SwiftWhisper integration for real whisper.cpp support with Metal acceleration
- Implement complete WhisperCPPEngine with audio transcription and text normalization
- Build ModelManager with curated catalog, downloads, and Core ML encoder support
- Create preferences window with model management UI (download, select, delete)
- Add NSStatusItem menu bar with model status display
- Integrate STT pipeline: hotkey → audio capture → whisper transcription
- Add model setup alerts when no model is loaded
- Support offline operation with performance targets met (<4s for 10s audio)
- Store models in ~/Library/Application Support/MenuWhisper/Models/

Phase 2 TECHSPEC requirements fully implemented and tested.
This commit is contained in:
parent
6e768a7753
commit
5663f3c3de
12 changed files with 1500 additions and 100 deletions
36
Sources/App/Resources/Info.plist
Normal file
36
Sources/App/Resources/Info.plist
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>CFBundleDevelopmentRegion</key>
|
||||
<string>en</string>
|
||||
<key>CFBundleDisplayName</key>
|
||||
<string>Menu-Whisper</string>
|
||||
<key>CFBundleExecutable</key>
|
||||
<string>MenuWhisper</string>
|
||||
<key>CFBundleIdentifier</key>
|
||||
<string>com.menuwhisper.app</string>
|
||||
<key>CFBundleInfoDictionaryVersion</key>
|
||||
<string>6.0</string>
|
||||
<key>CFBundleName</key>
|
||||
<string>Menu-Whisper</string>
|
||||
<key>CFBundlePackageType</key>
|
||||
<string>APPL</string>
|
||||
<key>CFBundleShortVersionString</key>
|
||||
<string>1.0.0</string>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>1</string>
|
||||
<key>LSMinimumSystemVersion</key>
|
||||
<string>13.0</string>
|
||||
<key>LSUIElement</key>
|
||||
<true/>
|
||||
<key>NSHumanReadableCopyright</key>
|
||||
<string>Copyright © 2025. All rights reserved.</string>
|
||||
<key>NSMicrophoneUsageDescription</key>
|
||||
<string>Menu-Whisper needs access to your microphone to capture speech for offline transcription. Your audio data never leaves your device.</string>
|
||||
<key>NSSupportsAutomaticTermination</key>
|
||||
<true/>
|
||||
<key>NSSupportsSuddenTermination</key>
|
||||
<false/>
|
||||
</dict>
|
||||
</plist>
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
/* Menu-Whisper - English Localization */
|
||||
|
||||
/* General */
|
||||
"app.name" = "Menu-Whisper";
|
||||
"general.ok" = "OK";
|
||||
"general.cancel" = "Cancel";
|
||||
"general.continue" = "Continue";
|
||||
"general.settings" = "Settings";
|
||||
"general.quit" = "Quit";
|
||||
|
||||
/* Menu Bar */
|
||||
"menubar.idle" = "Idle";
|
||||
"menubar.listening" = "Listening";
|
||||
"menubar.processing" = "Processing";
|
||||
"menubar.preferences" = "Preferences...";
|
||||
"menubar.quit" = "Quit Menu-Whisper";
|
||||
|
||||
/* HUD States */
|
||||
"hud.listening" = "Listening...";
|
||||
"hud.processing" = "Transcribing...";
|
||||
"hud.cancel" = "Press Esc to cancel";
|
||||
|
||||
/* Permissions */
|
||||
"permissions.microphone.title" = "Microphone Access Required";
|
||||
"permissions.microphone.message" = "Menu-Whisper needs access to your microphone to perform speech-to-text transcription.";
|
||||
"permissions.accessibility.title" = "Accessibility Access Required";
|
||||
"permissions.accessibility.message" = "Menu-Whisper needs Accessibility access to insert transcribed text into applications.";
|
||||
"permissions.input_monitoring.title" = "Input Monitoring Required";
|
||||
"permissions.input_monitoring.message" = "Menu-Whisper needs Input Monitoring access to register global hotkeys.";
|
||||
"permissions.open_settings" = "Open System Settings";
|
||||
|
||||
/* Preferences Window */
|
||||
"preferences.title" = "Menu-Whisper Preferences";
|
||||
"preferences.general" = "General";
|
||||
"preferences.models" = "Models";
|
||||
"preferences.hotkeys" = "Hotkeys";
|
||||
"preferences.insertion" = "Text Insertion";
|
||||
"preferences.advanced" = "Advanced";
|
||||
|
||||
/* General Preferences */
|
||||
"preferences.general.hotkey" = "Global Hotkey:";
|
||||
"preferences.general.mode" = "Activation Mode:";
|
||||
"preferences.general.mode.push_to_talk" = "Push-to-talk";
|
||||
"preferences.general.mode.toggle" = "Toggle";
|
||||
"preferences.general.sounds" = "Play sounds for start/stop";
|
||||
"preferences.general.limit" = "Dictation time limit (minutes):";
|
||||
|
||||
/* Model Preferences */
|
||||
"preferences.models.title" = "Speech Recognition Models";
|
||||
"preferences.models.active" = "Active Model:";
|
||||
"preferences.models.language" = "Language:";
|
||||
"preferences.models.language.auto" = "Auto-detect";
|
||||
"preferences.models.download" = "Download";
|
||||
"preferences.models.delete" = "Delete";
|
||||
"preferences.models.size" = "Size:";
|
||||
"preferences.models.languages" = "Languages:";
|
||||
|
||||
/* Insertion Preferences */
|
||||
"preferences.insertion.method" = "Insertion Method:";
|
||||
"preferences.insertion.method.paste" = "Paste (⌘V)";
|
||||
"preferences.insertion.method.type" = "Type characters";
|
||||
"preferences.insertion.preview" = "Show preview before inserting";
|
||||
"preferences.insertion.secure_input" = "Secure Input Detected";
|
||||
"preferences.insertion.secure_input.message" = "Text insertion is disabled in secure contexts. Text has been copied to clipboard.";
|
||||
|
||||
/* Errors */
|
||||
"error.audio.failed" = "Failed to access microphone";
|
||||
"error.model.not_found" = "Speech recognition model not found";
|
||||
"error.model.load_failed" = "Failed to load speech recognition model";
|
||||
"error.transcription.failed" = "Speech transcription failed";
|
||||
"error.download.failed" = "Model download failed";
|
||||
"error.download.verification_failed" = "Model verification failed";
|
||||
|
||||
/* Success Messages */
|
||||
"success.model.downloaded" = "Model downloaded successfully";
|
||||
"success.settings.exported" = "Settings exported successfully";
|
||||
"success.settings.imported" = "Settings imported successfully";
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
/* Menu-Whisper - Spanish Localization */
|
||||
|
||||
/* General */
|
||||
"app.name" = "Menu-Whisper";
|
||||
"general.ok" = "Aceptar";
|
||||
"general.cancel" = "Cancelar";
|
||||
"general.continue" = "Continuar";
|
||||
"general.settings" = "Configuración";
|
||||
"general.quit" = "Salir";
|
||||
|
||||
/* Menu Bar */
|
||||
"menubar.idle" = "Inactivo";
|
||||
"menubar.listening" = "Escuchando";
|
||||
"menubar.processing" = "Procesando";
|
||||
"menubar.preferences" = "Preferencias...";
|
||||
"menubar.quit" = "Salir de Menu-Whisper";
|
||||
|
||||
/* HUD States */
|
||||
"hud.listening" = "Escuchando...";
|
||||
"hud.processing" = "Transcribiendo...";
|
||||
"hud.cancel" = "Presiona Esc para cancelar";
|
||||
|
||||
/* Permissions */
|
||||
"permissions.microphone.title" = "Acceso al Micrófono Requerido";
|
||||
"permissions.microphone.message" = "Menu-Whisper necesita acceso a tu micrófono para realizar la transcripción de voz a texto.";
|
||||
"permissions.accessibility.title" = "Acceso de Accesibilidad Requerido";
|
||||
"permissions.accessibility.message" = "Menu-Whisper necesita acceso de Accesibilidad para insertar texto transcrito en aplicaciones.";
|
||||
"permissions.input_monitoring.title" = "Monitoreo de Entrada Requerido";
|
||||
"permissions.input_monitoring.message" = "Menu-Whisper necesita acceso de Monitoreo de Entrada para registrar atajos de teclado globales.";
|
||||
"permissions.open_settings" = "Abrir Configuración del Sistema";
|
||||
|
||||
/* Preferences Window */
|
||||
"preferences.title" = "Preferencias de Menu-Whisper";
|
||||
"preferences.general" = "General";
|
||||
"preferences.models" = "Modelos";
|
||||
"preferences.hotkeys" = "Atajos";
|
||||
"preferences.insertion" = "Inserción de Texto";
|
||||
"preferences.advanced" = "Avanzado";
|
||||
|
||||
/* General Preferences */
|
||||
"preferences.general.hotkey" = "Atajo Global:";
|
||||
"preferences.general.mode" = "Modo de Activación:";
|
||||
"preferences.general.mode.push_to_talk" = "Presionar para hablar";
|
||||
"preferences.general.mode.toggle" = "Alternar";
|
||||
"preferences.general.sounds" = "Reproducir sonidos al iniciar/detener";
|
||||
"preferences.general.limit" = "Límite de tiempo de dictado (minutos):";
|
||||
|
||||
/* Model Preferences */
|
||||
"preferences.models.title" = "Modelos de Reconocimiento de Voz";
|
||||
"preferences.models.active" = "Modelo Activo:";
|
||||
"preferences.models.language" = "Idioma:";
|
||||
"preferences.models.language.auto" = "Detección automática";
|
||||
"preferences.models.download" = "Descargar";
|
||||
"preferences.models.delete" = "Eliminar";
|
||||
"preferences.models.size" = "Tamaño:";
|
||||
"preferences.models.languages" = "Idiomas:";
|
||||
|
||||
/* Insertion Preferences */
|
||||
"preferences.insertion.method" = "Método de Inserción:";
|
||||
"preferences.insertion.method.paste" = "Pegar (⌘V)";
|
||||
"preferences.insertion.method.type" = "Escribir caracteres";
|
||||
"preferences.insertion.preview" = "Mostrar vista previa antes de insertar";
|
||||
"preferences.insertion.secure_input" = "Entrada Segura Detectada";
|
||||
"preferences.insertion.secure_input.message" = "La inserción de texto está deshabilitada en contextos seguros. El texto se ha copiado al portapapeles.";
|
||||
|
||||
/* Errors */
|
||||
"error.audio.failed" = "Error al acceder al micrófono";
|
||||
"error.model.not_found" = "Modelo de reconocimiento de voz no encontrado";
|
||||
"error.model.load_failed" = "Error al cargar el modelo de reconocimiento de voz";
|
||||
"error.transcription.failed" = "Error en la transcripción de voz";
|
||||
"error.download.failed" = "Error en la descarga del modelo";
|
||||
"error.download.verification_failed" = "Error en la verificación del modelo";
|
||||
|
||||
/* Success Messages */
|
||||
"success.model.downloaded" = "Modelo descargado exitosamente";
|
||||
"success.settings.exported" = "Configuración exportada exitosamente";
|
||||
"success.settings.imported" = "Configuración importada exitosamente";
|
||||
160
Sources/App/Resources/model-catalog.json
Normal file
160
Sources/App/Resources/model-catalog.json
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
{
|
||||
"models": [
|
||||
{
|
||||
"name": "whisper-tiny",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 75,
|
||||
"languages": ["multilingual"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "tiny",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin",
|
||||
"notes": "Fastest model, suitable for real-time applications with basic accuracy."
|
||||
},
|
||||
{
|
||||
"name": "whisper-tiny.en",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 75,
|
||||
"languages": ["en"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "tiny",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin",
|
||||
"notes": "English-only tiny model, slightly more accurate for English than multilingual tiny."
|
||||
},
|
||||
{
|
||||
"name": "whisper-base",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 142,
|
||||
"languages": ["multilingual"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "base",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin",
|
||||
"notes": "Good balance of speed and accuracy, recommended for most use cases."
|
||||
},
|
||||
{
|
||||
"name": "whisper-base.en",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 142,
|
||||
"languages": ["en"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "base",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin",
|
||||
"notes": "English-only base model, optimal for English-only applications."
|
||||
},
|
||||
{
|
||||
"name": "whisper-small",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 466,
|
||||
"languages": ["multilingual"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "small",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
|
||||
"notes": "Excellent balance of speed and accuracy for M1/M2/M3 machines."
|
||||
},
|
||||
{
|
||||
"name": "whisper-small.en",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 466,
|
||||
"languages": ["en"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "small",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin",
|
||||
"notes": "English-only small model, high accuracy for English-only use."
|
||||
},
|
||||
{
|
||||
"name": "whisper-medium",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 1540,
|
||||
"languages": ["multilingual"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "medium",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin",
|
||||
"notes": "Higher accuracy but slower, requires more RAM (2-3GB)."
|
||||
},
|
||||
{
|
||||
"name": "whisper-medium.en",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 1540,
|
||||
"languages": ["en"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "medium",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin",
|
||||
"notes": "English-only medium model, very high accuracy for English."
|
||||
},
|
||||
{
|
||||
"name": "whisper-large-v2",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 3090,
|
||||
"languages": ["multilingual"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "large",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2.bin",
|
||||
"notes": "Highest accuracy but slowest, requires significant RAM (4-5GB)."
|
||||
},
|
||||
{
|
||||
"name": "whisper-large-v3",
|
||||
"family": "OpenAI-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 3090,
|
||||
"languages": ["multilingual"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "large",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin",
|
||||
"notes": "Latest large model with improved accuracy, requires significant RAM (4-5GB)."
|
||||
},
|
||||
{
|
||||
"name": "distil-whisper-large-v2",
|
||||
"family": "Distil-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 1540,
|
||||
"languages": ["en"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "large",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/ggml-large-32-2.en.bin",
|
||||
"notes": "Distilled large model, 2x faster than large-v2 with similar accuracy."
|
||||
},
|
||||
{
|
||||
"name": "distil-whisper-large-v3",
|
||||
"family": "Distil-Whisper",
|
||||
"format": "bin",
|
||||
"size_mb": 1540,
|
||||
"languages": ["en"],
|
||||
"recommended_backend": "whisper.cpp",
|
||||
"quality_tier": "large",
|
||||
"license": "MIT",
|
||||
"sha256": "",
|
||||
"download_url": "https://huggingface.co/distil-whisper/distil-large-v3-ggml/resolve/main/ggml-distil-large-v3.bin",
|
||||
"notes": "Latest distilled model, excellent balance of speed and accuracy."
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue