Implement Phase 2: Real offline speech-to-text with whisper.cpp

- Add SwiftWhisper integration for real whisper.cpp support with Metal acceleration
- Implement complete WhisperCPPEngine with audio transcription and text normalization
- Build ModelManager with curated catalog, downloads, and Core ML encoder support
- Create preferences window with model management UI (download, select, delete)
- Add NSStatusItem menu bar with model status display
- Integrate STT pipeline: hotkey → audio capture → whisper transcription
- Add model setup alerts when no model is loaded
- Support offline operation with performance targets met (<4s for 10s audio)
- Store models in ~/Library/Application Support/MenuWhisper/Models/

Phase 2 TECHSPEC requirements fully implemented and tested.
This commit is contained in:
Felipe M 2025-09-19 08:31:35 +02:00
parent 6e768a7753
commit 5663f3c3de
Signed by: fmartingr
GPG key ID: CCFBC5637D4000A8
12 changed files with 1500 additions and 100 deletions

View file

@@ -0,0 +1,160 @@
{
"models": [
{
"name": "whisper-tiny",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 39,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "tiny",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin",
"notes": "Fastest model, suitable for real-time applications with basic accuracy."
},
{
"name": "whisper-tiny.en",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 39,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "tiny",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin",
"notes": "English-only tiny model, slightly more accurate for English than multilingual tiny."
},
{
"name": "whisper-base",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 142,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "base",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin",
"notes": "Good balance of speed and accuracy, recommended for most use cases."
},
{
"name": "whisper-base.en",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 142,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "base",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin",
"notes": "English-only base model, optimal for English-only applications."
},
{
"name": "whisper-small",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 466,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "small",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
"notes": "Excellent balance of speed and accuracy for M1/M2/M3 machines."
},
{
"name": "whisper-small.en",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 466,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "small",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin",
"notes": "English-only small model, high accuracy for English-only use."
},
{
"name": "whisper-medium",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 1540,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "medium",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin",
"notes": "Higher accuracy but slower, requires more RAM (2-3GB)."
},
{
"name": "whisper-medium.en",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 1540,
"languages": ["en"],
"recommended_backend": "whisper.cpp",
"quality_tier": "medium",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin",
"notes": "English-only medium model, very high accuracy for English."
},
{
"name": "whisper-large-v2",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 3090,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "large",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2.bin",
"notes": "Highest accuracy but slowest, requires significant RAM (4-5GB)."
},
{
"name": "whisper-large-v3",
"family": "OpenAI-Whisper",
"format": "bin",
"size_mb": 3090,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "large",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin",
"notes": "Latest large model with improved accuracy, requires significant RAM (4-5GB)."
},
{
"name": "distil-whisper-large-v2",
"family": "Distil-Whisper",
"format": "bin",
"size_mb": 1540,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "large",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/ggml-distil-large-v2.bin",
"notes": "Distilled large model, 2x faster than large-v2 with similar accuracy."
},
{
"name": "distil-whisper-large-v3",
"family": "Distil-Whisper",
"format": "bin",
"size_mb": 1540,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "large",
"license": "MIT",
"sha256": "",
"download_url": "https://huggingface.co/distil-whisper/distil-large-v3/resolve/main/ggml-distil-large-v3.bin",
"notes": "Latest distilled model, excellent balance of speed and accuracy."
}
]
}