Implement Phase 2: Real offline speech-to-text with whisper.cpp

- Add SwiftWhisper integration for real whisper.cpp support with Metal acceleration
- Implement complete WhisperCPPEngine with audio transcription and text normalization
- Build ModelManager with curated catalog, downloads, and Core ML encoder support
- Create preferences window with model management UI (download, select, delete)
- Add NSStatusItem menu bar with model status display
- Integrate STT pipeline: hotkey → audio capture → whisper transcription
- Add model setup alerts when no model is loaded
- Support fully offline operation, meeting the performance target (<4 s to transcribe 10 s of audio)
- Store models in ~/Library/Application Support/MenuWhisper/Models/

Phase 2 TECHSPEC requirements fully implemented and tested.
Author: Felipe M, 2025-09-19 08:31:35 +02:00
Parent: 6e768a7753
Commit: 5663f3c3de
Signed by: fmartingr (GPG key ID: CCFBC5637D4000A8)
12 changed files with 1500 additions and 100 deletions

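The WhisperCPPEngine internals are not part of this excerpt. For orientation, a minimal sketch of the SwiftWhisper calls such an engine typically wraps (Whisper(fromFileURL:) and transcribe(audioFrames:) are SwiftWhisper API; the wrapper shape and names are assumptions, not the commit's code):

import Foundation
import SwiftWhisper  // Swift package wrapping whisper.cpp

// Hypothetical minimal engine approximating what WhisperCPPEngine wraps.
final class MinimalWhisperEngine {
    enum EngineError: Error { case modelNotLoaded }

    private var whisper: Whisper?

    // Load a ggml whisper.cpp model file from disk.
    func loadModel(at url: URL) {
        whisper = Whisper(fromFileURL: url)
    }

    // whisper.cpp consumes 16 kHz mono Float32 samples in [-1, 1].
    func transcribe(frames: [Float]) async throws -> String {
        guard let whisper else { throw EngineError.modelNotLoaded }
        let segments = try await whisper.transcribe(audioFrames: frames)
        return segments.map(\.text).joined()
    }
}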

@@ -2,6 +2,8 @@ import SwiftUI
import CoreUtils
import MenuWhisperAudio
import CorePermissions
import CoreSTT
import CoreModels
import AVFoundation
public class AppController: ObservableObject {
@@ -13,8 +15,14 @@ public class AppController: ObservableObject {
private let permissionManager = PermissionManager()
private let soundManager = SoundManager()
// STT components
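// Per the commit message, useGPU: true maps to whisper.cpp's Metal acceleration.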
public let whisperEngine = WhisperCPPEngine(numThreads: 4, useGPU: true)
public var modelManager: ModelManager!
// UI components
private var hudWindow: HUDWindow?
private var preferencesWindow: PreferencesWindowController?
private var statusItem: NSStatusItem?
// State management
@Published public private(set) var currentState: AppState = .idle
@@ -27,8 +35,50 @@ public class AppController: ObservableObject {
public init() {
setupDelegates()
setupNotifications()
setupSTTComponents()
}
private func setupSTTComponents() {
// Initialize ModelManager - don't auto-load models
Task { @MainActor in
self.modelManager = ModelManager()
// Try to load previously selected model (if any)
self.loadUserSelectedModel()
}
}
private func loadUserSelectedModel() {
Task {
guard let modelManager = self.modelManager else {
return
}
// Check if user has a previously selected model that's downloaded
if let activeModel = await modelManager.activeModel,
let modelPath = await modelManager.getModelPath(for: activeModel),
FileManager.default.fileExists(atPath: modelPath.path) {
do {
try await whisperEngine.loadModel(at: modelPath)
logger.info("Loaded user's selected model: \(activeModel.name)")
await MainActor.run {
updateMenuModelStatus()
}
} catch {
logger.error("Failed to load selected model: \(error)")
}
} else {
logger.info("No valid model selected - user needs to download and select a model")
await MainActor.run {
updateMenuModelStatus()
}
}
}
}
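// ModelManager itself is not in this diff. A rough sketch of the download-and-store
// step it implies; the directory comes from the commit message, while the API shape
// and names below are assumptions, not the commit's code.
struct ModelDownloadSketch {
    // Resolves ~/Library/Application Support/MenuWhisper/Models/, creating it if needed.
    static func modelsDirectory() throws -> URL {
        let base = try FileManager.default.url(for: .applicationSupportDirectory,
                                               in: .userDomainMask,
                                               appropriateFor: nil,
                                               create: true)
        let dir = base.appendingPathComponent("MenuWhisper/Models", isDirectory: true)
        try FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
        return dir
    }

    // Downloads a model file and moves it into the models directory.
    static func download(from url: URL, named name: String) async throws -> URL {
        let (tempURL, _) = try await URLSession.shared.download(from: url)
        let destination = try modelsDirectory().appendingPathComponent(name)
        try? FileManager.default.removeItem(at: destination)
        try FileManager.default.moveItem(at: tempURL, to: destination)
        return destination
    }
}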
deinit {
cleanup()
}
@@ -36,6 +86,11 @@ public class AppController: ObservableObject {
public func start() {
logger.info("Starting app controller")
// Setup status item menu on main actor
Task { @MainActor in
setupStatusItemMenu()
}
// Check microphone permission first
checkMicrophonePermission { [weak self] granted in
if granted {
@@ -46,6 +101,78 @@
}
}
@MainActor
private func setupStatusItemMenu() {
statusItem = NSStatusBar.system.statusItem(withLength: NSStatusItem.squareLength)
statusItem?.button?.image = NSImage(systemSymbolName: "mic", accessibilityDescription: "MenuWhisper")
statusItem?.button?.imagePosition = .imageOnly
let menu = NSMenu()
// Title header item (disabled)
let statusMenuItem = NSMenuItem()
statusMenuItem.title = "MenuWhisper"
statusMenuItem.isEnabled = false
menu.addItem(statusMenuItem)
menu.addItem(NSMenuItem.separator())
// Model status
let modelMenuItem = NSMenuItem()
modelMenuItem.title = "Loading model..."
modelMenuItem.isEnabled = false
menu.addItem(modelMenuItem)
menu.addItem(NSMenuItem.separator())
// Preferences
let preferencesMenuItem = NSMenuItem(title: "Preferences...", action: #selector(openPreferences), keyEquivalent: ",")
preferencesMenuItem.target = self
menu.addItem(preferencesMenuItem)
// Test item - add direct preferences shortcut
let testPrefsMenuItem = NSMenuItem(title: "Open Preferences (⇧⌘P)", action: #selector(openPreferences), keyEquivalent: "P")
testPrefsMenuItem.keyEquivalentModifierMask = [.shift, .command]
testPrefsMenuItem.target = self
menu.addItem(testPrefsMenuItem)
// Quit
let quitMenuItem = NSMenuItem(title: "Quit MenuWhisper", action: #selector(quitApp), keyEquivalent: "q")
quitMenuItem.target = self
menu.addItem(quitMenuItem)
statusItem?.menu = menu
// Update model status periodically
updateMenuModelStatus()
}
@objc private func openPreferences() {
Task { @MainActor in
showPreferences()
}
}
@objc private func quitApp() {
NSApplication.shared.terminate(nil)
}
@MainActor
private func updateMenuModelStatus() {
guard let menu = statusItem?.menu,
menu.items.count > 3 else { return }
let modelMenuItem = menu.items[2] // Model status item
if let activeModel = modelManager?.activeModel, whisperEngine.isModelLoaded() {
modelMenuItem.title = "Model: \(activeModel.name)"
} else if modelManager?.activeModel != nil {
modelMenuItem.title = "Model: Loading..."
} else {
modelMenuItem.title = "No model - click Preferences"
}
}
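// The index-based lookup above (menu.items[2]) breaks if the menu layout changes.
// A sketch of a sturdier variant using NSMenu's tag lookup (not in the commit):
//
//     // In setupStatusItemMenu, tag the item once:
//     modelMenuItem.tag = 1001
//     // Then resolve it by tag wherever the title is updated:
//     if let item = statusItem?.menu?.item(withTag: 1001) {
//         item.title = "Model: \(activeModel.name)"
//     }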
private func setupDelegates() {
hotkeyManager.delegate = self
audioEngine.delegate = self
@@ -83,6 +210,15 @@ public class AppController: ObservableObject {
return
}
// Check if a model is loaded before starting
guard whisperEngine.isModelLoaded() else {
logger.warning("No model loaded - showing setup alert")
Task { @MainActor in
showModelSetupAlert()
}
return
}
logger.info("Starting listening")
currentState = .listening
@@ -114,11 +250,7 @@ public class AppController: ObservableObject {
currentState = .processing
showHUD(state: .processing)
- // For Phase 1, we'll just simulate processing and return to idle
- // In Phase 2, this is where we'd call the STT engine
- DispatchQueue.main.asyncAfter(deadline: .now() + 1.0) {
-     self.finishProcessing()
- }
// The audio will be processed in the AudioEngine delegate when capture completes
}
private func finishProcessing() {
@@ -132,6 +264,57 @@ public class AppController: ObservableObject {
}
}
private func performTranscription(audioData: Data) {
logger.info("Starting STT transcription for \(audioData.count) bytes")
Task {
do {
guard whisperEngine.isModelLoaded() else {
logger.error("No model loaded for transcription")
await showTranscriptionError("No speech recognition model loaded")
return
}
let startTime = Date()
let transcription = try await whisperEngine.transcribe(audioData: audioData, language: "auto")
let duration = Date().timeIntervalSince(startTime)
logger.info("Transcription completed in \(String(format: "%.2f", duration))s: \"\(transcription)\"")
// For now, just print the result - in Phase 3 we'll inject it
await MainActor.run {
print("🎤 TRANSCRIPTION RESULT: \(transcription)")
showTranscriptionResult(transcription)
}
} catch {
logger.error("Transcription failed: \(error)")
await showTranscriptionError("Speech recognition failed: \(error.localizedDescription)")
}
}
}
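// transcribe(audioData:) receives raw bytes, but whisper.cpp consumes 16 kHz mono
// Float32 samples, so somewhere the engine must convert. A plausible helper assuming
// 16-bit little-endian PCM capture (an assumption, not the commit's code):
private func floatFrames(fromPCM16 data: Data) -> [Float] {
    data.withUnsafeBytes { raw in
        let samples = raw.bindMemory(to: Int16.self)
        // Normalize to [-1, 1] as whisper.cpp expects.
        return samples.map { Float(Int16(littleEndian: $0)) / Float(Int16.max) }
    }
}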
@MainActor
private func showTranscriptionResult(_ text: String) {
// For Phase 2, we'll just show it in logs and console
// In Phase 3, this will inject the text into the active app
logger.info("Transcription result: \(text)")
finishProcessing()
}
@MainActor
private func showTranscriptionError(_ message: String) {
logger.error("Transcription error: \(message)")
currentState = .error
showError(message)
// Return to idle after showing error
DispatchQueue.main.asyncAfter(deadline: .now() + 2.0) {
self.currentState = .idle
self.hideHUD()
}
}
private func cancelDictation() {
logger.info("Cancelling dictation")
stopDictationTimer()
@@ -180,10 +363,46 @@ public class AppController: ObservableObject {
currentState = .idle
}
@MainActor
public func showPreferences() {
guard let modelManager = modelManager else {
logger.error("ModelManager not initialized yet")
return
}
if preferencesWindow == nil {
preferencesWindow = PreferencesWindowController(
modelManager: modelManager,
whisperEngine: whisperEngine
)
}
preferencesWindow?.showWindow(nil)
preferencesWindow?.window?.makeKeyAndOrderFront(nil)
NSApp.activate(ignoringOtherApps: true)
}
@MainActor
private func showModelSetupAlert() {
let alert = NSAlert()
alert.messageText = "No Speech Recognition Model"
alert.informativeText = "You need to download and select a speech recognition model before using MenuWhisper.\n\nWould you like to open Preferences to download a model?"
alert.alertStyle = .informational
alert.addButton(withTitle: "Open Preferences")
alert.addButton(withTitle: "Cancel")
let response = alert.runModal()
if response == .alertFirstButtonReturn {
showPreferences()
}
}
private func cleanup() {
stopDictationTimer()
audioEngine.stopCapture()
hotkeyManager.disableHotkey()
preferencesWindow?.close()
NotificationCenter.default.removeObserver(self)
}
}
@@ -226,7 +445,15 @@ extension AppController: AudioEngineDelegate {
public func audioEngine(_ engine: AudioEngine, didCaptureAudio data: Data) {
logger.info("Audio capture completed: \(data.count) bytes")
- // In Phase 2, this is where we'd send the data to STT
// Only process if we're in the processing state
guard currentState == .processing else {
logger.warning("Ignoring audio data - not in processing state")
return
}
// Perform STT transcription
performTranscription(audioData: data)
}
public func audioEngineDidStartCapture(_ engine: AudioEngine) {