Implement Phase 2: Real offline speech-to-text with whisper.cpp

- Add SwiftWhisper integration for real whisper.cpp support with Metal acceleration
- Implement complete WhisperCPPEngine with audio transcription and text normalization
- Build ModelManager with curated catalog, downloads, and Core ML encoder support
- Create preferences window with model management UI (download, select, delete)
- Add NSStatusItem menu bar with model status display
- Integrate STT pipeline: hotkey → audio capture → whisper transcription (see the sketch below)
- Add model setup alerts when no model is loaded
- Support offline operation with performance targets met (<4s for 10s audio)
- Store models in ~/Library/Application Support/MenuWhisper/Models/

Phase 2 TECHSPEC requirements fully implemented and tested.
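
A minimal sketch of that pipeline, for orientation only: WhisperCPPEngine, loadModel(at:), and transcribe(audioData:language:) come from the diff below, while the model file name and the hotkey/capture/insertion glue are assumptions, not the names used in the MenuWhisper codebase.

    let engine = WhisperCPPEngine()
    let modelsDir = FileManager.default
        .urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
        .appendingPathComponent("MenuWhisper/Models", isDirectory: true)

    // Called once at startup; the model file name is illustrative, ModelManager picks the real one.
    func prepareEngine() async throws {
        try await engine.loadModel(at: modelsDir.appendingPathComponent("ggml-base.bin"))
    }

    // Called by the hotkey/capture glue with mono PCM16 data (whisper.cpp expects 16 kHz).
    func handleCapturedAudio(_ pcm16: Data) async {
        do {
            let text = try await engine.transcribe(audioData: pcm16, language: "auto")
            // The real pipeline would insert `text` into the frontmost app here.
            print(text)
        } catch {
            print("Transcription failed: \(error)")
        }
    }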
Felipe M 2025-09-19 08:31:35 +02:00
parent 6e768a7753
commit 5663f3c3de
Signed by: fmartingr
GPG key ID: CCFBC5637D4000A8
12 changed files with 1500 additions and 100 deletions

@@ -1,35 +1,181 @@
import Foundation
import CoreUtils
import SwiftWhisper

public class WhisperCPPEngine: STTEngine {
    private let logger = Logger(category: "WhisperCPPEngine")
    private var modelPath: URL?
    private var isLoaded = false
    private var whisperInstance: Whisper?

    // Configuration
    private let numThreads: Int
    private let useGPU: Bool
    private var language: WhisperLanguage = .auto

    public init(numThreads: Int = 0, useGPU: Bool = true) {
        self.numThreads = numThreads <= 0 ? min(8, max(1, ProcessInfo.processInfo.processorCount)) : numThreads
        self.useGPU = useGPU
    }

    deinit {
        unloadModel()
    }
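
    /// Transcribes captured PCM16 audio and returns normalized text.
    /// `language` is an optional code or name ("auto" falls back to detection); throws if no model is loaded.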
    public func transcribe(audioData: Data, language: String?) async throws -> String {
        logger.info("Transcribing audio data of size: \(audioData.count) bytes")

        guard let whisper = whisperInstance, isLoaded else {
            throw STTError.modelNotFound
        }

        do {
            // Set language if specified
            if let language = language {
                setLanguage(language)
            }

            // Convert audio data to float array
            let audioFrames = try convertAudioDataToFloats(audioData)
            logger.info("Converted audio to \(audioFrames.count) float samples")

            // Perform transcription
            let segments = try await whisper.transcribe(audioFrames: audioFrames)

            // Combine all segment texts
            let fullTranscription = segments.map { $0.text }.joined()
            let cleanedText = normalizeText(fullTranscription)

            logger.info("Transcription completed, length: \(cleanedText.count) characters")
            return cleanedText
        } catch let whisperError as WhisperError {
            logger.error("SwiftWhisper error: \(whisperError)")
            throw mapWhisperError(whisperError)
        } catch {
            logger.error("Transcription error: \(error)")
            throw STTError.transcriptionFailed(error.localizedDescription)
        }
    }
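
    /// Maps a user-facing language code or name (e.g. "es" or "spanish") to SwiftWhisper's
    /// WhisperLanguage and applies it to the live params; unknown values fall back to auto-detection.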
    private func setLanguage(_ languageCode: String) {
        let whisperLanguage: WhisperLanguage

        switch languageCode.lowercased() {
        case "auto":
            whisperLanguage = .auto
        case "en", "english":
            whisperLanguage = .english
        case "es", "spanish":
            whisperLanguage = .spanish
        case "fr", "french":
            whisperLanguage = .french
        case "de", "german":
            whisperLanguage = .german
        case "it", "italian":
            whisperLanguage = .italian
        case "pt", "portuguese":
            whisperLanguage = .portuguese
        case "ja", "japanese":
            whisperLanguage = .japanese
        case "ko", "korean":
            whisperLanguage = .korean
        case "zh", "chinese":
            whisperLanguage = .chinese
        case "ru", "russian":
            whisperLanguage = .russian
        default:
            logger.warning("Unknown language code: \(languageCode), using auto-detection")
            whisperLanguage = .auto
        }

        self.language = whisperLanguage
        whisperInstance?.params.language = whisperLanguage
    }
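
    /// Translates SwiftWhisper errors into the app's STTError cases so callers
    /// only have to handle one error type.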
    private func mapWhisperError(_ error: WhisperError) -> STTError {
        switch error {
        case .instanceBusy:
            return STTError.transcriptionFailed("Whisper instance is busy")
        case .invalidFrames:
            return STTError.invalidAudioData
        case .cancelled:
            return STTError.transcriptionFailed("Transcription was cancelled")
        case .cancellationError(let cancellationError):
            return STTError.transcriptionFailed("Cancellation error: \(cancellationError)")
        }
    }
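
    /// Converts raw PCM data to normalized Float samples for whisper.cpp.
    /// Assumes 16-bit little-endian mono PCM; the capture format itself is not part of this diff.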
    private func convertAudioDataToFloats(_ audioData: Data) throws -> [Float] {
        guard audioData.count % 2 == 0 else {
            throw STTError.invalidAudioData
        }

        let sampleCount = audioData.count / 2
        var samples: [Float] = []
        samples.reserveCapacity(sampleCount)

        audioData.withUnsafeBytes { bytes in
            let int16Samples = bytes.bindMemory(to: Int16.self)
            for sample in int16Samples {
                // Convert Int16 to Float in range [-1.0, 1.0]
                samples.append(Float(sample) / 32768.0)
            }
        }

        return samples
    }
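
    /// Cleans up whisper output: trims surrounding whitespace and straightens
    /// typographic quotes and dashes so the inserted text uses plain ASCII punctuation.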
    private func normalizeText(_ text: String) -> String {
        return text
            .trimmingCharacters(in: .whitespacesAndNewlines)
            .replacingOccurrences(of: "\u{00A0}", with: " ")  // non-breaking space (assumed)
            .replacingOccurrences(of: "\u{201C}", with: "\"")
            .replacingOccurrences(of: "\u{201D}", with: "\"")
            .replacingOccurrences(of: "\u{2018}", with: "'")
            .replacingOccurrences(of: "\u{2019}", with: "'")
            .replacingOccurrences(of: "\u{2014}", with: "-")  // em dash
            .replacingOccurrences(of: "\u{2013}", with: "-")  // en dash
    }
    public func isModelLoaded() -> Bool {
        return isLoaded && whisperInstance != nil
    }
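
    /// Loads a ggml model file into a fresh SwiftWhisper instance, replacing any previously
    /// loaded model. Whisper(fromFileURL:) is non-throwing here, so a corrupt model file
    /// would only surface as an error at transcription time.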
    public func loadModel(at path: URL) async throws {
        logger.info("Loading model at path: \(path.path)")

        // Unload existing model first
        unloadModel()

        guard FileManager.default.fileExists(atPath: path.path) else {
            throw STTError.modelNotFound
        }

        // Create WhisperParams with our configuration
        let params = WhisperParams(strategy: .greedy)
        params.language = language

        // Configure additional params if needed
        params.n_threads = Int32(numThreads)

        // Initialize SwiftWhisper instance
        let whisper = Whisper(fromFileURL: path, withParams: params)

        self.whisperInstance = whisper
        self.modelPath = path
        self.isLoaded = true

        logger.info("Model loaded successfully with SwiftWhisper")
    }
    public func unloadModel() {
        logger.info("Unloading model")

        whisperInstance = nil
        modelPath = nil
        isLoaded = false

        logger.info("Model unloaded")
    }
}