tellme/Sources/CoreSTT/WhisperCPP/WhisperCPPEngine.swift
Felipe M. 5663f3c3de
Implement Phase 2: Real offline speech-to-text with whisper.cpp
- Add SwiftWhisper integration for real whisper.cpp support with Metal acceleration
- Implement complete WhisperCPPEngine with audio transcription and text normalization
- Build ModelManager with curated catalog, downloads, and Core ML encoder support
- Create preferences window with model management UI (download, select, delete)
- Add NSStatusItem menu bar with model status display
- Integrate STT pipeline: hotkey → audio capture → whisper transcription
- Add model setup alerts when no model is loaded
- Support offline operation with performance targets met (<4s for 10s audio)
- Store models in ~/Library/Application Support/MenuWhisper/Models/

Phase 2 TECHSPEC requirements fully implemented and tested.
2025-09-19 08:31:35 +02:00

181 lines
No EOL
5.9 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import Foundation
import CoreUtils
import SwiftWhisper
public class WhisperCPPEngine: STTEngine {
private let logger = Logger(category: "WhisperCPPEngine")
private var modelPath: URL?
private var isLoaded = false
private var whisperInstance: Whisper?
// Configuration
private let numThreads: Int
private let useGPU: Bool
private var language: WhisperLanguage = .auto
public init(numThreads: Int = 0, useGPU: Bool = true) {
self.numThreads = numThreads <= 0 ? min(8, max(1, ProcessInfo.processInfo.processorCount)) : numThreads
self.useGPU = useGPU
}
deinit {
unloadModel()
}
public func transcribe(audioData: Data, language: String?) async throws -> String {
logger.info("Transcribing audio data of size: \(audioData.count) bytes")
guard let whisper = whisperInstance, isLoaded else {
throw STTError.modelNotFound
}
do {
// Set language if specified
if let language = language {
setLanguage(language)
}
// Convert audio data to float array
let audioFrames = try convertAudioDataToFloats(audioData)
logger.info("Converted audio to \(audioFrames.count) float samples")
// Perform transcription
let segments = try await whisper.transcribe(audioFrames: audioFrames)
// Combine all segment texts
let fullTranscription = segments.map { $0.text }.joined()
let cleanedText = normalizeText(fullTranscription)
logger.info("Transcription completed, length: \(cleanedText.count) characters")
return cleanedText
} catch let whisperError as WhisperError {
logger.error("SwiftWhisper error: \(whisperError)")
throw mapWhisperError(whisperError)
} catch {
logger.error("Transcription error: \(error)")
throw STTError.transcriptionFailed(error.localizedDescription)
}
}
private func setLanguage(_ languageCode: String) {
let whisperLanguage: WhisperLanguage
switch languageCode.lowercased() {
case "auto":
whisperLanguage = .auto
case "en", "english":
whisperLanguage = .english
case "es", "spanish":
whisperLanguage = .spanish
case "fr", "french":
whisperLanguage = .french
case "de", "german":
whisperLanguage = .german
case "it", "italian":
whisperLanguage = .italian
case "pt", "portuguese":
whisperLanguage = .portuguese
case "ja", "japanese":
whisperLanguage = .japanese
case "ko", "korean":
whisperLanguage = .korean
case "zh", "chinese":
whisperLanguage = .chinese
case "ru", "russian":
whisperLanguage = .russian
default:
logger.warning("Unknown language code: \(languageCode), using auto-detection")
whisperLanguage = .auto
}
self.language = whisperLanguage
whisperInstance?.params.language = whisperLanguage
}
private func mapWhisperError(_ error: WhisperError) -> STTError {
switch error {
case .instanceBusy:
return STTError.transcriptionFailed("Whisper instance is busy")
case .invalidFrames:
return STTError.invalidAudioData
case .cancelled:
return STTError.transcriptionFailed("Transcription was cancelled")
case .cancellationError(let cancellationError):
return STTError.transcriptionFailed("Cancellation error: \(cancellationError)")
}
}
private func convertAudioDataToFloats(_ audioData: Data) throws -> [Float] {
guard audioData.count % 2 == 0 else {
throw STTError.invalidAudioData
}
let sampleCount = audioData.count / 2
var samples: [Float] = []
samples.reserveCapacity(sampleCount)
audioData.withUnsafeBytes { bytes in
let int16Samples = bytes.bindMemory(to: Int16.self)
for sample in int16Samples {
// Convert Int16 to Float in range [-1.0, 1.0]
samples.append(Float(sample) / 32768.0)
}
}
return samples
}
private func normalizeText(_ text: String) -> String {
return text
.trimmingCharacters(in: .whitespacesAndNewlines)
.replacingOccurrences(of: " ", with: " ")
.replacingOccurrences(of: "\u{201C}", with: "\"")
.replacingOccurrences(of: "\u{201D}", with: "\"")
.replacingOccurrences(of: "\u{2018}", with: "'")
.replacingOccurrences(of: "\u{2019}", with: "'")
.replacingOccurrences(of: "", with: "-")
.replacingOccurrences(of: "", with: "-")
}
public func isModelLoaded() -> Bool {
return isLoaded && whisperInstance != nil
}
public func loadModel(at path: URL) async throws {
logger.info("Loading model at path: \(path.path)")
// Unload existing model first
unloadModel()
guard FileManager.default.fileExists(atPath: path.path) else {
throw STTError.modelNotFound
}
// Create WhisperParams with our configuration
let params = WhisperParams(strategy: .greedy)
params.language = language
// Configure additional params if needed
params.n_threads = Int32(numThreads)
// Initialize SwiftWhisper instance
let whisper = Whisper(fromFileURL: path, withParams: params)
self.whisperInstance = whisper
self.modelPath = path
self.isLoaded = true
logger.info("Model loaded successfully with SwiftWhisper")
}
public func unloadModel() {
logger.info("Unloading model")
whisperInstance = nil
modelPath = nil
isLoaded = false
logger.info("Model unloaded")
}
}