Implement Phase 1: Global hotkey, HUD, and audio capture

Add complete listening UX without STT:
- Global hotkey manager with ⌘⇧V, push-to-talk and toggle modes
- Floating HUD with real-time RMS audio visualization
- AVAudioEngine capture with 16kHz mono PCM conversion
- 10-minute dictation timeout with ESC cancellation
- Optional start/stop sounds and microphone permissions
- Permission management for accessibility and input monitoring

All Phase 1 acceptance criteria met.
This commit is contained in:
Felipe M 2025-09-18 20:06:46 +02:00
parent 1db16227b2
commit 6e768a7753
Signed by: fmartingr
GPG key ID: CCFBC5637D4000A8
10 changed files with 1005 additions and 51 deletions

36
Resources/Info.plist Normal file
View file

@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>en</string>
<key>CFBundleDisplayName</key>
<string>Menu-Whisper</string>
<key>CFBundleExecutable</key>
<string>MenuWhisper</string>
<key>CFBundleIdentifier</key>
<string>com.menuwhisper.app</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>
<string>Menu-Whisper</string>
<key>CFBundlePackageType</key>
<string>APPL</string>
<key>CFBundleShortVersionString</key>
<string>1.0.0</string>
<key>CFBundleVersion</key>
<string>1</string>
<key>LSMinimumSystemVersion</key>
<string>13.0</string>
<key>LSUIElement</key>
<true/>
<key>NSHumanReadableCopyright</key>
<string>Copyright © 2025. All rights reserved.</string>
<key>NSMicrophoneUsageDescription</key>
<string>Menu-Whisper needs access to your microphone to capture speech for offline transcription. Your audio data never leaves your device.</string>
<key>NSSupportsAutomaticTermination</key>
<true/>
<key>NSSupportsSuddenTermination</key>
<false/>
</dict>
</plist>

View file

@ -0,0 +1,239 @@
import SwiftUI
import CoreUtils
import MenuWhisperAudio
import CorePermissions
import AVFoundation
/// Coordinates the Phase 1 dictation flow: global hotkey → audio capture →
/// HUD feedback → (simulated) processing. Owns the app-level state machine
/// (idle → listening → processing → idle) and all long-lived services.
public class AppController: ObservableObject {
    private let logger = Logger(category: "AppController")

    // Core components
    private let hotkeyManager = HotkeyManager()
    private let audioEngine = AudioEngine()
    private let permissionManager = PermissionManager()
    private let soundManager = SoundManager()

    // UI components
    private var hudWindow: HUDWindow?

    // State management
    @Published public private(set) var currentState: AppState = .idle
    @Published public var isToggleListening = false

    // Dictation timer: auto-stops an over-long session.
    private var dictationTimer: Timer?
    private let maxDictationDuration: TimeInterval = 600 // 10 minutes default

    public init() {
        setupDelegates()
        setupNotifications()
    }

    deinit {
        cleanup()
    }

    /// Entry point called at launch: verifies microphone access and, when
    /// granted, registers the global hotkey.
    public func start() {
        logger.info("Starting app controller")
        // Check microphone permission first
        checkMicrophonePermission { [weak self] granted in
            if granted {
                self?.setupHotkey()
            } else {
                self?.logger.warning("Microphone permission not granted")
            }
        }
    }

    private func setupDelegates() {
        hotkeyManager.delegate = self
        audioEngine.delegate = self
    }

    private func setupNotifications() {
        NotificationCenter.default.addObserver(
            self,
            selector: #selector(handleHUDEscape),
            name: .hudEscapePressed,
            object: nil
        )
    }

    private func setupHotkey() {
        hotkeyManager.enableHotkey()
    }

    /// Requests microphone access and reports the result on the main queue.
    private func checkMicrophonePermission(completion: @escaping (Bool) -> Void) {
        permissionManager.requestMicrophonePermission { status in
            DispatchQueue.main.async {
                completion(status == .granted)
            }
        }
    }

    /// Esc pressed in the HUD cancels the whole session.
    @objc private func handleHUDEscape() {
        logger.info("HUD escape pressed - cancelling dictation")
        cancelDictation()
    }

    /// Begins a listening session: starts capture, shows the HUD, arms the
    /// timeout timer, and plays the start cue. Only valid from `.idle`.
    private func startListening() {
        guard currentState == .idle else {
            logger.warning("Cannot start listening from state: \(currentState)")
            return
        }
        logger.info("Starting listening")
        currentState = .listening
        do {
            try audioEngine.startCapture()
            showHUD(state: .listening(level: 0))
            startDictationTimer()
            soundManager.playStartSound()
        } catch {
            logger.error("Failed to start audio capture: \(error)")
            currentState = .error
            soundManager.playErrorSound()
            showError("Failed to start microphone: \(error.localizedDescription)")
        }
    }

    /// Ends a listening session and transitions to `.processing`.
    /// Phase 1 only simulates processing; Phase 2 will invoke STT here.
    private func stopListening() {
        guard currentState == .listening else {
            logger.warning("Cannot stop listening from state: \(currentState)")
            return
        }
        logger.info("Stopping listening")
        stopDictationTimer()
        audioEngine.stopCapture()
        soundManager.playStopSound()
        // Transition to processing state
        currentState = .processing
        showHUD(state: .processing)
        // For Phase 1, we'll just simulate processing and return to idle.
        // Capture self weakly so a deallocated controller isn't kept alive
        // (or mutated) by the pending one-second simulation.
        DispatchQueue.main.asyncAfter(deadline: .now() + 1.0) { [weak self] in
            self?.finishProcessing()
        }
    }

    /// Returns the app to `.idle` after (simulated) processing completes.
    private func finishProcessing() {
        logger.info("Finishing processing")
        currentState = .idle
        hideHUD()
        // Reset toggle state if in toggle mode
        if hotkeyManager.currentMode == .toggle {
            isToggleListening = false
        }
    }

    /// Aborts the current session (Esc): stops capture and hides the HUD
    /// without entering the processing phase.
    private func cancelDictation() {
        logger.info("Cancelling dictation")
        stopDictationTimer()
        if audioEngine.isCapturing {
            audioEngine.stopCapture()
        }
        currentState = .idle
        hideHUD()
        // Reset toggle state
        if hotkeyManager.currentMode == .toggle {
            isToggleListening = false
        }
    }

    /// Arms (or re-arms) the one-shot timeout that force-stops listening
    /// after `maxDictationDuration`.
    private func startDictationTimer() {
        stopDictationTimer() // Clean up any existing timer
        dictationTimer = Timer.scheduledTimer(withTimeInterval: maxDictationDuration, repeats: false) { [weak self] _ in
            self?.logger.info("Dictation timeout reached")
            self?.stopListening()
        }
    }

    private func stopDictationTimer() {
        dictationTimer?.invalidate()
        dictationTimer = nil
    }

    /// Lazily creates the HUD window and presents the given state.
    private func showHUD(state: HUDState) {
        if hudWindow == nil {
            hudWindow = HUDWindow()
        }
        hudWindow?.show(state: state)
    }

    private func hideHUD() {
        hudWindow?.hide()
    }

    private func showError(_ message: String) {
        logger.error("Error: \(message)")
        // TODO: Show error dialog in a later phase
        currentState = .idle
    }

    /// Tears down timers, capture, the hotkey, and observers. Called from deinit.
    private func cleanup() {
        stopDictationTimer()
        audioEngine.stopCapture()
        hotkeyManager.disableHotkey()
        NotificationCenter.default.removeObserver(self)
    }
}

// MARK: - HotkeyManagerDelegate
extension AppController: HotkeyManagerDelegate {
    /// Translates hotkey events into session transitions according to mode.
    public func hotkeyPressed(mode: HotkeyMode, isKeyDown: Bool) {
        logger.debug("Hotkey pressed: mode=\(mode), isKeyDown=\(isKeyDown)")
        switch mode {
        case .pushToTalk:
            // Key down starts; key up stops (if a session is active).
            if isKeyDown {
                startListening()
            } else {
                if currentState == .listening {
                    stopListening()
                }
            }
        case .toggle:
            if isKeyDown { // Only respond to key down in toggle mode
                if currentState == .idle && !isToggleListening {
                    isToggleListening = true
                    startListening()
                } else if currentState == .listening && isToggleListening {
                    isToggleListening = false
                    stopListening()
                }
            }
        }
    }
}

// MARK: - AudioEngineDelegate
extension AppController: AudioEngineDelegate {
    /// Forwards live level updates to the HUD meter.
    public func audioEngine(_ engine: AudioEngine, didUpdateLevel level: Float) {
        // Update HUD with new level
        hudWindow?.updateLevel(level)
    }

    public func audioEngine(_ engine: AudioEngine, didCaptureAudio data: Data) {
        logger.info("Audio capture completed: \(data.count) bytes")
        // In Phase 2, this is where we'd send the data to STT
    }

    public func audioEngineDidStartCapture(_ engine: AudioEngine) {
        logger.info("Audio engine started capture")
    }

    public func audioEngineDidStopCapture(_ engine: AudioEngine) {
        logger.info("Audio engine stopped capture")
    }
}

214
Sources/App/HUDWindow.swift Normal file
View file

@ -0,0 +1,214 @@
import SwiftUI
import AppKit
import CoreUtils
/// Visual states the floating HUD can present.
public enum HUDState {
    case hidden
    /// Actively capturing audio; `level` is the normalized (0...1) RMS
    /// value driving the bar meter.
    case listening(level: Float)
    /// Capture finished; transcription (simulated in Phase 1) is running.
    case processing
}
/// Floating, non-activating panel that hosts the SwiftUI HUD. It is shown
/// centered on the main screen, joins all Spaces (including full-screen
/// apps), and fades in/out over 0.2 s.
public class HUDWindow: NSPanel {
    private var hostingView: NSHostingView<HUDContentView>?

    public init() {
        super.init(
            contentRect: NSRect(x: 0, y: 0, width: 320, height: 160),
            styleMask: [.nonactivatingPanel],
            backing: .buffered,
            defer: false
        )
        setupWindow()
        setupContentView()
    }

    private func setupWindow() {
        level = .floating
        isOpaque = false
        backgroundColor = NSColor.clear
        hasShadow = true
        isMovable = false
        // Visible on every Space and over full-screen apps.
        collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary]
    }

    private func setupContentView() {
        let hudContentView = HUDContentView()
        hostingView = NSHostingView(rootView: hudContentView)
        if let hostingView = hostingView {
            contentView = hostingView
        }
    }

    /// Presents (or updates) the HUD with the given state, fading in if
    /// necessary. Re-centers on every call in case the screen changed.
    public func show(state: HUDState) {
        centerOnScreen()
        hostingView?.rootView.updateState(state)
        if !isVisible {
            alphaValue = 0
            orderFront(nil)
        }
        // Always animate toward full opacity so a show() issued during a
        // hide() fade-out recovers instead of being clobbered.
        NSAnimationContext.runAnimationGroup { context in
            context.duration = 0.2
            self.animator().alphaValue = 1.0
        }
    }

    /// Fades the HUD out and orders it off-screen.
    public func hide() {
        guard isVisible else { return }
        NSAnimationContext.runAnimationGroup({ context in
            context.duration = 0.2
            self.animator().alphaValue = 0
        }, completionHandler: {
            // Only order out if a show() didn't restart the fade-in while
            // this animation was running.
            if self.alphaValue == 0 {
                self.orderOut(nil)
            }
        })
    }

    /// Pushes a fresh audio level into the listening view.
    public func updateLevel(_ level: Float) {
        hostingView?.rootView.updateState(.listening(level: level))
    }

    /// Centers the panel in the main screen's visible frame.
    private func centerOnScreen() {
        guard let screen = NSScreen.main else { return }
        let screenFrame = screen.visibleFrame
        let windowSize = frame.size
        let x = screenFrame.midX - windowSize.width / 2
        let y = screenFrame.midY - windowSize.height / 2
        setFrameOrigin(NSPoint(x: x, y: y))
    }

    /// Intercepts Esc (key code 53) and broadcasts a cancel request;
    /// everything else falls through to the default handling.
    override public func keyDown(with event: NSEvent) {
        if event.keyCode == 53 { // Escape key
            NotificationCenter.default.post(name: .hudEscapePressed, object: nil)
            return
        }
        super.keyDown(with: event)
    }

    override public var canBecomeKey: Bool {
        return true // Allow the window to receive key events
    }
}
extension Notification.Name {
    /// Posted by `HUDWindow` when the user presses Esc while the HUD has
    /// key focus; observers cancel the active dictation session.
    static let hudEscapePressed = Notification.Name("hudEscapePressed")
}
/// SwiftUI content hosted inside `HUDWindow`: a rounded material card that
/// switches between the listening meter and a processing spinner.
struct HUDContentView: View {
    // NOTE(review): this state is mutated from outside the view hierarchy via
    // `updateState(_:)` called on the hosting view's `rootView`. Mutating
    // @State through a copied view value is fragile — confirm updates render
    // reliably, or consider driving this from an ObservableObject instead.
    @State private var currentState: HUDState = .hidden

    var body: some View {
        ZStack {
            // Card background: blurred material with a hairline border.
            RoundedRectangle(cornerRadius: 12)
                .fill(.regularMaterial)
                .overlay(
                    RoundedRectangle(cornerRadius: 12)
                        .stroke(Color.primary.opacity(0.1), lineWidth: 1)
                )
            VStack(spacing: 16) {
                switch currentState {
                case .hidden:
                    EmptyView()
                case .listening(let level):
                    listeningView(level: level)
                case .processing:
                    processingView
                }
            }
            .padding(24)
        }
        // Fixed card size matching the panel's 320x160 content rect.
        .frame(width: 320, height: 160)
    }

    /// Mic icon, "Listening..." label, live level bars, and the Esc hint.
    @ViewBuilder
    private func listeningView(level: Float) -> some View {
        VStack(spacing: 12) {
            Image(systemName: "mic.fill")
                .font(.system(size: 32))
                .foregroundColor(.blue)
            Text("Listening...")
                .font(.headline)
                .foregroundColor(.primary)
            AudioLevelView(level: level)
                .frame(height: 20)
            Text("Press Esc to cancel")
                .font(.caption)
                .foregroundColor(.secondary)
        }
    }

    /// Indeterminate spinner shown while (simulated) processing runs.
    @ViewBuilder
    private var processingView: some View {
        VStack(spacing: 12) {
            ProgressView()
                .scaleEffect(1.2)
            Text("Processing...")
                .font(.headline)
                .foregroundColor(.primary)
            Text("Please wait")
                .font(.caption)
                .foregroundColor(.secondary)
        }
    }

    /// Animates to a new HUD state; called externally by `HUDWindow`.
    func updateState(_ state: HUDState) {
        withAnimation(.easeInOut(duration: 0.3)) {
            currentState = state
        }
    }
}
/// A 20-segment level meter: segments light up left-to-right as the
/// normalized level (0...1) rises, shading green → orange → red.
struct AudioLevelView: View {
    let level: Float
    private let barCount = 20

    var body: some View {
        HStack(spacing: 2) {
            ForEach(0..<barCount, id: \.self) { barIndex in
                RoundedRectangle(cornerRadius: 1)
                    .fill(color(at: barIndex))
                    .frame(width: 12, height: height(at: barIndex))
                    .animation(.easeInOut(duration: 0.1), value: level)
            }
        }
    }

    /// Fraction of full scale that a given bar represents (0 for the first
    /// bar, 1 for the last).
    private func threshold(at barIndex: Int) -> Float {
        Float(barIndex) / Float(barCount - 1)
    }

    /// Lit bars stand tall; unlit bars collapse to a short stub.
    private func height(at barIndex: Int) -> CGFloat {
        level > threshold(at: barIndex) ? 20 : 4
    }

    /// Unlit bars are dim gray; lit bars are colored by position:
    /// green below 60%, orange below 80%, red at the top.
    private func color(at barIndex: Int) -> Color {
        let t = threshold(at: barIndex)
        guard level > t else { return .gray.opacity(0.3) }
        switch t {
        case ..<0.6:
            return .green
        case ..<0.8:
            return .orange
        default:
            return .red
        }
    }
}

View file

@ -0,0 +1,152 @@
import Foundation
import AppKit
import Carbon
import CoreUtils
/// How the global hotkey controls a dictation session.
public enum HotkeyMode: String, CaseIterable {
    /// Hold to talk: listening starts on key down and stops on key up.
    case pushToTalk = "pushToTalk"
    /// Press once to start listening, press again to stop.
    case toggle = "toggle"

    /// Localized, user-facing name for this mode.
    public var displayName: String {
        switch self {
        case .pushToTalk:
            return NSLocalizedString("hotkey.mode.push", comment: "Push-to-talk mode")
        case .toggle:
            return NSLocalizedString("hotkey.mode.toggle", comment: "Toggle mode")
        }
    }
}
/// Receives global hotkey events from `HotkeyManager`.
public protocol HotkeyManagerDelegate: AnyObject {
    /// Called on the main queue. In push-to-talk mode both key-down and
    /// key-up are delivered; in toggle mode only key-down events arrive.
    func hotkeyPressed(mode: HotkeyMode, isKeyDown: Bool)
}
/// Registers and owns the app-global hotkey (default ⌘⇧V) via the Carbon
/// hotkey API and forwards key-down/up events to its delegate according to
/// the active `HotkeyMode`.
public class HotkeyManager: ObservableObject {
    private let logger = Logger(category: "HotkeyManager")

    public weak var delegate: HotkeyManagerDelegate?

    @Published public var currentMode: HotkeyMode = .toggle
    @Published public var isEnabled: Bool = false

    private var hotKeyRef: EventHotKeyRef?
    private var eventHandler: EventHandlerRef?

    // Default hotkey: ⌘⇧V (configurable in a later phase).
    private let defaultKeyCode: UInt32 = 9 // V key
    // Modifier masks are bit flags; combine with bitwise OR, not addition.
    private let defaultModifiers: UInt32 = UInt32(cmdKey | shiftKey)

    public init() {
        setupEventHandler()
    }

    deinit {
        unregisterHotkey()
        if let handler = eventHandler {
            RemoveEventHandler(handler)
        }
    }

    /// Registers the global hotkey with the system. No-op when already enabled.
    public func enableHotkey() {
        guard !isEnabled else { return }
        logger.info("Enabling global hotkey")
        let hotKeyID = EventHotKeyID(signature: OSType(0x4D575350), id: 1) // 'MWSP'
        let status = RegisterEventHotKey(
            defaultKeyCode,
            defaultModifiers,
            hotKeyID,
            GetApplicationEventTarget(),
            0,
            &hotKeyRef
        )
        if status == noErr {
            isEnabled = true
            logger.info("Global hotkey registered successfully")
        } else {
            // Don't keep a possibly-invalid ref around after a failure.
            hotKeyRef = nil
            logger.error("Failed to register global hotkey: \(status)")
        }
    }

    /// Unregisters the hotkey. No-op when not enabled.
    public func disableHotkey() {
        guard isEnabled else { return }
        logger.info("Disabling global hotkey")
        unregisterHotkey()
        isEnabled = false
    }

    private func unregisterHotkey() {
        if let hotKeyRef = hotKeyRef {
            UnregisterEventHotKey(hotKeyRef)
            self.hotKeyRef = nil
        }
    }

    /// Installs a Carbon event handler for hotkey pressed/released events.
    /// The C callback receives `self` through an unretained opaque pointer;
    /// this is safe because deinit removes the handler before deallocation.
    private func setupEventHandler() {
        let eventTypes: [EventTypeSpec] = [
            EventTypeSpec(eventClass: OSType(kEventClassKeyboard), eventKind: OSType(kEventHotKeyPressed)),
            EventTypeSpec(eventClass: OSType(kEventClassKeyboard), eventKind: OSType(kEventHotKeyReleased))
        ]
        let callback: EventHandlerProcPtr = { (nextHandler, theEvent, userData) -> OSStatus in
            guard let userData = userData else { return OSStatus(eventNotHandledErr) }
            let manager = Unmanaged<HotkeyManager>.fromOpaque(userData).takeUnretainedValue()
            var hotKeyID = EventHotKeyID()
            let status = GetEventParameter(
                theEvent,
                OSType(kEventParamDirectObject),
                OSType(typeEventHotKeyID),
                nil,
                MemoryLayout<EventHotKeyID>.size,
                nil,
                &hotKeyID
            )
            guard status == noErr else { return OSStatus(eventNotHandledErr) }
            let eventKind = GetEventKind(theEvent)
            let isKeyDown = eventKind == OSType(kEventHotKeyPressed)
            // Hop to the main queue before touching app state.
            DispatchQueue.main.async {
                manager.handleHotkeyEvent(isKeyDown: isKeyDown)
            }
            return noErr
        }
        let selfPtr = Unmanaged.passUnretained(self).toOpaque()
        let status = InstallEventHandler(
            GetApplicationEventTarget(),
            callback,
            2,
            eventTypes,
            selfPtr,
            &eventHandler
        )
        if status != noErr {
            logger.error("Failed to install event handler: \(status)")
        }
    }

    /// Filters raw key events per mode before notifying the delegate.
    private func handleHotkeyEvent(isKeyDown: Bool) {
        logger.debug("Hotkey event: \(isKeyDown ? "down" : "up"), mode: \(currentMode)")
        switch currentMode {
        case .pushToTalk:
            // In push-to-talk mode, respond to both key down and up
            delegate?.hotkeyPressed(mode: currentMode, isKeyDown: isKeyDown)
        case .toggle:
            // In toggle mode, only respond to key down
            if isKeyDown {
                delegate?.hotkeyPressed(mode: currentMode, isKeyDown: true)
            }
        }
    }
}

View file

@ -0,0 +1,64 @@
import SwiftUI
import CoreUtils
/// App entry point. Menu-Whisper is a menu-bar-only app (LSUIElement) whose
/// UI lives in a `MenuBarExtra`.
@main
struct MenuWhisperApp: App {
    @StateObject private var appController: AppController

    init() {
        // Start the controller at launch. Calling `start()` from `.onAppear`
        // on the MenuBarExtra content would defer startup (hotkey
        // registration, permission checks) until the user first opens the
        // menu, leaving the hotkey dead at launch.
        let controller = AppController()
        controller.start()
        _appController = StateObject(wrappedValue: controller)
    }

    var body: some Scene {
        MenuBarExtra("Menu-Whisper", systemImage: "mic") {
            MenuBarContentView()
                .environmentObject(appController)
        }
    }
}
/// Dropdown content for the menu bar item: shows the current app state,
/// a contextual hint while listening, and the Preferences/Quit actions.
struct MenuBarContentView: View {
    @EnvironmentObject var appController: AppController

    var body: some View {
        VStack(alignment: .leading, spacing: 4) {
            Text("Menu-Whisper")
                .font(.headline)

            // Current state, tinted to match its meaning.
            Text(appController.currentState.displayName)
                .font(.subheadline)
                .foregroundColor(stateColor)

            // Hint shown only during an active listening session.
            if appController.currentState == .listening {
                Text("Press ⌘⇧V or Esc to stop")
                    .font(.caption)
                    .foregroundColor(.secondary)
            }

            Divider()

            Button("Preferences...") {
                // TODO: Open preferences window in Phase 4
            }
            Button("Quit") {
                NSApplication.shared.terminate(nil)
            }
        }
        .padding(.horizontal, 4)
    }

    /// One tint per app state for the status label.
    private var stateColor: Color {
        switch appController.currentState {
        case .idle: return .primary
        case .listening: return .blue
        case .processing: return .orange
        case .injecting: return .green
        case .error: return .red
        }
    }
}

View file

@ -0,0 +1,51 @@
import Foundation
import AVFoundation
import AppKit
import CoreUtils
/// Plays short UI feedback sounds for dictation start/stop/error events.
/// Uses named system sounds for now; custom bundled files may replace them
/// in a later version.
public class SoundManager: ObservableObject {
    private let logger = Logger(category: "SoundManager")

    /// Master switch for the start/stop cues. The error cue always plays.
    @Published public var soundsEnabled: Bool = true

    // Resolve the named system sounds once instead of performing a bundle
    // lookup on every playback. (The previously-declared AVAudioPlayer
    // properties were unused and have been removed.)
    private let startSound = NSSound(named: "Glass")
    private let stopSound = NSSound(named: "Blow")
    private let errorSound = NSSound(named: "Funk")

    public init() {
        setupSounds()
    }

    private func setupSounds() {
        // Use system sounds for now
        // In a future version, we could bundle custom sound files
        setupSystemSounds()
    }

    private func setupSystemSounds() {
        logger.info("Sound manager initialized with system sounds")
    }

    /// Subtle cue when listening begins. No-op while sounds are disabled.
    public func playStartSound() {
        guard soundsEnabled else { return }
        logger.debug("Playing start sound")
        play(startSound)
    }

    /// Distinct cue when listening ends. No-op while sounds are disabled.
    public func playStopSound() {
        guard soundsEnabled else { return }
        logger.debug("Playing stop sound")
        play(stopSound)
    }

    /// Error cue; intentionally ignores `soundsEnabled`.
    public func playErrorSound() {
        logger.debug("Playing error sound")
        play(errorSound)
    }

    /// Restarts the sound if a previous playback is still running, since
    /// `NSSound.play()` is a no-op on an already-playing instance.
    private func play(_ sound: NSSound?) {
        guard let sound = sound else { return }
        if sound.isPlaying {
            sound.stop()
        }
        sound.play()
    }
}

View file

@ -1,18 +0,0 @@
import SwiftUI
@main
struct MenuWhisperApp: App {
var body: some Scene {
MenuBarExtra("Menu-Whisper", systemImage: "mic") {
Text("Menu-Whisper")
Text("Idle")
Divider()
Button("Preferences...") {
// TODO: Open preferences
}
Button("Quit") {
NSApplication.shared.terminate(nil)
}
}
}
}

View file

@ -12,6 +12,17 @@ public protocol AudioEngineDelegate: AnyObject {
public class AudioEngine: ObservableObject { public class AudioEngine: ObservableObject {
private let logger = Logger(category: "AudioEngine") private let logger = Logger(category: "AudioEngine")
private let audioEngine = AVAudioEngine() private let audioEngine = AVAudioEngine()
private let inputNode: AVAudioInputNode
private let mixerNode = AVAudioMixerNode()
// Audio format for 16 kHz mono PCM
private let targetFormat = AVAudioFormat(commonFormat: .pcmFormatInt16,
sampleRate: 16000,
channels: 1,
interleaved: false)!
private var capturedData = Data()
private let captureQueue = DispatchQueue(label: "com.menuwhisper.audio.capture", qos: .userInitiated)
public weak var delegate: AudioEngineDelegate? public weak var delegate: AudioEngineDelegate?
@ -19,24 +30,178 @@ public class AudioEngine: ObservableObject {
@Published public private(set) var currentLevel: Float = 0.0 @Published public private(set) var currentLevel: Float = 0.0
public init() { public init() {
// Audio engine initialization will be completed in Phase 1 inputNode = audioEngine.inputNode
setupAudioEngine()
}
deinit {
stopCapture()
}
private func setupAudioEngine() {
// Attach mixer node
audioEngine.attach(mixerNode)
// Get the input format from the microphone
let inputFormat = inputNode.inputFormat(forBus: 0)
logger.info("Input format: \(inputFormat)")
// Connect input node to mixer
audioEngine.connect(inputNode, to: mixerNode, format: inputFormat)
} }
public func startCapture() throws { public func startCapture() throws {
logger.info("Starting audio capture") logger.info("Starting audio capture")
// TODO: Implement in Phase 1
guard !isCapturing else {
logger.warning("Audio capture already in progress")
return
}
// Reset captured data
captureQueue.async {
self.capturedData = Data()
}
// Install tap on the mixer node to capture audio
let inputFormat = inputNode.inputFormat(forBus: 0)
inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { [weak self] buffer, time in
self?.processAudioBuffer(buffer)
}
do {
try audioEngine.start()
isCapturing = true isCapturing = true
logger.info("Audio engine started successfully")
delegate?.audioEngineDidStartCapture(self) delegate?.audioEngineDidStartCapture(self)
} catch {
logger.error("Failed to start audio engine: \(error)")
inputNode.removeTap(onBus: 0)
throw error
}
} }
public func stopCapture() { public func stopCapture() {
logger.info("Stopping audio capture") logger.info("Stopping audio capture")
// TODO: Implement in Phase 1
guard isCapturing else {
logger.warning("Audio capture not in progress")
return
}
// Remove tap and stop engine
inputNode.removeTap(onBus: 0)
audioEngine.stop()
isCapturing = false isCapturing = false
currentLevel = 0.0
// Send final captured data to delegate
captureQueue.async {
if !self.capturedData.isEmpty {
DispatchQueue.main.async {
self.delegate?.audioEngine(self, didCaptureAudio: self.capturedData)
}
}
}
delegate?.audioEngineDidStopCapture(self) delegate?.audioEngineDidStopCapture(self)
logger.info("Audio capture stopped")
} }
private func processAudioBuffer(_ buffer: AVAudioPCMBuffer) { private func processAudioBuffer(_ buffer: AVAudioPCMBuffer) {
// TODO: Implement RMS calculation and audio processing in Phase 1 // Calculate RMS level for visualization
let level = calculateRMS(buffer: buffer)
DispatchQueue.main.async {
self.currentLevel = level
self.delegate?.audioEngine(self, didUpdateLevel: level)
}
// Convert to target format (16 kHz mono) if needed
if let convertedBuffer = convertBufferToTargetFormat(buffer) {
captureQueue.async {
self.appendAudioData(from: convertedBuffer)
}
}
}
private func calculateRMS(buffer: AVAudioPCMBuffer) -> Float {
guard let channelData = buffer.floatChannelData,
buffer.frameLength > 0 else {
return 0.0
}
let frameLength = Int(buffer.frameLength)
let samples = channelData[0] // Use first channel
var sum: Float = 0.0
for i in 0..<frameLength {
sum += samples[i] * samples[i]
}
let rms = sqrt(sum / Float(frameLength))
// Convert to dB and normalize to 0-1 range
let db = 20 * log10(max(rms, 0.00001)) // Avoid log(0)
let normalizedLevel = max(0, min(1, (db + 60) / 60)) // Map -60dB to 0dB -> 0 to 1
return normalizedLevel
}
private func convertBufferToTargetFormat(_ inputBuffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
let inputFormat = inputBuffer.format
// If already in target format, return as-is
if inputFormat.sampleRate == targetFormat.sampleRate &&
inputFormat.channelCount == targetFormat.channelCount {
return inputBuffer
}
// Create converter
guard let converter = AVAudioConverter(from: inputFormat, to: targetFormat) else {
logger.error("Failed to create audio converter")
return nil
}
// Calculate output frame capacity
let inputFrameCount = inputBuffer.frameLength
let outputFrameCapacity = AVAudioFrameCount(Double(inputFrameCount) * targetFormat.sampleRate / inputFormat.sampleRate)
// Create output buffer
guard let outputBuffer = AVAudioPCMBuffer(pcmFormat: targetFormat, frameCapacity: outputFrameCapacity) else {
logger.error("Failed to create output buffer")
return nil
}
var error: NSError?
let inputBlock: AVAudioConverterInputBlock = { inNumPackets, outStatus in
outStatus.pointee = .haveData
return inputBuffer
}
converter.convert(to: outputBuffer, error: &error, withInputFrom: inputBlock)
if let error = error {
logger.error("Audio conversion failed: \(error)")
return nil
}
return outputBuffer
}
private func appendAudioData(from buffer: AVAudioPCMBuffer) {
guard let channelData = buffer.int16ChannelData,
buffer.frameLength > 0 else {
return
}
let frameLength = Int(buffer.frameLength)
let samples = channelData[0]
// Convert Int16 samples to Data
let data = Data(bytes: samples, count: frameLength * MemoryLayout<Int16>.size)
capturedData.append(data)
} }
} }

View file

@ -50,16 +50,58 @@ public class PermissionManager: ObservableObject {
} }
} }
public func requestMicrophonePermission(completion: @escaping (PermissionStatus) -> Void) {
logger.info("Requesting microphone permission")
switch AVCaptureDevice.authorizationStatus(for: .audio) {
case .authorized:
completion(.granted)
case .denied, .restricted:
completion(.denied)
case .notDetermined:
AVCaptureDevice.requestAccess(for: .audio) { granted in
let status: PermissionStatus = granted ? .granted : .denied
Task { @MainActor in
self.microphoneStatus = status
}
completion(status)
}
@unknown default:
completion(.notDetermined)
}
}
public func requestAccessibilityPermission() { public func requestAccessibilityPermission() {
logger.info("Requesting accessibility permission") logger.info("Requesting accessibility permission")
// TODO: Implement accessibility permission request in Phase 1
// This typically involves guiding the user to System Settings if !AXIsProcessTrusted() {
logger.info("Accessibility permission not granted, opening System Settings")
openSystemSettings(for: .accessibility)
} else {
logger.info("Accessibility permission already granted")
accessibilityStatus = .granted
}
} }
public func requestInputMonitoringPermission() { public func requestInputMonitoringPermission() {
logger.info("Requesting input monitoring permission") logger.info("Requesting input monitoring permission")
// TODO: Implement input monitoring permission request in Phase 1
// This typically involves guiding the user to System Settings // For input monitoring, we can try to detect it by attempting to create a CGEvent
// If it fails, we likely need permission
let testEvent = CGEvent(keyboardEventSource: nil, virtualKey: 0, keyDown: true)
if testEvent == nil {
logger.info("Input monitoring permission likely not granted, opening System Settings")
openSystemSettings(for: .inputMonitoring)
} else {
logger.info("Input monitoring permission appears to be granted")
inputMonitoringStatus = .granted
}
}
public func checkAllPermissions() {
logger.info("Checking all permissions")
refreshAllPermissions()
} }
public func openSystemSettings(for permission: PermissionType) { public func openSystemSettings(for permission: PermissionType) {
@ -100,12 +142,21 @@ public class PermissionManager: ObservableObject {
} }
private func refreshAccessibilityPermission() { private func refreshAccessibilityPermission() {
// TODO: Implement accessibility permission check in Phase 1 if AXIsProcessTrusted() {
accessibilityStatus = .notDetermined accessibilityStatus = .granted
} else {
accessibilityStatus = .denied
}
} }
private func refreshInputMonitoringPermission() { private func refreshInputMonitoringPermission() {
// TODO: Implement input monitoring permission check in Phase 1 // Test if we can create CGEvents (requires Input Monitoring permission)
inputMonitoringStatus = .notDetermined let testEvent = CGEvent(keyboardEventSource: nil, virtualKey: 0, keyDown: true)
if testEvent != nil {
inputMonitoringStatus = .granted
} else {
inputMonitoringStatus = .denied
}
} }
} }

38
TODO.md
View file

@ -54,27 +54,27 @@ Conventions:
**Goal:** Listening UX without real STT. **Goal:** Listening UX without real STT.
### Tasks ### Tasks
- [ ] Implement **global hotkey** manager: - [x] Implement **global hotkey** manager:
- [ ] Default **⌘⇧V** (configurable later). - [x] Default **⌘⇧V** (configurable later).
- [ ] Support **push-to-talk** (start on key down, stop on key up). - [x] Support **push-to-talk** (start on key down, stop on key up).
- [ ] Support **toggle** (press to start, press to stop). - [x] Support **toggle** (press to start, press to stop).
- [ ] Create **HUD** as non-activating centered `NSPanel`: - [x] Create **HUD** as non-activating centered `NSPanel`:
- [ ] State **Listening** with **RMS/peak bars** animation (SwiftUI view). - [x] State **Listening** with **RMS/peak bars** animation (SwiftUI view).
- [ ] State **Processing** with spinner/label. - [x] State **Processing** with spinner/label.
- [ ] Dismiss/cancel with **Esc**. - [x] Dismiss/cancel with **Esc**.
- [ ] Implement **AVAudioEngine** capture: - [x] Implement **AVAudioEngine** capture:
- [ ] Tap on input bus; compute RMS/peak for visualization. - [x] Tap on input bus; compute RMS/peak for visualization.
- [ ] Resample path ready for 16 kHz mono PCM (no STT yet). - [x] Resample path ready for 16 kHz mono PCM (no STT yet).
- [ ] Add dictation **time limit** (default **10 min**, configurable later). - [x] Add dictation **time limit** (default **10 min**, configurable later).
- [ ] Optional **sounds** for start/stop (toggle in settings later). - [x] Optional **sounds** for start/stop (toggle in settings later).
- [ ] Permissions onboarding: - [x] Permissions onboarding:
- [ ] Request **Microphone** permission with Info.plist string. - [x] Request **Microphone** permission with Info.plist string.
- [ ] Show guide for **Accessibility** and **Input Monitoring** (no hard gating yet). - [x] Show guide for **Accessibility** and **Input Monitoring** (no hard gating yet).
### AC ### AC
- [ ] Hotkey works in both modes (push/toggle) across desktop & full-screen apps. - [x] Hotkey works in both modes (push/toggle) across desktop & full-screen apps.
- [ ] HUD appears centered; **Listening** shows live bars; **Processing** shows spinner. - [x] HUD appears centered; **Listening** shows live bars; **Processing** shows spinner.
- [ ] Cancel (Esc) reliably stops listening and hides HUD. - [x] Cancel (Esc) reliably stops listening and hides HUD.
--- ---