Initial commit

Felipe M 2025-09-18 19:56:06 +02:00
commit 1db16227b2
Signed by: fmartingr
GPG key ID: CCFBC5637D4000A8
31 changed files with 2175 additions and 0 deletions

57
.github/workflows/build.yml vendored Normal file

@@ -0,0 +1,57 @@
name: Build
on:
push:
branches: [ main, develop ]
pull_request:
branches: [ main ]
jobs:
build:
name: Build Menu-Whisper
runs-on: macos-13
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Select Xcode version
run: sudo xcode-select -s /Applications/Xcode_15.0.app/Contents/Developer
- name: Show Xcode version
run: xcodebuild -version
- name: Show Swift version
run: swift --version
- name: Cache Swift Package Manager
uses: actions/cache@v3
with:
path: .build
key: ${{ runner.os }}-spm-${{ hashFiles('**/Package.resolved') }}
restore-keys: |
${{ runner.os }}-spm-
- name: Build with Swift Package Manager
run: swift build -c release
- name: Run tests
run: swift test
- name: Check code formatting (SwiftFormat)
run: |
# Install SwiftFormat if available
if command -v swiftformat >/dev/null 2>&1; then
swiftformat --lint .
else
echo "SwiftFormat not available, skipping format check"
fi
- name: Run SwiftLint
run: |
# Install SwiftLint if available
if command -v swiftlint >/dev/null 2>&1; then
swiftlint
else
echo "SwiftLint not available, skipping lint check"
fi

37
.gitignore vendored Normal file

@@ -0,0 +1,37 @@
# Build artifacts
.build/
build/
DerivedData/
# Swift Package Manager
.swiftpm/
Package.resolved
# Xcode
*.xcodeproj/
*.xcworkspace/
xcuserdata/
*.xccheckout
*.moved-aside
# macOS
.DS_Store
.AppleDouble
.LSOverride
# Temporary files
*.tmp
*.swp
*.swo
*~
# Logs
*.log
# IDE
.vscode/
.idea/
# Environment
.env
.env.local

41
.swiftformat Normal file

@@ -0,0 +1,41 @@
# SwiftFormat Configuration for Menu-Whisper
# Indentation
--indent 4
--indentcase false
--smarttabs enabled
# Spacing
--spaces 4
--trailingspace ignore
--semicolons never
# Line breaks
--maxwidth 120
--linebreaks lf
--wraparguments preserve
--wraptypealiases preserve
--wrapparameters preserve
--wrapcollections preserve
# Braces
--allman false
--elseposition same-line
# Comments
--stripunusedargs closure-only
# Imports
--importgrouping testable-bottom
# Other formatting options
--redundanttype inferred
--closingparen balanced
--commas inline
--trimwhitespace always
--insertlines enabled
--removelines enabled
--emptybraces no-space
# Disable certain rules that conflict with team preferences
--disable redundantSelf

92
.swiftlint.yml Normal file

@@ -0,0 +1,92 @@
# SwiftLint Configuration for Menu-Whisper
# Paths to include/exclude
included:
- Sources
- Tests
excluded:
- Pods
- .build
- DerivedData
# Rules configuration
disabled_rules:
- trailing_comma
- todo
- force_cast
- force_try
opt_in_rules:
- empty_count
- empty_string
- contains_over_first_not_nil
- closure_spacing
- multiline_function_chains
- multiline_literal_brackets
- multiline_parameters
- operator_usage_whitespace
- overridden_super_call
- private_outlet
- prohibited_super_call
- redundant_nil_coalescing
- switch_case_alignment
- unneeded_parentheses_in_closure_argument
- vertical_parameter_alignment_on_call
# Line length
line_length:
warning: 120
error: 140
# File length
file_length:
warning: 500
error: 800
# Function length
function_body_length:
warning: 50
error: 100
# Type body length
type_body_length:
warning: 300
error: 400
# Cyclomatic complexity
cyclomatic_complexity:
warning: 10
error: 20
# Nesting depth
nesting:
type_level: 3
statement_level: 5
# Large tuple
large_tuple:
warning: 3
error: 4
# Identifier names
identifier_name:
min_length:
warning: 2
excluded:
- id
- URL
- url
- x
- y
- z
# Custom rules
custom_rules:
no_print:
name: "No Print Statements"
regex: "print\\("
message: "Use Logger instead of print statements"
severity: warning
reporter: "xcode"

243
Docs/ARCHITECTURE.md Normal file

@@ -0,0 +1,243 @@
# Architecture — Menu-Whisper
This document describes the high-level architecture and module organization for Menu-Whisper, a macOS offline speech-to-text application.
## Overview
Menu-Whisper follows a modular architecture with clear separation of concerns between UI, audio processing, speech recognition, text injection, and system integration components.
## System Architecture
```
┌─────────────────────────────────────────────────────────┐
│ App Layer │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
│ │ MenuBarExtra │ │ HUD Panel │ │ Preferences │ │
│ │ (SwiftUI) │ │ (SwiftUI) │ │ (SwiftUI) │ │
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│ Core Modules │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
│ │ Audio │ │ STT │ │ Injection │ │
│ │ AVAudioEngine │ │ whisper.cpp │ │ Clipboard │ │
│ │ RMS/Peak │ │ Core ML │ │ Typing │ │
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
│ │ Models │ │ Permissions │ │ Settings │ │
│ │ Management │ │ Microphone │ │ UserDefaults│ │
│ │ Downloads │ │ Accessibility │ │ JSON Export │ │
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│ System Integration │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
│ │ Global Hotkeys │ │ Secure Input │ │ Utils │ │
│ │ Carbon │ │ Detection │ │ Helpers │ │
│ │ RegisterHotKey │ │ CGEvent API │ │ │ │
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────┘
```
## Module Descriptions
### App Layer
- **MenuBarExtra**: SwiftUI-based menu bar interface using `MenuBarExtra` for macOS 13+
- **HUD Panel**: Non-activating NSPanel for "Listening" and "Processing" states
- **Preferences**: Settings window with model management, hotkey configuration, etc.
### Core Modules
#### Core/Audio
**Purpose**: Audio capture and real-time processing
- AVAudioEngine integration for microphone input
- Real-time RMS/peak computation for visual feedback
- Audio format conversion (16kHz mono PCM for STT)
- Dictation time limits and session management
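A minimal sketch of the metering step described above (not the shipped implementation), computing RMS and peak per buffer with Accelerate, assuming Float32 non-interleaved input:

```swift
import AVFoundation
import Accelerate

// Sketch: per-buffer RMS/peak values to drive the HUD bars.
func levels(for buffer: AVAudioPCMBuffer) -> (rms: Float, peak: Float) {
    guard let samples = buffer.floatChannelData?[0] else { return (0, 0) }
    let count = vDSP_Length(buffer.frameLength)
    var rms: Float = 0
    var peak: Float = 0
    vDSP_rmsqv(samples, 1, &rms, count)   // root-mean-square level
    vDSP_maxmgv(samples, 1, &peak, count) // peak magnitude
    return (rms, peak)
}
```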
#### Core/STT
**Purpose**: Speech-to-text processing with multiple backends
- **WhisperCPP**: Primary backend using whisper.cpp with Metal acceleration
- **CoreML**: Future backend for Core ML models (Phase 6)
- `STTEngine` protocol for backend abstraction
- Language detection and text normalization
#### Core/Models
**Purpose**: Model catalog, downloads, and management
- Curated model catalog (JSON-based)
- Download management with progress tracking
- SHA256 verification and integrity checks
- Local storage in `~/Library/Application Support/MenuWhisper/Models`
- Model selection and metadata management
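A sketch of the SHA256 integrity check above (function name illustrative), streaming the file through CryptoKit in chunks so multi-GB models never load fully into memory:

```swift
import CryptoKit
import Foundation

// Sketch: hash the downloaded model in 1 MiB chunks and compare with the catalog digest.
func verifySHA256(of fileURL: URL, expectedHex: String) throws -> Bool {
    let handle = try FileHandle(forReadingFrom: fileURL)
    defer { try? handle.close() }
    var hasher = SHA256()
    while let chunk = try handle.read(upToCount: 1 << 20), !chunk.isEmpty {
        hasher.update(data: chunk)
    }
    let digest = hasher.finalize().map { String(format: "%02x", $0) }.joined()
    return digest == expectedHex.lowercased()
}
```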
#### Core/Injection
**Purpose**: Text insertion into focused applications
- Clipboard-based insertion (preferred method)
- Character-by-character typing fallback
- Secure Input detection and handling
- Cross-application compatibility layer
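A hedged sketch of the preferred clipboard path (the real injection module may differ): put the transcript on the pasteboard, then synthesize ⌘V with CGEvent, which requires Accessibility permission:

```swift
import AppKit
import Carbon.HIToolbox

// Sketch: copy the transcript, then post ⌘V key events to the focused app.
func pasteIntoFocusedApp(_ text: String) {
    let pasteboard = NSPasteboard.general
    pasteboard.clearContents()
    pasteboard.setString(text, forType: .string)

    let source = CGEventSource(stateID: .combinedSessionState)
    let vKey = CGKeyCode(kVK_ANSI_V)
    let keyDown = CGEvent(keyboardEventSource: source, virtualKey: vKey, keyDown: true)
    let keyUp = CGEvent(keyboardEventSource: source, virtualKey: vKey, keyDown: false)
    keyDown?.flags = .maskCommand
    keyUp?.flags = .maskCommand
    keyDown?.post(tap: .cghidEventTap)
    keyUp?.post(tap: .cghidEventTap)
}
```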
#### Core/Permissions
**Purpose**: System permission management and onboarding
- Microphone access (AVAudioSession)
- Accessibility permissions for text injection
- Input Monitoring permissions for global hotkeys
- Permission status checking and guidance flows
#### Core/Settings
**Purpose**: User preferences and configuration persistence
- UserDefaults-based storage
- JSON export/import functionality
- Settings validation and migration
- Configuration change notifications
### System Integration
#### Global Hotkeys
- Carbon framework integration (`RegisterEventHotKey`)
- Push-to-talk and toggle modes
- Hotkey conflict detection and user guidance
- Cross-application hotkey handling
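A sketch of the Carbon registration for the default ⌘⇧V (virtual keycode 9); the four-char signature is a placeholder and the handler body is simplified to a stub:

```swift
import Carbon

var hotKeyRef: EventHotKeyRef?

// Sketch: register ⌘⇧V system-wide and receive hotkey-pressed events.
func registerDefaultHotkey() {
    var eventType = EventTypeSpec(eventClass: OSType(kEventClassKeyboard),
                                  eventKind: UInt32(kEventHotKeyPressed))
    // Non-capturing C-style handler; real code would route the event into the FSM.
    InstallEventHandler(GetApplicationEventTarget(), { _, _, _ in noErr }, 1, &eventType, nil, nil)

    let hotKeyID = EventHotKeyID(signature: OSType(0x4D575748), id: 1) // placeholder signature
    RegisterEventHotKey(UInt32(kVK_ANSI_V),
                        UInt32(cmdKey | shiftKey),
                        hotKeyID,
                        GetApplicationEventTarget(),
                        0,
                        &hotKeyRef)
}
```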
#### Secure Input Detection
- `IsSecureEventInputEnabled()` monitoring
- Safe fallback behavior (clipboard-only)
- User notification for secure contexts
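The check itself is a single Carbon call; a sketch of the guard the injection layer would use:

```swift
import Carbon

// Sketch: when secure input is on (password fields, 2FA prompts),
// skip injection and leave the transcript on the clipboard.
var canInjectText: Bool { !IsSecureEventInputEnabled() }
```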
#### Utils
- Shared utilities and helper functions
- Logging infrastructure (opt-in local logs)
- Error handling and user feedback
## Data Flow
### Main Operational Flow
```
User Hotkey → Audio Capture → STT Processing → Text Injection
▲ │ │ │
│ ▼ ▼ ▼
Hotkey Mgr Audio Buffer Model Engine Injection Mgr
│ RMS/Peak whisper.cpp Clipboard/Type
│ │ │ │
▼ ▼ ▼ ▼
HUD UI Visual Feedback Processing UI Target App
```
### State Management
The application follows a finite state machine pattern:
- **Idle**: Waiting for user input
- **Listening**: Capturing audio with visual feedback
- **Processing**: Running STT inference
- **Injecting**: Inserting text into target application
- **Error**: Handling and displaying errors
## Finite State Machine
```
┌─────────────┐
│ Idle │◄─────────────┐
└─────────────┘ │
│ │
│ Hotkey Press │ Success/Error
▼ │
┌─────────────┐ │
│ Listening │ │
└─────────────┘ │
│ │
│ Stop/Timeout │
▼ │
┌─────────────┐ │
│ Processing │ │
└─────────────┘ │
│ │
│ STT Complete │
▼ │
┌─────────────┐ │
│ Injecting │──────────────┘
└─────────────┘
```
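The same transition table expressed in code, using the `AppState` enum from `CoreUtils`; the `Event` type here is illustrative, not the shipped model:

```swift
// Hypothetical event type; the real FSM may model events differently.
enum Event { case hotkey, stopOrTimeout, sttComplete, injected, failure }

func transition(from state: AppState, on event: Event) -> AppState {
    switch (state, event) {
    case (.idle, .hotkey):             return .listening
    case (.listening, .stopOrTimeout): return .processing
    case (.processing, .sttComplete):  return .injecting
    case (.injecting, .injected):      return .idle
    case (_, .failure):                return .error
    default:                           return state
    }
}
```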
## Technology Stack
### Core Technologies
- **Swift 5.9+**: Primary development language
- **SwiftUI**: User interface framework
- **AppKit**: macOS-specific UI components (NSStatusItem, NSPanel)
- **AVFoundation**: Audio capture and processing
- **Carbon**: Global hotkey registration
### External Dependencies
- **whisper.cpp**: C/C++ speech recognition engine with Metal support
- **Swift Package Manager**: Dependency management and build system
### Platform Integration
- **UserDefaults**: Settings persistence
- **NSPasteboard**: Clipboard operations
- **CGEvent**: Low-level input simulation
- **URLSession**: Model downloads
## Build System
The project uses Swift Package Manager with modular targets:
```
MenuWhisper/
├── Package.swift # SPM configuration
├── Sources/
│ ├── App/ # Main application target
│ ├── CoreAudio/ # Audio processing module
│ ├── CoreSTT/ # Speech-to-text engines
│ ├── CoreModels/ # Model management
│ ├── CoreInjection/ # Text insertion
│ ├── CorePermissions/ # System permissions
│ ├── CoreSettings/ # User preferences
│ └── CoreUtils/ # Shared utilities
├── Resources/ # Assets, localizations
└── Tests/ # Unit and integration tests
```
## Security Considerations
### Privacy
- All audio processing occurs locally
- No telemetry or data collection
- Optional local logging with user consent
### System Security
- Respects Secure Input contexts
- Requires explicit user permission grants
- Code signing and notarization for distribution
### Input Safety
- Validates all user inputs
- Safe handling of special characters in typing mode
- Proper escaping for different keyboard layouts
## Performance Characteristics
### Target Metrics
- **Latency**: <4s additional processing time for 10s audio (M1 + small model)
- **Memory**: ~1.5-2.5GB with small model
- **Model Loading**: Lazy loading with warm cache
- **UI Responsiveness**: Non-blocking background processing
### Optimization Strategies
- Metal acceleration for STT inference
- Efficient audio buffering and streaming
- Model reuse across dictation sessions
- Configurable threading for CPU-intensive operations
## Future Extensibility
The modular architecture supports future enhancements:
- Additional STT backends (Core ML, cloud services)
- Voice Activity Detection (VAD)
- Advanced audio preprocessing
- Custom insertion rules per application
- Plugin architecture for text processing
This architecture provides a solid foundation for the MVP while maintaining flexibility for future feature additions and platform evolution.

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Menu-Whisper
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

123
Package.swift Normal file

@@ -0,0 +1,123 @@
// swift-tools-version: 5.9
import PackageDescription
let package = Package(
name: "MenuWhisper",
platforms: [
.macOS(.v13)
],
products: [
.executable(
name: "MenuWhisper",
targets: ["App"]
)
],
dependencies: [
// Add external dependencies here as needed
// Example: .package(url: "...", from: "1.0.0")
],
targets: [
// Main Application Target
.executableTarget(
name: "App",
dependencies: [
"MenuWhisperAudio",
"CoreSTT",
"CoreModels",
"CoreInjection",
"CorePermissions",
"CoreSettings",
"CoreUtils"
],
path: "Sources/App",
resources: [
.copy("../../Resources")
]
),
// Core Module Targets
.target(
name: "MenuWhisperAudio",
dependencies: ["CoreUtils"],
path: "Sources/CoreAudio"
),
.target(
name: "CoreSTT",
dependencies: ["CoreUtils", "CoreModels", "MenuWhisperAudio"],
path: "Sources/CoreSTT"
),
.target(
name: "CoreModels",
dependencies: ["CoreUtils"],
path: "Sources/CoreModels"
),
.target(
name: "CoreInjection",
dependencies: ["CoreUtils"],
path: "Sources/CoreInjection"
),
.target(
name: "CorePermissions",
dependencies: ["CoreUtils"],
path: "Sources/CorePermissions"
),
.target(
name: "CoreSettings",
dependencies: ["CoreUtils"],
path: "Sources/CoreSettings"
),
.target(
name: "CoreUtils",
path: "Sources/CoreUtils"
),
// Test Targets
.testTarget(
name: "MenuWhisperAudioTests",
dependencies: ["MenuWhisperAudio"],
path: "Tests/CoreAudioTests"
),
.testTarget(
name: "CoreSTTTests",
dependencies: ["CoreSTT"],
path: "Tests/CoreSTTTests"
),
.testTarget(
name: "CoreModelsTests",
dependencies: ["CoreModels"],
path: "Tests/CoreModelsTests"
),
.testTarget(
name: "CoreInjectionTests",
dependencies: ["CoreInjection"],
path: "Tests/CoreInjectionTests"
),
.testTarget(
name: "CorePermissionsTests",
dependencies: ["CorePermissions"],
path: "Tests/CorePermissionsTests"
),
.testTarget(
name: "CoreSettingsTests",
dependencies: ["CoreSettings"],
path: "Tests/CoreSettingsTests"
),
.testTarget(
name: "CoreUtilsTests",
dependencies: ["CoreUtils"],
path: "Tests/CoreUtilsTests"
)
]
)

96
README.md Normal file

@@ -0,0 +1,96 @@
# Menu-Whisper
A macOS menu bar application that provides offline speech-to-text transcription using Whisper-family models and automatically inserts the transcribed text into the currently focused application.
## Overview
Menu-Whisper is designed to be a privacy-focused, offline-first speech recognition tool for macOS. It runs entirely locally on Apple Silicon machines, requiring no internet connection during normal operation (only for initial model downloads).
### Key Features
- **100% Offline Operation**: Audio and text never leave your device
- **Apple Silicon Optimized**: Built specifically for M1/M2/M3 processors with Metal acceleration
- **Global Hotkey Support**: Default ⌘⇧V (configurable)
- **Smart Text Insertion**: Clipboard paste with typing fallback
- **Secure Input Detection**: Respects password fields and secure contexts
- **Multiple Models**: Support for various Whisper model sizes and variants
- **Multilingual**: Spanish and English interface and recognition
## Requirements
- **macOS**: 13.0 (Ventura) or later
- **Hardware**: Apple Silicon (M1, M2, or M3 processor) - Intel Macs are not supported
- **Xcode**: 15.0+ for building from source
- **Permissions**: Microphone, Accessibility, and Input Monitoring access
## Build Requirements
### Development Environment
- macOS 13+ with Xcode 15.0+
- Swift 5.9+
- Swift Package Manager (included with Xcode)
### System Dependencies
- AVFoundation framework (audio capture)
- Carbon framework (global hotkeys)
- AppKit/SwiftUI (UI components)
### Third-party Dependencies
- whisper.cpp (C/C++ library for speech recognition with Metal support)
## Installation
**Note**: This project is currently in development. Pre-built binaries will be available as signed and notarized .dmg files once complete.
### Building from Source
1. Clone the repository:
```bash
git clone <repository-url>
cd tellme
```
2. Open the project in Xcode or use Swift Package Manager:
```bash
swift build -c release
```
3. For development, open `Package.swift` in Xcode.
## Architecture
The application is structured with modular components:
- **App**: SwiftUI interface with AppKit bridges
- **Core/Audio**: AVAudioEngine capture and processing
- **Core/STT**: Speech-to-text engines (whisper.cpp, future Core ML)
- **Core/Models**: Model management and downloads
- **Core/Injection**: Text insertion with secure input handling
- **Core/Permissions**: System permission management
- **Core/Settings**: User preferences and configuration
## Privacy & Security
- **No Telemetry**: Zero data collection or remote analytics
- **Local Processing**: All audio processing happens on-device
- **Secure Input Respect**: Automatically detects and respects secure input contexts
- **Permission-Based**: Requires explicit user consent for system access
## Development Status
This project is currently in active development following a phased approach:
- Phase 0: Project scaffolding ⬅️ **Current**
- Phase 1: Hotkey + HUD + Audio capture
- Phase 2: STT integration with whisper.cpp
- Phase 3: Text insertion system
- Phase 4: Preferences and UX polish
- Phase 5: Distribution and packaging
See `TODO.md` for detailed development progress and `TECHSPEC.md` for complete technical specifications.
## License
MIT License - see [LICENSE](LICENSE) for details.
## Contributing
This project follows a structured development approach with clear phases and acceptance criteria. Please refer to the technical specification and TODO list before contributing.

77
Resources/en.lproj/Localizable.strings Normal file

@@ -0,0 +1,77 @@
/* Menu-Whisper - English Localization */
/* General */
"app.name" = "Menu-Whisper";
"general.ok" = "OK";
"general.cancel" = "Cancel";
"general.continue" = "Continue";
"general.settings" = "Settings";
"general.quit" = "Quit";
/* Menu Bar */
"menubar.idle" = "Idle";
"menubar.listening" = "Listening";
"menubar.processing" = "Processing";
"menubar.preferences" = "Preferences...";
"menubar.quit" = "Quit Menu-Whisper";
/* HUD States */
"hud.listening" = "Listening...";
"hud.processing" = "Transcribing...";
"hud.cancel" = "Press Esc to cancel";
/* Permissions */
"permissions.microphone.title" = "Microphone Access Required";
"permissions.microphone.message" = "Menu-Whisper needs access to your microphone to perform speech-to-text transcription.";
"permissions.accessibility.title" = "Accessibility Access Required";
"permissions.accessibility.message" = "Menu-Whisper needs Accessibility access to insert transcribed text into applications.";
"permissions.input_monitoring.title" = "Input Monitoring Required";
"permissions.input_monitoring.message" = "Menu-Whisper needs Input Monitoring access to register global hotkeys.";
"permissions.open_settings" = "Open System Settings";
/* Preferences Window */
"preferences.title" = "Menu-Whisper Preferences";
"preferences.general" = "General";
"preferences.models" = "Models";
"preferences.hotkeys" = "Hotkeys";
"preferences.insertion" = "Text Insertion";
"preferences.advanced" = "Advanced";
/* General Preferences */
"preferences.general.hotkey" = "Global Hotkey:";
"preferences.general.mode" = "Activation Mode:";
"preferences.general.mode.push_to_talk" = "Push-to-talk";
"preferences.general.mode.toggle" = "Toggle";
"preferences.general.sounds" = "Play sounds for start/stop";
"preferences.general.limit" = "Dictation time limit (minutes):";
/* Model Preferences */
"preferences.models.title" = "Speech Recognition Models";
"preferences.models.active" = "Active Model:";
"preferences.models.language" = "Language:";
"preferences.models.language.auto" = "Auto-detect";
"preferences.models.download" = "Download";
"preferences.models.delete" = "Delete";
"preferences.models.size" = "Size:";
"preferences.models.languages" = "Languages:";
/* Insertion Preferences */
"preferences.insertion.method" = "Insertion Method:";
"preferences.insertion.method.paste" = "Paste (⌘V)";
"preferences.insertion.method.type" = "Type characters";
"preferences.insertion.preview" = "Show preview before inserting";
"preferences.insertion.secure_input" = "Secure Input Detected";
"preferences.insertion.secure_input.message" = "Text insertion is disabled in secure contexts. Text has been copied to clipboard.";
/* Errors */
"error.audio.failed" = "Failed to access microphone";
"error.model.not_found" = "Speech recognition model not found";
"error.model.load_failed" = "Failed to load speech recognition model";
"error.transcription.failed" = "Speech transcription failed";
"error.download.failed" = "Model download failed";
"error.download.verification_failed" = "Model verification failed";
/* Success Messages */
"success.model.downloaded" = "Model downloaded successfully";
"success.settings.exported" = "Settings exported successfully";
"success.settings.imported" = "Settings imported successfully";

77
Resources/es.lproj/Localizable.strings Normal file

@@ -0,0 +1,77 @@
/* Menu-Whisper - Spanish Localization */
/* General */
"app.name" = "Menu-Whisper";
"general.ok" = "Aceptar";
"general.cancel" = "Cancelar";
"general.continue" = "Continuar";
"general.settings" = "Configuración";
"general.quit" = "Salir";
/* Menu Bar */
"menubar.idle" = "Inactivo";
"menubar.listening" = "Escuchando";
"menubar.processing" = "Procesando";
"menubar.preferences" = "Preferencias...";
"menubar.quit" = "Salir de Menu-Whisper";
/* HUD States */
"hud.listening" = "Escuchando...";
"hud.processing" = "Transcribiendo...";
"hud.cancel" = "Presiona Esc para cancelar";
/* Permissions */
"permissions.microphone.title" = "Acceso al Micrófono Requerido";
"permissions.microphone.message" = "Menu-Whisper necesita acceso a tu micrófono para realizar la transcripción de voz a texto.";
"permissions.accessibility.title" = "Acceso de Accesibilidad Requerido";
"permissions.accessibility.message" = "Menu-Whisper necesita acceso de Accesibilidad para insertar texto transcrito en aplicaciones.";
"permissions.input_monitoring.title" = "Monitoreo de Entrada Requerido";
"permissions.input_monitoring.message" = "Menu-Whisper necesita acceso de Monitoreo de Entrada para registrar atajos de teclado globales.";
"permissions.open_settings" = "Abrir Configuración del Sistema";
/* Preferences Window */
"preferences.title" = "Preferencias de Menu-Whisper";
"preferences.general" = "General";
"preferences.models" = "Modelos";
"preferences.hotkeys" = "Atajos";
"preferences.insertion" = "Inserción de Texto";
"preferences.advanced" = "Avanzado";
/* General Preferences */
"preferences.general.hotkey" = "Atajo Global:";
"preferences.general.mode" = "Modo de Activación:";
"preferences.general.mode.push_to_talk" = "Presionar para hablar";
"preferences.general.mode.toggle" = "Alternar";
"preferences.general.sounds" = "Reproducir sonidos al iniciar/detener";
"preferences.general.limit" = "Límite de tiempo de dictado (minutos):";
/* Model Preferences */
"preferences.models.title" = "Modelos de Reconocimiento de Voz";
"preferences.models.active" = "Modelo Activo:";
"preferences.models.language" = "Idioma:";
"preferences.models.language.auto" = "Detección automática";
"preferences.models.download" = "Descargar";
"preferences.models.delete" = "Eliminar";
"preferences.models.size" = "Tamaño:";
"preferences.models.languages" = "Idiomas:";
/* Insertion Preferences */
"preferences.insertion.method" = "Método de Inserción:";
"preferences.insertion.method.paste" = "Pegar (⌘V)";
"preferences.insertion.method.type" = "Escribir caracteres";
"preferences.insertion.preview" = "Mostrar vista previa antes de insertar";
"preferences.insertion.secure_input" = "Entrada Segura Detectada";
"preferences.insertion.secure_input.message" = "La inserción de texto está deshabilitada en contextos seguros. El texto se ha copiado al portapapeles.";
/* Errors */
"error.audio.failed" = "Error al acceder al micrófono";
"error.model.not_found" = "Modelo de reconocimiento de voz no encontrado";
"error.model.load_failed" = "Error al cargar el modelo de reconocimiento de voz";
"error.transcription.failed" = "Error en la transcripción de voz";
"error.download.failed" = "Error en la descarga del modelo";
"error.download.verification_failed" = "Error en la verificación del modelo";
/* Success Messages */
"success.model.downloaded" = "Modelo descargado exitosamente";
"success.settings.exported" = "Configuración exportada exitosamente";
"success.settings.imported" = "Configuración importada exitosamente";

38
Scripts/build.sh Executable file

@@ -0,0 +1,38 @@
#!/bin/bash
# Build script for Menu-Whisper
# This script builds the project using Swift Package Manager
set -e
echo "🔨 Building Menu-Whisper..."
# Clean previous build
echo "🧹 Cleaning previous build..."
swift package clean
# Build in release mode
echo "⚡ Building in release mode..."
swift build -c release
# Run tests
echo "🧪 Running tests..."
swift test
# Check if SwiftFormat is available and run it
if command -v swiftformat >/dev/null 2>&1; then
echo "📝 Checking code formatting..."
swiftformat --lint .
else
echo "⚠️ SwiftFormat not available, skipping format check"
fi
# Check if SwiftLint is available and run it
if command -v swiftlint >/dev/null 2>&1; then
echo "🔍 Running SwiftLint..."
swiftlint
else
echo "⚠️ SwiftLint not available, skipping lint check"
fi
echo "✅ Build completed successfully!"

35
Scripts/notarize.sh Executable file

@@ -0,0 +1,35 @@
#!/bin/bash
# Notarization script for Menu-Whisper
# This is a placeholder script that will be completed in Phase 5
set -e
echo "🍎 Menu-Whisper Notarization Script"
echo "📋 This script will handle code signing and notarization for distribution"
echo ""
echo "⚠️ This is a placeholder script - implementation pending Phase 5"
echo ""
echo "📝 Steps that will be implemented:"
echo " 1. Code signing with Developer ID"
echo " 2. Creating .app bundle"
echo " 3. Notarization with Apple"
echo " 4. Stapling notarization ticket"
echo " 5. Creating .dmg for distribution"
echo ""
echo "🔧 Usage (when implemented):"
echo " ./Scripts/notarize.sh [--developer-id YOUR_TEAM_ID]"
echo ""
# Placeholder for future implementation
# TODO: Implement in Phase 5
# - Set up code signing identity
# - Configure entitlements
# - Build .app bundle
# - Submit for notarization
# - Wait for approval
# - Staple ticket
# - Create DMG
echo "❌ Not implemented yet - use in Phase 5"
exit 1

18
Sources/App/main.swift Normal file

@@ -0,0 +1,18 @@
import SwiftUI
@main
struct MenuWhisperApp: App {
var body: some Scene {
MenuBarExtra("Menu-Whisper", systemImage: "mic") {
Text("Menu-Whisper")
Text("Idle")
Divider()
Button("Preferences...") {
// TODO: Open preferences
}
Button("Quit") {
NSApplication.shared.terminate(nil)
}
}
}
}

42
Sources/CoreAudio/AudioEngine.swift Normal file

@@ -0,0 +1,42 @@
import Foundation
import AVFoundation
import CoreUtils
public protocol AudioEngineDelegate: AnyObject {
func audioEngine(_ engine: AudioEngine, didUpdateLevel level: Float)
func audioEngine(_ engine: AudioEngine, didCaptureAudio data: Data)
func audioEngineDidStartCapture(_ engine: AudioEngine)
func audioEngineDidStopCapture(_ engine: AudioEngine)
}
public class AudioEngine: ObservableObject {
private let logger = Logger(category: "AudioEngine")
private let audioEngine = AVAudioEngine()
public weak var delegate: AudioEngineDelegate?
@Published public private(set) var isCapturing = false
@Published public private(set) var currentLevel: Float = 0.0
public init() {
// Audio engine initialization will be completed in Phase 1
}
public func startCapture() throws {
logger.info("Starting audio capture")
// TODO: Implement in Phase 1
isCapturing = true
delegate?.audioEngineDidStartCapture(self)
}
public func stopCapture() {
logger.info("Stopping audio capture")
// TODO: Implement in Phase 1
isCapturing = false
delegate?.audioEngineDidStopCapture(self)
}
private func processAudioBuffer(_ buffer: AVAudioPCMBuffer) {
// TODO: Implement RMS calculation and audio processing in Phase 1
}
}

73
Sources/CoreInjection/TextInjector.swift Normal file

@@ -0,0 +1,73 @@
import Foundation
import AppKit
import CoreUtils
public enum InjectionMethod {
case paste
case typing
}
public enum InjectionError: Error, LocalizedError {
case secureInputActive
case accessibilityPermissionRequired
case injectionFailed(String)
public var errorDescription: String? {
switch self {
case .secureInputActive:
return NSLocalizedString("preferences.insertion.secure_input.message", comment: "Secure input message")
case .accessibilityPermissionRequired:
return NSLocalizedString("permissions.accessibility.message", comment: "Accessibility permission message")
case .injectionFailed(let reason):
return "Text injection failed: \(reason)"
}
}
}
public class TextInjector {
private let logger = Logger(category: "TextInjector")
public init() {}
public func injectText(_ text: String, method: InjectionMethod = .paste) throws {
logger.info("Injecting text using method: \(method)")
// Check for secure input first
if isSecureInputActive() {
// Copy to clipboard but don't inject
copyToClipboard(text)
throw InjectionError.secureInputActive
}
switch method {
case .paste:
try injectViaPaste(text)
case .typing:
try injectViaTyping(text)
}
}
private func injectViaPaste(_ text: String) throws {
logger.debug("Injecting text via paste method")
// TODO: Implement paste injection (clipboard + V) in Phase 3
copyToClipboard(text)
// TODO: Send V via CGEvent
}
private func injectViaTyping(_ text: String) throws {
logger.debug("Injecting text via typing method")
// TODO: Implement character-by-character typing via CGEvent in Phase 3
}
private func copyToClipboard(_ text: String) {
let pasteboard = NSPasteboard.general
pasteboard.clearContents()
pasteboard.setString(text, forType: .string)
logger.debug("Text copied to clipboard")
}
private func isSecureInputActive() -> Bool {
// TODO: Implement IsSecureEventInputEnabled() check in Phase 3
return false
}
}

70
Sources/CoreModels/ModelManager.swift Normal file

@@ -0,0 +1,70 @@
import Foundation
import CoreUtils
public struct ModelInfo: Codable, Identifiable {
public let id = UUID()
public let name: String
public let family: String
public let format: String
public let sizeMB: Int
public let languages: [String]
public let recommendedBackend: String
public let qualityTier: String
public let license: String
public let sha256: String
public let downloadURL: String
public let notes: String
enum CodingKeys: String, CodingKey {
case name, family, format, languages, license, sha256, notes
case sizeMB = "size_mb"
case recommendedBackend = "recommended_backend"
case qualityTier = "quality_tier"
case downloadURL = "download_url"
}
}
public class ModelManager: ObservableObject {
private let logger = Logger(category: "ModelManager")
@Published public private(set) var availableModels: [ModelInfo] = []
@Published public private(set) var downloadedModels: [ModelInfo] = []
@Published public private(set) var activeModel: ModelInfo?
private let modelsDirectory: URL
public init() {
let appSupport = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first!
modelsDirectory = appSupport.appendingPathComponent("MenuWhisper/Models")
try? FileManager.default.createDirectory(at: modelsDirectory, withIntermediateDirectories: true)
loadModelCatalog()
refreshDownloadedModels()
}
public func downloadModel(_ model: ModelInfo) async throws {
logger.info("Starting download for model: \(model.name)")
// TODO: Implement model download with progress tracking and SHA256 verification in Phase 2
}
public func deleteModel(_ model: ModelInfo) throws {
logger.info("Deleting model: \(model.name)")
// TODO: Implement model deletion in Phase 2
}
public func setActiveModel(_ model: ModelInfo) {
logger.info("Setting active model: \(model.name)")
activeModel = model
// TODO: Persist active model selection in Phase 2
}
private func loadModelCatalog() {
// TODO: Load curated model catalog from bundled JSON in Phase 2
logger.info("Loading model catalog")
}
private func refreshDownloadedModels() {
// TODO: Scan models directory and populate downloadedModels in Phase 2
logger.info("Refreshing downloaded models")
}
}

111
Sources/CorePermissions/PermissionManager.swift Normal file

@@ -0,0 +1,111 @@
import Foundation
import AVFoundation
import AppKit
import CoreUtils
public enum PermissionType: CaseIterable {
case microphone
case accessibility
case inputMonitoring
}
public enum PermissionStatus {
case notDetermined
case granted
case denied
case restricted
}
public class PermissionManager: ObservableObject {
private let logger = Logger(category: "PermissionManager")
@Published public private(set) var microphoneStatus: PermissionStatus = .notDetermined
@Published public private(set) var accessibilityStatus: PermissionStatus = .notDetermined
@Published public private(set) var inputMonitoringStatus: PermissionStatus = .notDetermined
public init() {
refreshAllPermissions()
}
public func requestMicrophonePermission() async -> PermissionStatus {
logger.info("Requesting microphone permission")
return await withCheckedContinuation { continuation in
switch AVCaptureDevice.authorizationStatus(for: .audio) {
case .authorized:
continuation.resume(returning: .granted)
case .denied, .restricted:
continuation.resume(returning: .denied)
case .notDetermined:
AVCaptureDevice.requestAccess(for: .audio) { granted in
let status: PermissionStatus = granted ? .granted : .denied
Task { @MainActor in
self.microphoneStatus = status
}
continuation.resume(returning: status)
}
@unknown default:
continuation.resume(returning: .notDetermined)
}
}
}
public func requestAccessibilityPermission() {
logger.info("Requesting accessibility permission")
// TODO: Implement accessibility permission request in Phase 1
// This typically involves guiding the user to System Settings
}
public func requestInputMonitoringPermission() {
logger.info("Requesting input monitoring permission")
// TODO: Implement input monitoring permission request in Phase 1
// This typically involves guiding the user to System Settings
}
public func openSystemSettings(for permission: PermissionType) {
logger.info("Opening system settings for permission: \(permission)")
let urlString: String
switch permission {
case .microphone:
urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_Microphone"
case .accessibility:
urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility"
case .inputMonitoring:
urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_ListenEvent"
}
if let url = URL(string: urlString) {
NSWorkspace.shared.open(url)
}
}
private func refreshAllPermissions() {
refreshMicrophonePermission()
refreshAccessibilityPermission()
refreshInputMonitoringPermission()
}
private func refreshMicrophonePermission() {
switch AVCaptureDevice.authorizationStatus(for: .audio) {
case .notDetermined:
microphoneStatus = .notDetermined
case .authorized:
microphoneStatus = .granted
case .denied, .restricted:
microphoneStatus = .denied
@unknown default:
microphoneStatus = .notDetermined
}
}
private func refreshAccessibilityPermission() {
// TODO: Implement accessibility permission check in Phase 1
accessibilityStatus = .notDetermined
}
private func refreshInputMonitoringPermission() {
// TODO: Implement input monitoring permission check in Phase 1
inputMonitoringStatus = .notDetermined
}
}

32
Sources/CoreSTT/STTEngine.swift Normal file

@@ -0,0 +1,32 @@
import Foundation
import CoreUtils
public protocol STTEngine {
func transcribe(audioData: Data, language: String?) async throws -> String
func isModelLoaded() -> Bool
func loadModel(at path: URL) async throws
func unloadModel()
}
public enum STTError: Error, LocalizedError {
case modelNotFound
case modelLoadFailed(String)
case transcriptionFailed(String)
case unsupportedFormat
case invalidAudioData
public var errorDescription: String? {
switch self {
case .modelNotFound:
return NSLocalizedString("error.model.not_found", comment: "Model not found error")
case .modelLoadFailed(let reason):
return NSLocalizedString("error.model.load_failed", comment: "Model load failed error") + ": \(reason)"
case .transcriptionFailed(let reason):
return NSLocalizedString("error.transcription.failed", comment: "Transcription failed error") + ": \(reason)"
case .unsupportedFormat:
return "Unsupported audio format"
case .invalidAudioData:
return "Invalid audio data"
}
}
}

35
Sources/CoreSTT/WhisperCPPEngine.swift Normal file

@@ -0,0 +1,35 @@
import Foundation
import CoreUtils
public class WhisperCPPEngine: STTEngine {
private let logger = Logger(category: "WhisperCPPEngine")
private var modelPath: URL?
private var isLoaded = false
public init() {
// WhisperCPP integration will be implemented in Phase 2
}
public func transcribe(audioData: Data, language: String?) async throws -> String {
logger.info("Transcribing audio data")
// TODO: Implement whisper.cpp integration in Phase 2
throw STTError.transcriptionFailed("Not implemented yet")
}
public func isModelLoaded() -> Bool {
return isLoaded
}
public func loadModel(at path: URL) async throws {
logger.info("Loading model at path: \(path.path)")
self.modelPath = path
// TODO: Implement model loading in Phase 2
isLoaded = true
}
public func unloadModel() {
logger.info("Unloading model")
modelPath = nil
isLoaded = false
}
}

149
Sources/CoreSettings/Settings.swift Normal file

@@ -0,0 +1,149 @@
import Foundation
import CoreUtils
public enum HotkeyMode: String, CaseIterable, Codable {
case pushToTalk = "push_to_talk"
case toggle = "toggle"
public var displayName: String {
switch self {
case .pushToTalk:
return NSLocalizedString("preferences.general.mode.push_to_talk", comment: "Push to talk mode")
case .toggle:
return NSLocalizedString("preferences.general.mode.toggle", comment: "Toggle mode")
}
}
}
public struct HotkeyConfig: Codable {
public let keyCode: UInt32
public let modifiers: UInt32
public init(keyCode: UInt32, modifiers: UInt32) {
self.keyCode = keyCode
self.modifiers = modifiers
}
// Default hotkey: ⌘⇧V
public static let `default` = HotkeyConfig(keyCode: 9, modifiers: 768) // keyCode 9 = V; 768 = cmdKey | shiftKey (Carbon)
}
public class Settings: ObservableObject {
private let logger = Logger(category: "Settings")
private let userDefaults = UserDefaults.standard
// General Settings
@Published public var hotkey: HotkeyConfig {
didSet { saveHotkey() }
}
@Published public var hotkeyMode: HotkeyMode {
didSet { saveHotkeyMode() }
}
@Published public var playSounds: Bool {
didSet { userDefaults.set(playSounds, forKey: "playSounds") }
}
@Published public var dictationTimeLimit: TimeInterval {
didSet { userDefaults.set(dictationTimeLimit, forKey: "dictationTimeLimit") }
}
// Model Settings
@Published public var activeModelName: String? {
didSet { userDefaults.set(activeModelName, forKey: "activeModelName") }
}
@Published public var forcedLanguage: String? {
didSet { userDefaults.set(forcedLanguage, forKey: "forcedLanguage") }
}
// Insertion Settings
@Published public var insertionMethod: String {
didSet { userDefaults.set(insertionMethod, forKey: "insertionMethod") }
}
@Published public var showPreview: Bool {
didSet { userDefaults.set(showPreview, forKey: "showPreview") }
}
public init() {
// Load settings from UserDefaults
self.hotkey = Settings.loadHotkey()
self.hotkeyMode = HotkeyMode(rawValue: userDefaults.string(forKey: "hotkeyMode") ?? "") ?? .pushToTalk
self.playSounds = userDefaults.object(forKey: "playSounds") as? Bool ?? false
self.dictationTimeLimit = userDefaults.object(forKey: "dictationTimeLimit") as? TimeInterval ?? 600 // 10 minutes
self.activeModelName = userDefaults.string(forKey: "activeModelName")
self.forcedLanguage = userDefaults.string(forKey: "forcedLanguage")
self.insertionMethod = userDefaults.string(forKey: "insertionMethod") ?? "paste"
self.showPreview = userDefaults.object(forKey: "showPreview") as? Bool ?? false
logger.info("Settings initialized")
}
public func exportSettings() throws -> Data {
let settingsDict: [String: Any] = [
"hotkeyKeyCode": hotkey.keyCode,
"hotkeyModifiers": hotkey.modifiers,
"hotkeyMode": hotkeyMode.rawValue,
"playSounds": playSounds,
"dictationTimeLimit": dictationTimeLimit,
"activeModelName": activeModelName as Any,
"forcedLanguage": forcedLanguage as Any,
"insertionMethod": insertionMethod,
"showPreview": showPreview
]
return try JSONSerialization.data(withJSONObject: settingsDict, options: .prettyPrinted)
}
public func importSettings(from data: Data) throws {
let settingsDict = try JSONSerialization.jsonObject(with: data) as? [String: Any] ?? [:]
if let keyCode = settingsDict["hotkeyKeyCode"] as? UInt32,
let modifiers = settingsDict["hotkeyModifiers"] as? UInt32 {
hotkey = HotkeyConfig(keyCode: keyCode, modifiers: modifiers)
}
if let modeString = settingsDict["hotkeyMode"] as? String,
let mode = HotkeyMode(rawValue: modeString) {
hotkeyMode = mode
}
if let sounds = settingsDict["playSounds"] as? Bool {
playSounds = sounds
}
if let timeLimit = settingsDict["dictationTimeLimit"] as? TimeInterval {
dictationTimeLimit = timeLimit
}
activeModelName = settingsDict["activeModelName"] as? String
forcedLanguage = settingsDict["forcedLanguage"] as? String
if let method = settingsDict["insertionMethod"] as? String {
insertionMethod = method
}
if let preview = settingsDict["showPreview"] as? Bool {
showPreview = preview
}
logger.info("Settings imported successfully")
}
private static func loadHotkey() -> HotkeyConfig {
let keyCode = UserDefaults.standard.object(forKey: "hotkeyKeyCode") as? UInt32 ?? HotkeyConfig.default.keyCode
let modifiers = UserDefaults.standard.object(forKey: "hotkeyModifiers") as? UInt32 ?? HotkeyConfig.default.modifiers
return HotkeyConfig(keyCode: keyCode, modifiers: modifiers)
}
private func saveHotkey() {
userDefaults.set(hotkey.keyCode, forKey: "hotkeyKeyCode")
userDefaults.set(hotkey.modifiers, forKey: "hotkeyModifiers")
}
private func saveHotkeyMode() {
userDefaults.set(hotkeyMode.rawValue, forKey: "hotkeyMode")
}
}

24
Sources/CoreUtils/AppState.swift Normal file

@@ -0,0 +1,24 @@
import Foundation
public enum AppState: String, CaseIterable {
case idle = "idle"
case listening = "listening"
case processing = "processing"
case injecting = "injecting"
case error = "error"
public var displayName: String {
switch self {
case .idle:
return NSLocalizedString("menubar.idle", comment: "Idle state")
case .listening:
return NSLocalizedString("menubar.listening", comment: "Listening state")
case .processing:
return NSLocalizedString("menubar.processing", comment: "Processing state")
case .injecting:
return "Injecting" // Not shown in menu bar
case .error:
return "Error" // Not shown in menu bar
}
}
}

51
Sources/CoreUtils/Logger.swift Normal file

@@ -0,0 +1,51 @@
import Foundation
import os.log
public enum LogLevel: String, CaseIterable {
case debug = "DEBUG"
case info = "INFO"
case warning = "WARNING"
case error = "ERROR"
}
public class Logger {
private let osLog: OSLog
private let category: String
public init(category: String) {
self.category = category
self.osLog = OSLog(subsystem: "com.menuwhisper.app", category: category)
}
public func debug(_ message: String, file: String = #file, function: String = #function, line: Int = #line) {
log(level: .debug, message: message, file: file, function: function, line: line)
}
public func info(_ message: String, file: String = #file, function: String = #function, line: Int = #line) {
log(level: .info, message: message, file: file, function: function, line: line)
}
public func warning(_ message: String, file: String = #file, function: String = #function, line: Int = #line) {
log(level: .warning, message: message, file: file, function: function, line: line)
}
public func error(_ message: String, file: String = #file, function: String = #function, line: Int = #line) {
log(level: .error, message: message, file: file, function: function, line: line)
}
private func log(level: LogLevel, message: String, file: String, function: String, line: Int) {
let fileName = URL(fileURLWithPath: file).lastPathComponent
let logMessage = "[\(category)] \(message) (\(fileName):\(function):\(line))"
switch level {
case .debug:
os_log("%{public}@", log: osLog, type: .debug, logMessage)
case .info:
os_log("%{public}@", log: osLog, type: .info, logMessage)
case .warning:
os_log("%{public}@", log: osLog, type: .default, logMessage)
case .error:
os_log("%{public}@", log: osLog, type: .error, logMessage)
}
}
}

335
TECHSPEC.md Normal file

@@ -0,0 +1,335 @@
# Technical Definition — “Menu-Whisper” (macOS, Swift, Offline STT)
## 0) Owner Decisions (Locked)
- **Platform:** Apple Silicon only (M1/M2/M3), macOS 13+.
- **STT backends:** Start with **whisper.cpp (Metal)** for simplicity; add **Core ML** backend later.
- **Models:** Do **not** auto-download. On first run, user **chooses & downloads** a model.
- **VAD:** Post-MVP.
- **Insertion behavior:** Configurable; **direct insertion** is default (no preview).
- **Default hotkey:** **⌘⇧V** (user-configurable).
- **Punctuation:** Let the model handle punctuation automatically (no spoken commands).
- **Privacy/Connectivity:** 100% local at runtime; model downloads only when the user explicitly requests. **No telemetry**.
- **Distribution:** **.app/.dmg** (signed + notarized), outside the Mac App Store initially.
- **UI languages:** **ES/EN**.
- **Low-power mode:** Still allow downloads if the user starts them.
- **License:** **MIT**.
- **Per-dictation limit:** **10 minutes** by default (configurable).
---
## 1) Goal
A **menu bar** app for macOS that performs **offline speech-to-text** using Whisper-family models and **inserts the transcribed text** into whichever app currently has focus. Shows a minimal **HUD** while listening and processing. No internet required during normal operation.
---
## 2) MVP Scope
- Persistent **menu bar** item (NSStatusItem / `MenuBarExtra`).
- **Global hotkey** (push-to-talk and toggle modes).
- **HUD** (centered NSPanel + SwiftUI):
- “Listening” with audio-level animation (RMS/peak).
- “Processing” with a spinner/animation.
- **Offline STT** with **whisper.cpp** (GGUF models; Metal acceleration on Apple Silicon).
- **Model Manager**: curated list, manual download with progress + SHA256 check, user selection.
- **Text injection**:
- Preferred: **Clipboard + ⌘V** paste.
- Fallback: **simulated typing** via CGEvent.
- If **Secure Input** is active, **do not inject**; show notice and keep text on clipboard.
- **Preferences**: hotkey & mode, model & language, insertion method, HUD styling, sounds, dictation limit.
- **Permissions onboarding**: Microphone, Accessibility, Input Monitoring.
---
## 3) Functional Requirements
### 3.1 Capture
- Prompt for permissions on first use.
- Global hotkey (default ⌘⇧V).
- **Push-to-talk**: start on key down, stop on key up.
- **Toggle**: press to start, press again to stop.
- Per-dictation limit (default 10 min, range 10 s–30 min).
### 3.2 HUD / UX
- Non-activating, centered **NSPanel** (~320×160), no focus stealing.
- **Listening**: bar-style audio visualization driven by live RMS/peak.
- **Processing**: spinner + “Transcribing…” label.
- **Esc** to cancel.
- Optional start/stop sounds (user-toggleable).
### 3.3 STT
- Backend A (MVP): **whisper.cpp** with **GGUF** and **Metal**.
- Language: auto-detect or forced (persisted).
- Basic text normalization; punctuation from the model.
- UTF-8 output; standard replacements (quotes, dashes, etc.).
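A minimal sketch of that normalization step (the rules shown are illustrative, not the final set):

```swift
import Foundation

// Sketch: trim and tidy the raw model output before insertion.
func normalize(_ raw: String) -> String {
    var text = raw.trimmingCharacters(in: .whitespacesAndNewlines)
    text = text.replacingOccurrences(of: " ,", with: ",") // drop stray space before punctuation
    text = text.replacingOccurrences(of: " .", with: ".")
    return text
}
```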
### 3.4 Injection
- Preferred method: **NSPasteboard** + **CGEvent** to send ⌘V.
- Fallback: **CGEventCreateKeyboardEvent** (character-by-character), respecting active keyboard layout.
- **Secure Input**: detect with `IsSecureEventInputEnabled()`; if enabled, **do not inject**. Show a non-intrusive notice and leave the text on the clipboard.
### 3.5 Preferences
- **General:** hotkey + mode (push/toggle), sounds, HUD options.
- **Models:** catalog, download, select active model, language, local storage path.
- **Insertion:** direct vs preview (preview **off** by default), paste vs type.
- **Advanced:** limits, performance knobs (threads/batch), **local** logs opt-in.
---
## 4) Non-Functional Requirements
- **Offline** execution after models are installed.
- **Latency target** (M1 + “small” model): < 4 s for 10 s of audio.
- **Memory target:** ~1.5–2.5 GB with “small”.
- **Privacy:** audio and text never leave the device.
- **Accessibility:** sufficient contrast; VoiceOver labels; focus never stolen by HUD.
---
## 5) Architecture (High-Level)
- **App (SwiftUI)** with AppKit bridges for NSStatusItem and NSPanel.
- **Shortcut Manager** (Carbon `RegisterEventHotKey` or HotKey/MASShortcut).
- **Audio**: AVAudioEngine (downsample to 16 kHz mono, 16-bit PCM).
- **STT Engine**:
- **whisper.cpp** (C/C++ via SPM/CMake) with Metal.
- **Core ML backend** (e.g., WhisperKit / custom) in a later phase.
- **Model Manager**: curated catalog, downloads (progress + SHA256), selection, caching.
- **Text Injection**: pasteboard + CGEvent; typing fallback; Secure Input detection.
- **Permissions Manager**: guided flows to System Settings panes.
- **Settings**: UserDefaults + JSON export/import.
- **Packaging**: .app + .dmg (signed & notarized).
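A hedged sketch of the 16 kHz mono downsample mentioned above, using `AVAudioConverter` (the shipped capture pipeline may structure this differently):

```swift
import AVFoundation

// Sketch: convert a captured buffer to 16 kHz mono Float32 for the STT backend.
func downsampleTo16kMono(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
    guard let target = AVAudioFormat(commonFormat: .pcmFormatFloat32,
                                     sampleRate: 16_000,
                                     channels: 1,
                                     interleaved: false),
          let converter = AVAudioConverter(from: buffer.format, to: target) else { return nil }
    let ratio = target.sampleRate / buffer.format.sampleRate
    guard let out = AVAudioPCMBuffer(pcmFormat: target,
                                     frameCapacity: AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 1)
    else { return nil }
    var delivered = false
    var convError: NSError?
    let status = converter.convert(to: out, error: &convError) { _, inputStatus in
        if delivered { inputStatus.pointee = .noDataNow; return nil } // source buffer already handed over
        delivered = true
        inputStatus.pointee = .haveData
        return buffer
    }
    return (status == .error || convError != nil) ? nil : out
}
```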
---
## 6) Main Flow
1. User presses global hotkey.
2. Check permissions; guide if missing.
3. Show HUD → **Listening**; start capture.
4. Stop (key up/toggle/timeout).
5. HUD → **Processing**; run STT in background.
6. On result → (optional preview) → **insert** (paste) or **fallback** (type). If Secure Input, **do not inject**; keep in clipboard + show notice.
7. Close HUD → **Idle**.
---
## 7) Finite State Machine (FSM)
- **Idle** → (Hotkey) → **Listening**
- **Listening** → (Stop/Timeout) → **Processing**
- **Processing** → (Done) → **Injecting**
- **Injecting** → (Done) → **Idle**
- Any → (Error) → **ErrorModal** → **Idle**
---
## 8) Model Management (Manual Downloads)
**Goal:** Offer a clear list of **free** Whisper-family models (names, sizes, languages, recommended backend) with one-click downloads. No automatic downloads.
### 8.1 OpenAI Whisper (official weights)
- Families: **tiny**, **base**, **small**, **medium**, **large-v2**, **large-v3** (multilingual; some `.en` variants).
- Usable with **whisper.cpp** via **GGUF** (community conversions widely available).
### 8.2 Whisper for whisper.cpp (converted GGUF)
- Community-maintained conversions for whisper.cpp (GGUF), optimized for CPU/GPU Metal on macOS.
### 8.3 Faster-Whisper (CTranslate2)
- Optimized variants (tiny/base/small/medium/large-v2/large-v3). Useful if a CT2-based or Core-ML-assisted backend is added later.
### 8.4 Distil-Whisper (distilled)
- Distilled models (e.g., **distil-large-v2/v3/v3.5**, **distil-small.en**), significantly smaller/faster with near-large accuracy.
> **UI must show:** model file size, languages, license, **RAM estimate**, and a warning if a large model is selected on lower-memory machines.
**Optional JSON Schema for catalog entries (for the app's first-run picker):**
```json
{
"name": "whisper-small",
"family": "OpenAI-Whisper",
"format": "gguf",
"size_mb": 466,
"languages": ["multilingual"],
"recommended_backend": "whisper.cpp",
"quality_tier": "small",
"license": "MIT",
"sha256": "…",
"download_url": "…",
"notes": "Good balance of speed/accuracy on M1/M2."
}
```
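The repo's `ModelInfo` type (Core/Models) mirrors this schema via its `CodingKeys`, so an entry decodes directly; a sketch (the URL parameter stands in for the bundled catalog file):

```swift
import Foundation

// Sketch: decode the curated catalog into the existing ModelInfo type.
func loadCatalog(from url: URL) throws -> [ModelInfo] {
    let data = try Data(contentsOf: url)
    return try JSONDecoder().decode([ModelInfo].self, from: data)
}
```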
---
## 9) Security & Permissions
* **Info.plist:** `NSMicrophoneUsageDescription`.
* **Accessibility & Input Monitoring:** required for CGEvent; provide clear step-by-step guidance and deep-links.
* **Secure Input:** check `IsSecureEventInputEnabled()`; **never** attempt to bypass. Provide help text to identify apps that enable it (password fields, 2FA prompts, etc.).
---
## 10) Performance
* Lazy-load and reuse model (warm cache).
* Real-time downsampling to 16 kHz mono; chunked streaming into backend.
* Configurable threads; prefer **Metal** path on Apple Silicon.
* “Fast path” tweaks for short clips (<15 s).
---
## 11) Logging & Privacy
* **No remote telemetry.**
* Local logs **opt-in** (timings, errors only). Never store audio/text unless user explicitly enables a debug flag.
* “Wipe local data” button (models remain unless the user removes them).
---
## 12) Internationalization
* UI in **Spanish** and **English** (Localizable.strings).
* STT multilingual; language auto or forced per user preference.
---
## 13) Testing (Minimum)
* macOS 13/14/15 on M1/M2/M3.
* Injection works in Safari, Chrome, Notes, VS Code, Terminal, iTerm2, Mail.
* **Secure Input**: correctly detected; no injection; clipboard + notice.
* Meet latency target with **small** model on M1.
* Model download & selection flows (simulate network errors).
---
## 14) Phased Plan (AI-Deliverables)
### Phase 0 — Scaffolding (MVP-0)
**Goal:** Base project + menubar.
**Deliverables:**
* SwiftUI app with `MenuBarExtra`, microphone icon, “Idle” state.
* `ARCHITECTURE.md` describing modules (Audio/STT/Injection/Models/Permissions/Settings).
* Build scripts and signing/notarization templates.
**DoD:** Compiles; menu bar item visible; SPM structure ready.
---
### Phase 1 — Hotkey + HUD + Audio (MVP-1)
**Goal:** Listening UX without real STT.
**Deliverables:**
* Global hotkey (default ⌘⇧V) with **push** and **toggle**.
* NSPanel HUD (Listening/Processing) + **real** RMS bars from AVAudioEngine.
* Per-dictation limit (default 10 min).
**DoD:** Live meter responds to mic; correct state transitions.
---
### Phase 2 — STT via whisper.cpp (MVP-2)
**Goal:** Real offline transcription.
**Deliverables:**
* **whisper.cpp** module (C/C++), background inference with **Metal**.
* **Model Manager** (curated list, download with SHA256, selection).
* Language auto/forced; basic normalization.
**DoD:** 10-second clip → coherent ES/EN text offline; meets timing targets.
---
### Phase 3 — Robust Insertion (MVP-3)
**Goal:** Reliable insertion into focused app.
**Deliverables:**
* Paste (clipboard + ⌘V) and typing fallback.
* **Secure Input** detection; safe behavior (no injection, clipboard + notice).
**DoD:** Works across target apps; correct Secure Input handling.
---
### Phase 4 — Preferences + UX Polish (MVP-4)
**Goal:** Complete options & stability.
**Deliverables:**
* Full Preferences (hotkey, modes, model, language, insertion, HUD, sounds).
* Optional preview dialog (off by default).
* Config export/import (JSON).
**DoD:** All settings persist and are honored.
---
### Phase 5 — Distribution (MVP-5)
**Goal:** Installable package.
**Deliverables:**
* Error handling; permission prompts & help (incl. Secure Input troubleshooting).
* **.dmg** (signed + notarized) and install guide.
* **USER_GUIDE.md** + **TROUBLESHOOTING.md**.
**DoD:** Clean install on test machines; distribution checklist passed.
---
### Phase 6 — Core ML Backend (Post-MVP)
**Goal:** Second backend.
**Deliverables:**
* **Core ML** integration (e.g., WhisperKit or custom conversion).
* Backend selector (whisper.cpp/Core ML) in Preferences; local benchmarks table.
**DoD:** Feature parity and stability; documented pros/cons.
---
## 15) Mini-Prompts for the Builder AI (per Phase)
* **P0:** “Create macOS 13+ SwiftUI menubar app (`MenuBarExtra`), microphone icon, SPM layout with modules in `ARCHITECTURE.md`.”
* **P1:** “Add global hotkey (push & toggle) with `RegisterEventHotKey`; NSPanel HUD with RMS bars from AVAudioEngine; 10-minute dictation limit.”
* **P2:** “Integrate **whisper.cpp** (Metal); add Model Manager (curated list, SHA256-verified downloads, selection); language auto/forced; transcribe WAV 16 kHz mono.”
* **P3:** “Implement insertion: pasteboard+⌘V and CGEvent typing fallback; detect `IsSecureEventInputEnabled()` and avoid injection.”
* **P4:** “Implement full Preferences, optional preview, JSON export/import; UX polish and messages.”
* **P5:** “Signing + notarization; produce .dmg; write USER_GUIDE and TROUBLESHOOTING (with Secure Input section).”
* **P6:** “Add Core ML backend (WhisperKit/custom), backend selector, and local benchmarks.”
---
## 16) Suggested Repo Layout
```
MenuWhisper/
Sources/
App/ # SwiftUI + AppKit bridges
Core/
Audio/ # AVAudioEngine capture + meters
STT/
WhisperCPP/ # C/C++ wrapper + Metal path
CoreML/ # post-MVP
Models/ # catalog, downloads, hashes
Injection/ # clipboard, CGEvent typing, secure input checks
Permissions/
Settings/
Utils/
Resources/ # icons, sounds, localizations
Docs/ # ARCHITECTURE.md, USER_GUIDE.md, TROUBLESHOOTING.md
Scripts/ # build, sign, notarize
Tests/ # unit + integration
```
---
## 17) Risks & Mitigations
* **Hotkey collision (⌘⇧V)** with “Paste and Match Style” in some apps → make it discoverable & easily rebindable; warn on conflict.
* **Secure Input** blocks injection → inform the user, keep text on clipboard, provide help to identify the app enabling it.
* **RAM/latency** with large models → recommend **small/base** by default; show RAM/latency hints in the model picker.
* **Keyboard layouts** → prefer paste; if typing, map using the active layout (or attach the unicode payload directly, as sketched below).
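For that last risk, one layout-independent option is to attach the unicode payload to the key event instead of mapping characters to key codes. A sketch (assumes Accessibility permission is granted; `typeUnicode` is an illustrative name):
```swift
import CoreGraphics

/// Types text by embedding UTF-16 units in synthetic key events, which
/// sidesteps keyboard-layout mapping entirely.
func typeUnicode(_ text: String) {
    for scalar in text.unicodeScalars {
        var utf16 = Array(String(scalar).utf16)
        let down = CGEvent(keyboardEventSource: nil, virtualKey: 0, keyDown: true)
        down?.keyboardSetUnicodeString(stringLength: utf16.count, unicodeString: &utf16)
        down?.post(tap: .cghidEventTap)
        let up = CGEvent(keyboardEventSource: nil, virtualKey: 0, keyDown: false)
        up?.keyboardSetUnicodeString(stringLength: utf16.count, unicodeString: &utf16)
        up?.post(tap: .cghidEventTap)
    }
}
```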
---
## 18) Global MVP Definition of Done
* A 30–90 s dictation yields accurate ES/EN text **offline** and inserts correctly in common apps.
* Secure Input is correctly detected and handled.
* Model download/selection is robust and user-driven.
* Shippable **.dmg** (signed + notarized) and clear docs included.

223
TODO.md Normal file
View file

@ -0,0 +1,223 @@
# TODO — Menu-Whisper (macOS, Swift, Offline STT)
This file tracks the tasks needed to deliver the app in **phases** with clear acceptance checks.
Conventions:
- `[ ]` = to do, `[x]` = done
- **AC** = Acceptance Criteria
- All features must work **offline** after models are installed.
---
## Global / Project-Wide
- [x] Set project license to **MIT** and add `LICENSE` file.
- [x] Add `README.md` with high-level summary, build requirements (Xcode, macOS 13+), Apple Silicon-only note.
- [x] Add `Docs/ARCHITECTURE.md` skeleton (to be filled in Phase 0).
- [x] Create base **localization** scaffolding (`en.lproj`, `es.lproj`) with `Localizable.strings`.
- [x] Add SwiftPM structure with separate targets for `App`, `Core/*` modules.
- [x] Prepare optional tooling:
- [x] SwiftFormat / SwiftLint config (opt-in).
- [x] GitHub Actions macOS runner for **build-only** CI (optional).
---
## Phase 0 — Scaffolding (MVP-0)
**Goal:** Base project + menu bar item; structure and docs.
### Tasks
- [x] Create SwiftUI macOS app (macOS 13+) with `MenuBarExtra` / `NSStatusItem`.
- [x] Add placeholder mic icon (template asset).
- [x] Create module targets:
- [x] `Core/Audio`
- [x] `Core/STT` (with subfolders `WhisperCPP` and `CoreML` (stub))
- [x] `Core/Models`
- [x] `Core/Injection`
- [x] `Core/Permissions`
- [x] `Core/Settings`
- [x] `Core/Utils`
- [x] Wire a minimal state machine: `Idle` state shown in menubar menu.
- [x] Add scripts:
- [x] `Scripts/build.sh` (SPM/Xcodebuild)
- [x] `Scripts/notarize.sh` (stub with placeholders for later)
- [x] Write `Docs/ARCHITECTURE.md` (modules, data flow, FSM diagram).
### AC
- [x] Project compiles and shows a **menu bar** icon with a basic menu.
- [x] Repo has clear structure and architecture doc.
---
## Phase 1 — Hotkey + HUD + Audio (MVP-1)
**Goal:** Listening UX without real STT.
### Tasks
- [ ] Implement **global hotkey** manager (see the Carbon sketch after this phase's AC):
- [ ] Default **⌘⇧V** (configurable later).
- [ ] Support **push-to-talk** (start on key down, stop on key up).
- [ ] Support **toggle** (press to start, press to stop).
- [ ] Create **HUD** as non-activating centered `NSPanel`:
- [ ] State **Listening** with **RMS/peak bars** animation (SwiftUI view).
- [ ] State **Processing** with spinner/label.
- [ ] Dismiss/cancel with **Esc**.
- [ ] Implement **AVAudioEngine** capture:
- [ ] Tap on input bus; compute RMS/peak for visualization.
- [ ] Resample path ready for 16 kHz mono PCM (no STT yet).
- [ ] Add dictation **time limit** (default **10 min**, configurable later).
- [ ] Optional **sounds** for start/stop (toggle in settings later).
- [ ] Permissions onboarding:
- [ ] Request **Microphone** permission with Info.plist string.
- [ ] Show guide for **Accessibility** and **Input Monitoring** (no hard gating yet).
### AC
- [ ] Hotkey works in both modes (push/toggle) across desktop & full-screen apps.
- [ ] HUD appears centered; **Listening** shows live bars; **Processing** shows spinner.
- [ ] Cancel (Esc) reliably stops listening and hides HUD.
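A minimal sketch of the hotkey manager named above, assuming the Carbon `RegisterEventHotKey` path from the spec; `HotkeyManager` and its callbacks are illustrative:
```swift
import Carbon

/// Registers ⌘⇧V globally. Carbon delivers both pressed and released
/// events, which is what makes push-to-talk possible without an event tap.
final class HotkeyManager {
    private var hotKeyRef: EventHotKeyRef?
    var onKeyDown: (() -> Void)?
    var onKeyUp: (() -> Void)?

    func register() {
        let eventTypes = [
            EventTypeSpec(eventClass: OSType(kEventClassKeyboard), eventKind: UInt32(kEventHotKeyPressed)),
            EventTypeSpec(eventClass: OSType(kEventClassKeyboard), eventKind: UInt32(kEventHotKeyReleased)),
        ]
        // The C callback cannot capture Swift state, so `self` travels via userData.
        InstallEventHandler(GetApplicationEventTarget(), { _, event, userData in
            guard let event, let userData else { return noErr }
            let manager = Unmanaged<HotkeyManager>.fromOpaque(userData).takeUnretainedValue()
            if GetEventKind(event) == UInt32(kEventHotKeyPressed) {
                manager.onKeyDown?()
            } else {
                manager.onKeyUp?()
            }
            return noErr
        }, 2, eventTypes, Unmanaged.passUnretained(self).toOpaque(), nil)

        let hotKeyID = EventHotKeyID(signature: OSType(0x4D57_4850), id: 1) // "MWHP", arbitrary
        RegisterEventHotKey(UInt32(kVK_ANSI_V), UInt32(cmdKey | shiftKey),
                            hotKeyID, GetApplicationEventTarget(), 0, &hotKeyRef)
    }
}
```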
---
## Phase 2 — STT via whisper.cpp (MVP-2)
**Goal:** Real offline transcription (Apple Silicon + Metal).
### Tasks
- [ ] Add **whisper.cpp** integration:
- [ ] Vendor/SwiftPM/Wrapper target for C/C++.
- [ ] Build with **Metal** path enabled on Apple Silicon.
- [ ] Define `STTEngine` protocol and `WhisperCPPSTTEngine` implementation (sketched after this task list).
- [ ] Audio pipeline:
- [ ] Convert captured audio to **16 kHz mono** 16-bit PCM.
- [ ] Chunking/streaming into STT worker; end-of-dictation triggers transcription.
- [ ] **Model Manager** (backend + minimal UI):
- [ ] Bundle a **curated JSON catalog** (name, size, languages, license, URL, SHA256).
- [ ] Download via `URLSession` with progress + resume support.
- [ ] Validate **SHA256** (verification sketch after this phase's AC); store under `~/Library/Application Support/MenuWhisper/Models`.
- [ ] Allow **select active model**; persist selection.
- [ ] Language: **auto** or **forced** (persist).
- [ ] Text normalization pass (basic replacements; punctuation from model).
- [ ] Error handling (network failures, disk full, missing model).
- [ ] Performance knobs (threads, GPU toggle if exposed by backend).
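A sketch of the `STTEngine` seam defined above; the exact signatures are open, but an async method keeps inference off the main thread (the C call names in the comments indicate the whisper.cpp API, not a binding that exists in this repo yet):
```swift
import Foundation

protocol STTEngine {
    /// Load a model file from the Models directory.
    func loadModel(at url: URL) throws
    /// Transcribe 16 kHz mono float samples; nil language means auto-detect.
    func transcribe(samples: [Float], language: String?) async throws -> String
}

final class WhisperCPPSTTEngine: STTEngine {
    func loadModel(at url: URL) throws {
        // Call into the C wrapper here (e.g., whisper_init_from_file_with_params).
    }

    func transcribe(samples: [Float], language: String?) async throws -> String {
        // Run whisper_full on a background task, then join the segment texts.
        return ""
    }
}
```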
### AC
- [ ] A **10 s** clip produces coherent **ES/EN** text **offline**.
- [ ] Latency target: **< 4 s** of added processing for a 10 s clip on an M1 with the **small** model.
- [ ] Memory: ~**1.5–2.5 GB** with the **small** model, with no leaks.
- [ ] Model download: progress UI + SHA256 verification + selection works.
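For the SHA256 item above, a streaming digest keeps memory flat on multi-GB model files. A sketch assuming CryptoKit:
```swift
import CryptoKit
import Foundation

/// Verifies a downloaded model against the catalog's SHA256 before it is
/// moved into Application Support. Reads in 1 MiB chunks.
func sha256Matches(fileAt url: URL, expectedHex: String) throws -> Bool {
    let handle = try FileHandle(forReadingFrom: url)
    defer { try? handle.close() }
    var hasher = SHA256()
    while let chunk = try handle.read(upToCount: 1 << 20), !chunk.isEmpty {
        hasher.update(data: chunk)
    }
    let hex = hasher.finalize().map { String(format: "%02x", $0) }.joined()
    return hex == expectedHex.lowercased()
}
```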
---
## Phase 3 — Robust Text Insertion (MVP-3)
**Goal:** Insert text into focused app safely; handle Secure Input.
### Tasks
- [ ] Implement **Paste** method (sketched after this phase's AC):
- [ ] Put text on **NSPasteboard** (general).
- [ ] Send **⌘V** via CGEvent to focused app.
- [ ] Implement **Typing** fallback:
- [ ] Generate per-character **CGEvent**; respect active keyboard layout.
- [ ] Handle `\n`, `\t`, and common unicode safely.
- [ ] Detect **Secure Input**:
- [ ] Check `IsSecureEventInputEnabled()` (or another accepted API) before any injection.
- [ ] If enabled: **do not inject**; keep text on clipboard; show non-blocking notice.
- [ ] Add preference for **insertion method** (Paste preferred) + fallback strategy.
- [ ] Add **Permissions** helpers for Accessibility/Input Monitoring (deep links).
- [ ] Compatibility tests: Safari, Chrome, Notes, VS Code, Terminal, iTerm2, Mail.
### AC
- [ ] Text reliably appears in the currently focused app via Paste.
- [ ] If Paste is blocked, Typing fallback works (except in Secure Input).
- [ ] When **Secure Input** is active: no injection occurs; clipboard contains the text; user is informed.
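A sketch of the Paste path from the tasks above, assuming the Secure Input check has already passed; restoring the previous pasteboard contents is a nicety left out here:
```swift
import AppKit
import Carbon

/// Puts text on the general pasteboard and synthesizes ⌘V.
/// Requires the Accessibility permission to post events.
func pasteIntoFocusedApp(_ text: String) {
    let pasteboard = NSPasteboard.general
    pasteboard.clearContents()
    pasteboard.setString(text, forType: .string)

    let source = CGEventSource(stateID: .combinedSessionState)
    let vDown = CGEvent(keyboardEventSource: source, virtualKey: CGKeyCode(kVK_ANSI_V), keyDown: true)
    let vUp = CGEvent(keyboardEventSource: source, virtualKey: CGKeyCode(kVK_ANSI_V), keyDown: false)
    vDown?.flags = .maskCommand
    vUp?.flags = .maskCommand
    vDown?.post(tap: .cghidEventTap)
    vUp?.post(tap: .cghidEventTap)
}
```
Paste is the preferred method because a single ⌘V is layout-independent and fast even for long transcripts.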
---
## Phase 4 — Preferences + UX Polish (MVP-4)
**Goal:** Complete options, localization, and stability.
### Tasks
- [ ] Full **Preferences** window:
- [ ] Hotkey recorder (change ⌘⇧V if needed).
- [ ] Mode: Push-to-talk / Toggle.
- [ ] Model picker: list, **download**, **delete**, **set active**, show size/language/license.
- [ ] Language: Auto / Forced (dropdown).
- [ ] Insertion: **Direct** (default) vs **Preview**; Paste vs Typing preference.
- [ ] HUD: opacity/size, show/hide sounds toggles.
- [ ] Dictation limit: editable (default 10 min).
- [ ] Advanced: threads/batch; **local logs opt-in**.
- [ ] **Export/Import** settings (JSON; sketched after this phase's AC).
- [ ] Implement **Preview** dialog (off by default): shows transcribed text with **Insert** / **Cancel**.
- [ ] Expand **localization** (ES/EN) for all UI strings.
- [ ] Onboarding & help views (permissions, Secure Input explanation).
- [ ] Persist all settings in `UserDefaults`; validate on load; migrate if needed.
- [ ] UX polish: icons, animation timing, keyboard navigation, VoiceOver labels.
- [ ] Optional: internal **timing instrumentation** (guarded by logs opt-in).
### AC
- [ ] All preferences persist and take effect without relaunch.
- [ ] Preview (when enabled) allows quick edit & insertion.
- [ ] ES/EN localization passes a manual spot-check.
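A sketch of the export/import item above, assuming `Codable`; the field list is illustrative and the real `Settings` type owns the full set:
```swift
import Foundation

struct SettingsSnapshot: Codable {
    var hotkeyMode: String          // "pushToTalk" | "toggle"
    var modelID: String?
    var language: String?           // nil = auto
    var insertionMethod: String     // "paste" | "typing"
    var dictationLimitMinutes: Int
}

func exportSettings(_ snapshot: SettingsSnapshot, to url: URL) throws {
    let encoder = JSONEncoder()
    encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
    try encoder.encode(snapshot).write(to: url, options: .atomic)
}

func importSettings(from url: URL) throws -> SettingsSnapshot {
    try JSONDecoder().decode(SettingsSnapshot.self, from: Data(contentsOf: url))
}
```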
---
## Phase 5 — Distribution (MVP-5)
**Goal:** Shippable, signed/notarized .dmg, user docs.
### Tasks
- [ ] Hardened runtime, entitlements, Info.plist:
- [ ] `NSMicrophoneUsageDescription`
- [ ] Review for any additional required entitlements.
- [ ] **Code signing** with Developer ID; set team identifiers.
- [ ] **Notarization** using `notarytool`; **staple** on success.
- [ ] Build **.app** and create **.dmg**:
- [ ] DMG background, /Applications symlink, icon.
- [ ] Write **Docs/USER_GUIDE.md** (first run, downloading models, dictation flow).
- [ ] Write **Docs/TROUBLESHOOTING.md** (permissions, Secure Input, model space/RAM issues).
- [ ] QA matrix:
- [ ] macOS **13/14/15**, Apple Silicon **M1/M2/M3**.
- [ ] Target apps list (insertion works).
- [ ] Offline check (network disabled).
- [ ] Prepare **VERSIONING** notes and changelog (semantic-ish).
### AC
- [ ] Signed & **notarized** .dmg installs cleanly.
- [ ] App functions **entirely offline** post-model download.
- [ ] Guides are complete and reference all common pitfalls.
---
## Phase 6 — Core ML Backend (Post-MVP)
**Goal:** Second STT backend and selector.
### Tasks
- [ ] Evaluate **Core ML** path (e.g., WhisperKit or custom Core ML models).
- [ ] Implement `STTEngineCoreML` conforming to `STTEngine` protocol.
- [ ] Backend **selector** in Preferences; runtime switching (sketch after this phase's AC).
- [ ] Ensure **feature parity** (language settings, output normalization).
- [ ] **Benchmarks**: produce local latency/memory table across small/base/medium.
- [ ] Errors & fallbacks (if model missing, surface helpful guidance).
### AC
- [ ] Both backends run on Apple Silicon; user can switch backends.
- [ ] Comparable outputs; documented pros/cons and performance data.
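A sketch of the backend switch from the tasks above, reusing the Phase 2 `STTEngine` protocol; `STTEngineCoreML` is the name this phase's tasks call for:
```swift
enum STTBackend: String, Codable {
    case whisperCPP
    case coreML
}

func makeEngine(for backend: STTBackend) -> STTEngine {
    switch backend {
    case .whisperCPP: return WhisperCPPSTTEngine()
    case .coreML: return STTEngineCoreML()
    }
}
```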
---
## Backlog / Post-MVP Options
- [ ] **VAD (WebRTC)**: auto-stop on silence with thresholds.
- [ ] **Continuous dictation** with smart segmentation.
- [ ] **Noise suppression** and AGC in the audio pipeline.
- [ ] **Login item** (auto-launch at login).
- [ ] **Sparkle** or custom updater (if desirable outside App Store).
- [ ] **Settings profiles** (per-language/model presets).
- [ ] **In-app model catalog refresh** (remote JSON update).
- [ ] **Advanced insertion rules** (per-app behavior).
- [ ] **Analytics viewer** for local logs (no telemetry).
---

View file

@ -0,0 +1,10 @@
import XCTest
@testable import MenuWhisperAudio
final class AudioEngineTests: XCTestCase {
func testAudioEngineInitialization() {
let engine = AudioEngine()
XCTAssertNotNil(engine)
XCTAssertFalse(engine.isCapturing)
}
}

View file

@ -0,0 +1,9 @@
import XCTest
@testable import CoreInjection
final class TextInjectorTests: XCTestCase {
func testTextInjectorInitialization() {
let injector = TextInjector()
XCTAssertNotNil(injector)
}
}

View file

@ -0,0 +1,10 @@
import XCTest
@testable import CoreModels
final class ModelManagerTests: XCTestCase {
func testModelManagerInitialization() {
let manager = ModelManager()
XCTAssertNotNil(manager)
XCTAssertEqual(manager.availableModels.count, 0)
}
}

View file

@ -0,0 +1,9 @@
import XCTest
@testable import CorePermissions
final class PermissionManagerTests: XCTestCase {
func testPermissionManagerInitialization() {
let manager = PermissionManager()
XCTAssertNotNil(manager)
}
}

View file

@ -0,0 +1,10 @@
import XCTest
@testable import CoreSTT
final class STTEngineTests: XCTestCase {
func testWhisperCPPEngineInitialization() {
let engine = WhisperCPPEngine()
XCTAssertNotNil(engine)
XCTAssertFalse(engine.isModelLoaded())
}
}

View file

@ -0,0 +1,10 @@
import XCTest
@testable import CoreSettings
final class SettingsTests: XCTestCase {
func testSettingsInitialization() {
let settings = Settings()
XCTAssertNotNil(settings)
XCTAssertEqual(settings.hotkeyMode, .pushToTalk)
}
}

View file

@ -0,0 +1,17 @@
import XCTest
@testable import CoreUtils
final class LoggerTests: XCTestCase {
func testLoggerInitialization() {
let logger = Logger(category: "Test")
logger.info("Test message")
}
func testLoggerLevels() {
let logger = Logger(category: "Test")
logger.debug("Debug message")
logger.info("Info message")
logger.warning("Warning message")
logger.error("Error message")
}
}