From 1db16227b232ad49af6afcaa27b9a9c0b96d88b6 Mon Sep 17 00:00:00 2001 From: "Felipe M." Date: Thu, 18 Sep 2025 19:56:06 +0200 Subject: [PATCH] Initial commit --- .github/workflows/build.yml | 57 +++ .gitignore | 37 ++ .swiftformat | 41 +++ .swiftlint.yml | 92 +++++ Docs/ARCHITECTURE.md | 243 +++++++++++++ LICENSE | 21 ++ Package.swift | 123 +++++++ README.md | 96 +++++ .../en.lproj/Localizable.strings | 77 ++++ .../es.lproj/Localizable.strings | 77 ++++ Scripts/build.sh | 38 ++ Scripts/notarize.sh | 35 ++ Sources/App/main.swift | 18 + Sources/CoreAudio/AudioEngine.swift | 42 +++ Sources/CoreInjection/TextInjector.swift | 73 ++++ Sources/CoreModels/ModelManager.swift | 70 ++++ .../CorePermissions/PermissionManager.swift | 111 ++++++ Sources/CoreSTT/STTEngine.swift | 32 ++ .../CoreSTT/WhisperCPP/WhisperCPPEngine.swift | 35 ++ Sources/CoreSettings/Settings.swift | 149 ++++++++ Sources/CoreUtils/AppState.swift | 24 ++ Sources/CoreUtils/Logger.swift | 51 +++ TECHSPEC.md | 335 ++++++++++++++++++ TODO.md | 223 ++++++++++++ Tests/CoreAudioTests/AudioEngineTests.swift | 10 + .../TextInjectorTests.swift | 9 + Tests/CoreModelsTests/ModelManagerTests.swift | 10 + .../PermissionManagerTests.swift | 9 + Tests/CoreSTTTests/STTEngineTests.swift | 10 + Tests/CoreSettingsTests/SettingsTests.swift | 10 + Tests/CoreUtilsTests/LoggerTests.swift | 17 + 31 files changed, 2175 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100644 .gitignore create mode 100644 .swiftformat create mode 100644 .swiftlint.yml create mode 100644 Docs/ARCHITECTURE.md create mode 100644 LICENSE create mode 100644 Package.swift create mode 100644 README.md create mode 100644 Resources/Localizations/en.lproj/Localizable.strings create mode 100644 Resources/Localizations/es.lproj/Localizable.strings create mode 100755 Scripts/build.sh create mode 100755 Scripts/notarize.sh create mode 100644 Sources/App/main.swift create mode 100644 Sources/CoreAudio/AudioEngine.swift create mode 100644 
Sources/CoreInjection/TextInjector.swift create mode 100644 Sources/CoreModels/ModelManager.swift create mode 100644 Sources/CorePermissions/PermissionManager.swift create mode 100644 Sources/CoreSTT/STTEngine.swift create mode 100644 Sources/CoreSTT/WhisperCPP/WhisperCPPEngine.swift create mode 100644 Sources/CoreSettings/Settings.swift create mode 100644 Sources/CoreUtils/AppState.swift create mode 100644 Sources/CoreUtils/Logger.swift create mode 100644 TECHSPEC.md create mode 100644 TODO.md create mode 100644 Tests/CoreAudioTests/AudioEngineTests.swift create mode 100644 Tests/CoreInjectionTests/TextInjectorTests.swift create mode 100644 Tests/CoreModelsTests/ModelManagerTests.swift create mode 100644 Tests/CorePermissionsTests/PermissionManagerTests.swift create mode 100644 Tests/CoreSTTTests/STTEngineTests.swift create mode 100644 Tests/CoreSettingsTests/SettingsTests.swift create mode 100644 Tests/CoreUtilsTests/LoggerTests.swift diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..643bce8 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,57 @@ +name: Build + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main ] + +jobs: + build: + name: Build Menu-Whisper + runs-on: macos-13 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Select Xcode version + run: sudo xcode-select -s /Applications/Xcode_15.0.app/Contents/Developer + + - name: Show Xcode version + run: xcodebuild -version + + - name: Show Swift version + run: swift --version + + - name: Cache Swift Package Manager + uses: actions/cache@v3 + with: + path: .build + key: ${{ runner.os }}-spm-${{ hashFiles('**/Package.resolved') }} + restore-keys: | + ${{ runner.os }}-spm- + + - name: Build with Swift Package Manager + run: swift build -c release + + - name: Run tests + run: swift test + + - name: Check code formatting (SwiftFormat) + run: | + # Install SwiftFormat if available + if command -v 
swiftformat >/dev/null 2>&1; then + swiftformat --lint . + else + echo "SwiftFormat not available, skipping format check" + fi + + - name: Run SwiftLint + run: | + # Install SwiftLint if available + if command -v swiftlint >/dev/null 2>&1; then + swiftlint + else + echo "SwiftLint not available, skipping lint check" + fi \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..48382e4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Build artifacts +.build/ +build/ +DerivedData/ + +# Swift Package Manager +.swiftpm/ +Package.resolved + +# Xcode +*.xcodeproj/ +*.xcworkspace/ +xcuserdata/ +*.xccheckout +*.moved-aside + +# macOS +.DS_Store +.AppleDouble +.LSOverride + +# Temporary files +*.tmp +*.swp +*.swo +*~ + +# Logs +*.log + +# IDE +.vscode/ +.idea/ + +# Environment +.env +.env.local \ No newline at end of file diff --git a/.swiftformat b/.swiftformat new file mode 100644 index 0000000..f6693c6 --- /dev/null +++ b/.swiftformat @@ -0,0 +1,41 @@ +# SwiftFormat Configuration for Menu-Whisper + +# Indentation +--indent 4 +--indentcase false +--smarttabs enabled + +# Spacing +--spaces 4 +--trailingspace ignore +--semicolons never + +# Line breaks +--maxwidth 120 +--linebreaks lf +--wraparguments preserve +--wraptypealiases preserve +--wrapparameters preserve +--wrapcollections preserve + +# Braces +--allman false +--elseposition same-line + +# Comments +--stripunusedargs closure-only + +# Imports +--importgrouping testable-bottom + +# Other formatting options +--redundanttype inferred +--closingparen balanced +--commas inline +--trimwhitespace always +--insertlines enabled +--removelines enabled +--emptybraces no-space + +# Disable certain rules that conflict with team preferences +--disable redundantSelf \ No newline at end of file diff --git a/.swiftlint.yml b/.swiftlint.yml new file mode 100644 index 0000000..3628e13 --- /dev/null +++ b/.swiftlint.yml @@ -0,0 +1,92 @@ +# SwiftLint Configuration for Menu-Whisper + +# 
Paths to include/exclude +included: + - Sources + - Tests + +excluded: + - Pods + - .build + - DerivedData + +# Rules configuration +disabled_rules: + - trailing_comma + - todo + - force_cast + - force_try + +opt_in_rules: + - empty_count + - empty_string + - contains_over_first_not_nil + - closure_spacing + - multiline_function_chains + - multiline_literal_brackets + - multiline_parameters + - operator_usage_whitespace + - overridden_super_call + - private_outlet + - prohibited_super_call + - redundant_nil_coalescing + - switch_case_alignment + - unneeded_parentheses_in_closure_argument + - vertical_parameter_alignment_on_call + +# Line length +line_length: + warning: 120 + error: 140 + +# File length +file_length: + warning: 500 + error: 800 + +# Function length +function_body_length: + warning: 50 + error: 100 + +# Type body length +type_body_length: + warning: 300 + error: 400 + +# Cyclomatic complexity +cyclomatic_complexity: + warning: 10 + error: 20 + +# Nesting depth +nesting: + type_level: 3 + statement_level: 5 + +# Large tuple +large_tuple: + warning: 3 + error: 4 + +# Identifier names +identifier_name: + min_length: + warning: 2 + excluded: + - id + - URL + - url + - x + - y + - z + +# Custom rules +custom_rules: + no_print: + name: "No Print Statements" + regex: "print\\(" + message: "Use Logger instead of print statements" + severity: warning + +reporter: "xcode" \ No newline at end of file diff --git a/Docs/ARCHITECTURE.md b/Docs/ARCHITECTURE.md new file mode 100644 index 0000000..a4990d6 --- /dev/null +++ b/Docs/ARCHITECTURE.md @@ -0,0 +1,243 @@ +# Architecture — Menu-Whisper + +This document describes the high-level architecture and module organization for Menu-Whisper, a macOS offline speech-to-text application. + +## Overview + +Menu-Whisper follows a modular architecture with clear separation of concerns between UI, audio processing, speech recognition, text injection, and system integration components. 
+ +## System Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ App Layer │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │ +│ │ MenuBarExtra │ │ HUD Panel │ │ Preferences │ │ +│ │ (SwiftUI) │ │ (SwiftUI) │ │ (SwiftUI) │ │ +│ └─────────────────┘ └─────────────────┘ └─────────────┘ │ +└─────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Core Modules │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │ +│ │ Audio │ │ STT │ │ Injection │ │ +│ │ AVAudioEngine │ │ whisper.cpp │ │ Clipboard │ │ +│ │ RMS/Peak │ │ Core ML │ │ Typing │ │ +│ └─────────────────┘ └─────────────────┘ └─────────────┘ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │ +│ │ Models │ │ Permissions │ │ Settings │ │ +│ │ Management │ │ Microphone │ │ UserDefaults│ │ +│ │ Downloads │ │ Accessibility │ │ JSON Export │ │ +│ └─────────────────┘ └─────────────────┘ └─────────────┘ │ +└─────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ System Integration │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │ +│ │ Global Hotkeys │ │ Secure Input │ │ Utils │ │ +│ │ Carbon │ │ Detection │ │ Helpers │ │ +│ │ RegisterHotKey │ │ CGEvent API │ │ │ │ +│ └─────────────────┘ └─────────────────┘ └─────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +## Module Descriptions + +### App Layer +- **MenuBarExtra**: SwiftUI-based menu bar interface using `MenuBarExtra` for macOS 13+ +- **HUD Panel**: Non-activating NSPanel for "Listening" and "Processing" states +- **Preferences**: Settings window with model management, hotkey configuration, etc. 
+ +### Core Modules + +#### Core/Audio +**Purpose**: Audio capture and real-time processing +- AVAudioEngine integration for microphone input +- Real-time RMS/peak computation for visual feedback +- Audio format conversion (16kHz mono PCM for STT) +- Dictation time limits and session management + +#### Core/STT +**Purpose**: Speech-to-text processing with multiple backends +- **WhisperCPP**: Primary backend using whisper.cpp with Metal acceleration +- **CoreML**: Future backend for Core ML models (Phase 6) +- `STTEngine` protocol for backend abstraction +- Language detection and text normalization + +#### Core/Models +**Purpose**: Model catalog, downloads, and management +- Curated model catalog (JSON-based) +- Download management with progress tracking +- SHA256 verification and integrity checks +- Local storage in `~/Library/Application Support/MenuWhisper/Models` +- Model selection and metadata management + +#### Core/Injection +**Purpose**: Text insertion into focused applications +- Clipboard-based insertion (preferred method) +- Character-by-character typing fallback +- Secure Input detection and handling +- Cross-application compatibility layer + +#### Core/Permissions +**Purpose**: System permission management and onboarding +- Microphone access (AVCaptureDevice authorization) +- Accessibility permissions for text injection +- Input Monitoring permissions for global hotkeys +- Permission status checking and guidance flows + +#### Core/Settings +**Purpose**: User preferences and configuration persistence +- UserDefaults-based storage +- JSON export/import functionality +- Settings validation and migration +- Configuration change notifications + +### System Integration + +#### Global Hotkeys +- Carbon framework integration (`RegisterEventHotKey`) +- Push-to-talk and toggle modes +- Hotkey conflict detection and user guidance +- Cross-application hotkey handling + +#### Secure Input Detection +- `IsSecureEventInputEnabled()` monitoring +- Safe fallback behavior 
(clipboard-only) +- User notification for secure contexts + +#### Utils +- Shared utilities and helper functions +- Logging infrastructure (opt-in local logs) +- Error handling and user feedback + +## Data Flow + +### Main Operational Flow +``` +User Hotkey → Audio Capture → STT Processing → Text Injection + ▲ │ │ │ + │ ▼ ▼ ▼ + Hotkey Mgr Audio Buffer Model Engine Injection Mgr + │ RMS/Peak whisper.cpp Clipboard/Type + │ │ │ │ + ▼ ▼ ▼ ▼ + HUD UI Visual Feedback Processing UI Target App +``` + +### State Management +The application follows a finite state machine pattern: +- **Idle**: Waiting for user input +- **Listening**: Capturing audio with visual feedback +- **Processing**: Running STT inference +- **Injecting**: Inserting text into target application +- **Error**: Handling and displaying errors + +## Finite State Machine + +``` + ┌─────────────┐ + │ Idle │◄─────────────┐ + └─────────────┘ │ + │ │ + │ Hotkey Press │ Success/Error + ▼ │ + ┌─────────────┐ │ + │ Listening │ │ + └─────────────┘ │ + │ │ + │ Stop/Timeout │ + ▼ │ + ┌─────────────┐ │ + │ Processing │ │ + └─────────────┘ │ + │ │ + │ STT Complete │ + ▼ │ + ┌─────────────┐ │ + │ Injecting │──────────────┘ + └─────────────┘ +``` + +## Technology Stack + +### Core Technologies +- **Swift 5.9+**: Primary development language +- **SwiftUI**: User interface framework +- **AppKit**: macOS-specific UI components (NSStatusItem, NSPanel) +- **AVFoundation**: Audio capture and processing +- **Carbon**: Global hotkey registration + +### External Dependencies +- **whisper.cpp**: C/C++ speech recognition engine with Metal support +- **Swift Package Manager**: Dependency management and build system + +### Platform Integration +- **UserDefaults**: Settings persistence +- **NSPasteboard**: Clipboard operations +- **CGEvent**: Low-level input simulation +- **URLSession**: Model downloads + +## Build System + +The project uses Swift Package Manager with modular targets: + +``` +MenuWhisper/ +├── Package.swift # SPM 
configuration +├── Sources/ +│ ├── App/ # Main application target +│ ├── CoreAudio/ # Audio processing module +│ ├── CoreSTT/ # Speech-to-text engines +│ ├── CoreModels/ # Model management +│ ├── CoreInjection/ # Text insertion +│ ├── CorePermissions/ # System permissions +│ ├── CoreSettings/ # User preferences +│ └── CoreUtils/ # Shared utilities +├── Resources/ # Assets, localizations +└── Tests/ # Unit and integration tests +``` + +## Security Considerations + +### Privacy +- All audio processing occurs locally +- No telemetry or data collection +- Optional local logging with user consent + +### System Security +- Respects Secure Input contexts +- Requires explicit user permission grants +- Code signing and notarization for distribution + +### Input Safety +- Validates all user inputs +- Safe handling of special characters in typing mode +- Proper escaping for different keyboard layouts + +## Performance Characteristics + +### Target Metrics +- **Latency**: <4s additional processing time for 10s audio (M1 + small model) +- **Memory**: ~1.5-2.5GB with small model +- **Model Loading**: Lazy loading with warm cache +- **UI Responsiveness**: Non-blocking background processing + +### Optimization Strategies +- Metal acceleration for STT inference +- Efficient audio buffering and streaming +- Model reuse across dictation sessions +- Configurable threading for CPU-intensive operations + +## Future Extensibility + +The modular architecture supports future enhancements: +- Additional STT backends (Core ML, cloud services) +- Voice Activity Detection (VAD) +- Advanced audio preprocessing +- Custom insertion rules per application +- Plugin architecture for text processing + +This architecture provides a solid foundation for the MVP while maintaining flexibility for future feature additions and platform evolution. 
\ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e27a1fe --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Menu-Whisper + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/Package.swift b/Package.swift new file mode 100644 index 0000000..93c9dd1 --- /dev/null +++ b/Package.swift @@ -0,0 +1,123 @@ +// swift-tools-version: 5.9 +import PackageDescription + +let package = Package( + name: "MenuWhisper", + platforms: [ + .macOS(.v13) + ], + products: [ + .executable( + name: "MenuWhisper", + targets: ["App"] + ) + ], + dependencies: [ + // Add external dependencies here as needed + // Example: .package(url: "...", from: "1.0.0") + ], + targets: [ + // Main Application Target + .executableTarget( + name: "App", + dependencies: [ + "MenuWhisperAudio", + "CoreSTT", + "CoreModels", + "CoreInjection", + "CorePermissions", + "CoreSettings", + "CoreUtils" + ], + path: "Sources/App", + resources: [ + .copy("../../Resources") + ] + ), + + // Core Module Targets + .target( + name: "MenuWhisperAudio", + dependencies: ["CoreUtils"], + path: "Sources/CoreAudio" + ), + + .target( + name: "CoreSTT", + dependencies: ["CoreUtils", "CoreModels", "MenuWhisperAudio"], + path: "Sources/CoreSTT" + ), + + .target( + name: "CoreModels", + dependencies: ["CoreUtils"], + path: "Sources/CoreModels" + ), + + .target( + name: "CoreInjection", + dependencies: ["CoreUtils"], + path: "Sources/CoreInjection" + ), + + .target( + name: "CorePermissions", + dependencies: ["CoreUtils"], + path: "Sources/CorePermissions" + ), + + .target( + name: "CoreSettings", + dependencies: ["CoreUtils"], + path: "Sources/CoreSettings" + ), + + .target( + name: "CoreUtils", + path: "Sources/CoreUtils" + ), + + // Test Targets + .testTarget( + name: "MenuWhisperAudioTests", + dependencies: ["MenuWhisperAudio"], + path: "Tests/CoreAudioTests" + ), + + .testTarget( + name: "CoreSTTTests", + dependencies: ["CoreSTT"], + path: "Tests/CoreSTTTests" + ), + + .testTarget( + name: "CoreModelsTests", + dependencies: ["CoreModels"], + path: "Tests/CoreModelsTests" + ), + + .testTarget( + name: "CoreInjectionTests", + dependencies: ["CoreInjection"], + path: 
"Tests/CoreInjectionTests" + ), + + .testTarget( + name: "CorePermissionsTests", + dependencies: ["CorePermissions"], + path: "Tests/CorePermissionsTests" + ), + + .testTarget( + name: "CoreSettingsTests", + dependencies: ["CoreSettings"], + path: "Tests/CoreSettingsTests" + ), + + .testTarget( + name: "CoreUtilsTests", + dependencies: ["CoreUtils"], + path: "Tests/CoreUtilsTests" + ) + ] +) \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..69f4426 --- /dev/null +++ b/README.md @@ -0,0 +1,96 @@ +# Menu-Whisper + +A macOS menu bar application that provides offline speech-to-text transcription using Whisper-family models and automatically inserts the transcribed text into the currently focused application. + +## Overview + +Menu-Whisper is designed to be a privacy-focused, offline-first speech recognition tool for macOS. It runs entirely locally on Apple Silicon machines, requiring no internet connection during normal operation (only for initial model downloads). 
+ +### Key Features + +- **100% Offline Operation**: Audio and text never leave your device +- **Apple Silicon Optimized**: Built specifically for M1/M2/M3 processors with Metal acceleration +- **Global Hotkey Support**: Default ⌘⇧V (configurable) +- **Smart Text Insertion**: Clipboard paste with typing fallback +- **Secure Input Detection**: Respects password fields and secure contexts +- **Multiple Models**: Support for various Whisper model sizes and variants +- **Multilingual**: Spanish and English interface and recognition + +## Requirements + +- **macOS**: 13.0 (Ventura) or later +- **Hardware**: Apple Silicon (M1, M2, or M3 processor) - Intel Macs are not supported +- **Xcode**: 15.0+ for building from source +- **Permissions**: Microphone, Accessibility, and Input Monitoring access + +## Build Requirements + +### Development Environment +- macOS 13+ with Xcode 15.0+ +- Swift 5.9+ +- Swift Package Manager (included with Xcode) + +### System Dependencies +- AVFoundation framework (audio capture) +- Carbon framework (global hotkeys) +- AppKit/SwiftUI (UI components) + +### Third-party Dependencies +- whisper.cpp (C/C++ library for speech recognition with Metal support) + +## Installation + +**Note**: This project is currently in development. Pre-built binaries will be available as signed and notarized .dmg files once complete. + +### Building from Source + +1. Clone the repository: + ```bash + git clone + cd tellme + ``` + +2. Open the project in Xcode or use Swift Package Manager: + ```bash + swift build -c release + ``` + +3. For development, open `Package.swift` in Xcode. 
+ +## Architecture + +The application is structured with modular components: +- **App**: SwiftUI interface with AppKit bridges +- **Core/Audio**: AVAudioEngine capture and processing +- **Core/STT**: Speech-to-text engines (whisper.cpp, future Core ML) +- **Core/Models**: Model management and downloads +- **Core/Injection**: Text insertion with secure input handling +- **Core/Permissions**: System permission management +- **Core/Settings**: User preferences and configuration + +## Privacy & Security + +- **No Telemetry**: Zero data collection or remote analytics +- **Local Processing**: All audio processing happens on-device +- **Secure Input Respect**: Automatically detects and respects secure input contexts +- **Permission-Based**: Requires explicit user consent for system access + +## Development Status + +This project is currently in active development following a phased approach: +- Phase 0: Project scaffolding ⬅️ **Current** +- Phase 1: Hotkey + HUD + Audio capture +- Phase 2: STT integration with whisper.cpp +- Phase 3: Text insertion system +- Phase 4: Preferences and UX polish +- Phase 5: Distribution and packaging + +See `TODO.md` for detailed development progress and `TECHSPEC.md` for complete technical specifications. + +## License + +MIT License - see [LICENSE](LICENSE) for details. + +## Contributing + +This project follows a structured development approach with clear phases and acceptance criteria. Please refer to the technical specification and TODO list before contributing. 
\ No newline at end of file diff --git a/Resources/Localizations/en.lproj/Localizable.strings b/Resources/Localizations/en.lproj/Localizable.strings new file mode 100644 index 0000000..13d40ad --- /dev/null +++ b/Resources/Localizations/en.lproj/Localizable.strings @@ -0,0 +1,77 @@ +/* Menu-Whisper - English Localization */ + +/* General */ +"app.name" = "Menu-Whisper"; +"general.ok" = "OK"; +"general.cancel" = "Cancel"; +"general.continue" = "Continue"; +"general.settings" = "Settings"; +"general.quit" = "Quit"; + +/* Menu Bar */ +"menubar.idle" = "Idle"; +"menubar.listening" = "Listening"; +"menubar.processing" = "Processing"; +"menubar.preferences" = "Preferences..."; +"menubar.quit" = "Quit Menu-Whisper"; + +/* HUD States */ +"hud.listening" = "Listening..."; +"hud.processing" = "Transcribing..."; +"hud.cancel" = "Press Esc to cancel"; + +/* Permissions */ +"permissions.microphone.title" = "Microphone Access Required"; +"permissions.microphone.message" = "Menu-Whisper needs access to your microphone to perform speech-to-text transcription."; +"permissions.accessibility.title" = "Accessibility Access Required"; +"permissions.accessibility.message" = "Menu-Whisper needs Accessibility access to insert transcribed text into applications."; +"permissions.input_monitoring.title" = "Input Monitoring Required"; +"permissions.input_monitoring.message" = "Menu-Whisper needs Input Monitoring access to register global hotkeys."; +"permissions.open_settings" = "Open System Settings"; + +/* Preferences Window */ +"preferences.title" = "Menu-Whisper Preferences"; +"preferences.general" = "General"; +"preferences.models" = "Models"; +"preferences.hotkeys" = "Hotkeys"; +"preferences.insertion" = "Text Insertion"; +"preferences.advanced" = "Advanced"; + +/* General Preferences */ +"preferences.general.hotkey" = "Global Hotkey:"; +"preferences.general.mode" = "Activation Mode:"; +"preferences.general.mode.push_to_talk" = "Push-to-talk"; +"preferences.general.mode.toggle" = 
"Toggle"; +"preferences.general.sounds" = "Play sounds for start/stop"; +"preferences.general.limit" = "Dictation time limit (minutes):"; + +/* Model Preferences */ +"preferences.models.title" = "Speech Recognition Models"; +"preferences.models.active" = "Active Model:"; +"preferences.models.language" = "Language:"; +"preferences.models.language.auto" = "Auto-detect"; +"preferences.models.download" = "Download"; +"preferences.models.delete" = "Delete"; +"preferences.models.size" = "Size:"; +"preferences.models.languages" = "Languages:"; + +/* Insertion Preferences */ +"preferences.insertion.method" = "Insertion Method:"; +"preferences.insertion.method.paste" = "Paste (⌘V)"; +"preferences.insertion.method.type" = "Type characters"; +"preferences.insertion.preview" = "Show preview before inserting"; +"preferences.insertion.secure_input" = "Secure Input Detected"; +"preferences.insertion.secure_input.message" = "Text insertion is disabled in secure contexts. Text has been copied to clipboard."; + +/* Errors */ +"error.audio.failed" = "Failed to access microphone"; +"error.model.not_found" = "Speech recognition model not found"; +"error.model.load_failed" = "Failed to load speech recognition model"; +"error.transcription.failed" = "Speech transcription failed"; +"error.download.failed" = "Model download failed"; +"error.download.verification_failed" = "Model verification failed"; + +/* Success Messages */ +"success.model.downloaded" = "Model downloaded successfully"; +"success.settings.exported" = "Settings exported successfully"; +"success.settings.imported" = "Settings imported successfully"; \ No newline at end of file diff --git a/Resources/Localizations/es.lproj/Localizable.strings b/Resources/Localizations/es.lproj/Localizable.strings new file mode 100644 index 0000000..2c873d4 --- /dev/null +++ b/Resources/Localizations/es.lproj/Localizable.strings @@ -0,0 +1,77 @@ +/* Menu-Whisper - Spanish Localization */ + +/* General */ +"app.name" = "Menu-Whisper"; 
+"general.ok" = "Aceptar"; +"general.cancel" = "Cancelar"; +"general.continue" = "Continuar"; +"general.settings" = "Configuración"; +"general.quit" = "Salir"; + +/* Menu Bar */ +"menubar.idle" = "Inactivo"; +"menubar.listening" = "Escuchando"; +"menubar.processing" = "Procesando"; +"menubar.preferences" = "Preferencias..."; +"menubar.quit" = "Salir de Menu-Whisper"; + +/* HUD States */ +"hud.listening" = "Escuchando..."; +"hud.processing" = "Transcribiendo..."; +"hud.cancel" = "Presiona Esc para cancelar"; + +/* Permissions */ +"permissions.microphone.title" = "Acceso al Micrófono Requerido"; +"permissions.microphone.message" = "Menu-Whisper necesita acceso a tu micrófono para realizar la transcripción de voz a texto."; +"permissions.accessibility.title" = "Acceso de Accesibilidad Requerido"; +"permissions.accessibility.message" = "Menu-Whisper necesita acceso de Accesibilidad para insertar texto transcrito en aplicaciones."; +"permissions.input_monitoring.title" = "Monitoreo de Entrada Requerido"; +"permissions.input_monitoring.message" = "Menu-Whisper necesita acceso de Monitoreo de Entrada para registrar atajos de teclado globales."; +"permissions.open_settings" = "Abrir Configuración del Sistema"; + +/* Preferences Window */ +"preferences.title" = "Preferencias de Menu-Whisper"; +"preferences.general" = "General"; +"preferences.models" = "Modelos"; +"preferences.hotkeys" = "Atajos"; +"preferences.insertion" = "Inserción de Texto"; +"preferences.advanced" = "Avanzado"; + +/* General Preferences */ +"preferences.general.hotkey" = "Atajo Global:"; +"preferences.general.mode" = "Modo de Activación:"; +"preferences.general.mode.push_to_talk" = "Presionar para hablar"; +"preferences.general.mode.toggle" = "Alternar"; +"preferences.general.sounds" = "Reproducir sonidos al iniciar/detener"; +"preferences.general.limit" = "Límite de tiempo de dictado (minutos):"; + +/* Model Preferences */ +"preferences.models.title" = "Modelos de Reconocimiento de Voz"; 
+"preferences.models.active" = "Modelo Activo:"; +"preferences.models.language" = "Idioma:"; +"preferences.models.language.auto" = "Detección automática"; +"preferences.models.download" = "Descargar"; +"preferences.models.delete" = "Eliminar"; +"preferences.models.size" = "Tamaño:"; +"preferences.models.languages" = "Idiomas:"; + +/* Insertion Preferences */ +"preferences.insertion.method" = "Método de Inserción:"; +"preferences.insertion.method.paste" = "Pegar (⌘V)"; +"preferences.insertion.method.type" = "Escribir caracteres"; +"preferences.insertion.preview" = "Mostrar vista previa antes de insertar"; +"preferences.insertion.secure_input" = "Entrada Segura Detectada"; +"preferences.insertion.secure_input.message" = "La inserción de texto está deshabilitada en contextos seguros. El texto se ha copiado al portapapeles."; + +/* Errors */ +"error.audio.failed" = "Error al acceder al micrófono"; +"error.model.not_found" = "Modelo de reconocimiento de voz no encontrado"; +"error.model.load_failed" = "Error al cargar el modelo de reconocimiento de voz"; +"error.transcription.failed" = "Error en la transcripción de voz"; +"error.download.failed" = "Error en la descarga del modelo"; +"error.download.verification_failed" = "Error en la verificación del modelo"; + +/* Success Messages */ +"success.model.downloaded" = "Modelo descargado exitosamente"; +"success.settings.exported" = "Configuración exportada exitosamente"; +"success.settings.imported" = "Configuración importada exitosamente"; \ No newline at end of file diff --git a/Scripts/build.sh b/Scripts/build.sh new file mode 100755 index 0000000..530cc11 --- /dev/null +++ b/Scripts/build.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Build script for Menu-Whisper +# This script builds the project using Swift Package Manager + +set -e + +echo "🔨 Building Menu-Whisper..." + +# Clean previous build +echo "🧹 Cleaning previous build..." +swift package clean + +# Build in release mode +echo "⚡ Building in release mode..." 
+swift build -c release + +# Run tests +echo "🧪 Running tests..." +swift test + +# Check if SwiftFormat is available and run it +if command -v swiftformat >/dev/null 2>&1; then + echo "📝 Checking code formatting..." + swiftformat --lint . +else + echo "⚠️ SwiftFormat not available, skipping format check" +fi + +# Check if SwiftLint is available and run it +if command -v swiftlint >/dev/null 2>&1; then + echo "🔍 Running SwiftLint..." + swiftlint +else + echo "⚠️ SwiftLint not available, skipping lint check" +fi + +echo "✅ Build completed successfully!" \ No newline at end of file diff --git a/Scripts/notarize.sh b/Scripts/notarize.sh new file mode 100755 index 0000000..f9a23e2 --- /dev/null +++ b/Scripts/notarize.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Notarization script for Menu-Whisper +# This is a placeholder script that will be completed in Phase 5 + +set -e + +echo "🍎 Menu-Whisper Notarization Script" +echo "📋 This script will handle code signing and notarization for distribution" +echo "" +echo "⚠️ This is a placeholder script - implementation pending Phase 5" +echo "" +echo "📝 Steps that will be implemented:" +echo " 1. Code signing with Developer ID" +echo " 2. Creating .app bundle" +echo " 3. Notarization with Apple" +echo " 4. Stapling notarization ticket" +echo " 5. 
Creating .dmg for distribution" +echo "" +echo "🔧 Usage (when implemented):" +echo " ./Scripts/notarize.sh [--developer-id YOUR_TEAM_ID]" +echo "" + +# Placeholder for future implementation +# TODO: Implement in Phase 5 +# - Set up code signing identity +# - Configure entitlements +# - Build .app bundle +# - Submit for notarization +# - Wait for approval +# - Staple ticket +# - Create DMG + +echo "❌ Not implemented yet - use in Phase 5" +exit 1 \ No newline at end of file diff --git a/Sources/App/main.swift b/Sources/App/main.swift new file mode 100644 index 0000000..6f3323f --- /dev/null +++ b/Sources/App/main.swift @@ -0,0 +1,18 @@ +import SwiftUI + +@main +struct MenuWhisperApp: App { + var body: some Scene { + MenuBarExtra("Menu-Whisper", systemImage: "mic") { + Text("Menu-Whisper") + Text("Idle") + Divider() + Button("Preferences...") { + // TODO: Open preferences + } + Button("Quit") { + NSApplication.shared.terminate(nil) + } + } + } +} \ No newline at end of file diff --git a/Sources/CoreAudio/AudioEngine.swift b/Sources/CoreAudio/AudioEngine.swift new file mode 100644 index 0000000..e0f2622 --- /dev/null +++ b/Sources/CoreAudio/AudioEngine.swift @@ -0,0 +1,42 @@ +import Foundation +import AVFoundation +import CoreUtils + +public protocol AudioEngineDelegate: AnyObject { + func audioEngine(_ engine: AudioEngine, didUpdateLevel level: Float) + func audioEngine(_ engine: AudioEngine, didCaptureAudio data: Data) + func audioEngineDidStartCapture(_ engine: AudioEngine) + func audioEngineDidStopCapture(_ engine: AudioEngine) +} + +public class AudioEngine: ObservableObject { + private let logger = Logger(category: "AudioEngine") + private let audioEngine = AVAudioEngine() + + public weak var delegate: AudioEngineDelegate? 
+ + @Published public private(set) var isCapturing = false + @Published public private(set) var currentLevel: Float = 0.0 + + public init() { + // Audio engine initialization will be completed in Phase 1 + } + + public func startCapture() throws { + logger.info("Starting audio capture") + // TODO: Implement in Phase 1 + isCapturing = true + delegate?.audioEngineDidStartCapture(self) + } + + public func stopCapture() { + logger.info("Stopping audio capture") + // TODO: Implement in Phase 1 + isCapturing = false + delegate?.audioEngineDidStopCapture(self) + } + + private func processAudioBuffer(_ buffer: AVAudioPCMBuffer) { + // TODO: Implement RMS calculation and audio processing in Phase 1 + } +} \ No newline at end of file diff --git a/Sources/CoreInjection/TextInjector.swift b/Sources/CoreInjection/TextInjector.swift new file mode 100644 index 0000000..44fed81 --- /dev/null +++ b/Sources/CoreInjection/TextInjector.swift @@ -0,0 +1,73 @@ +import Foundation +import AppKit +import CoreUtils + +public enum InjectionMethod { + case paste + case typing +} + +public enum InjectionError: Error, LocalizedError { + case secureInputActive + case accessibilityPermissionRequired + case injectionFailed(String) + + public var errorDescription: String? 
{ + switch self { + case .secureInputActive: + return NSLocalizedString("preferences.insertion.secure_input.message", comment: "Secure input message") + case .accessibilityPermissionRequired: + return NSLocalizedString("permissions.accessibility.message", comment: "Accessibility permission message") + case .injectionFailed(let reason): + return "Text injection failed: \(reason)" + } + } +} + +public class TextInjector { + private let logger = Logger(category: "TextInjector") + + public init() {} + + public func injectText(_ text: String, method: InjectionMethod = .paste) throws { + logger.info("Injecting text using method: \(method)") + + // Check for secure input first + if isSecureInputActive() { + // Copy to clipboard but don't inject + copyToClipboard(text) + throw InjectionError.secureInputActive + } + + switch method { + case .paste: + try injectViaPaste(text) + case .typing: + try injectViaTyping(text) + } + } + + private func injectViaPaste(_ text: String) throws { + logger.debug("Injecting text via paste method") + // TODO: Implement paste injection (clipboard + ⌘V) in Phase 3 + copyToClipboard(text) + // TODO: Send ⌘V via CGEvent + } + + private func injectViaTyping(_ text: String) throws { + logger.debug("Injecting text via typing method") + // TODO: Implement character-by-character typing via CGEvent in Phase 3 + } + + private func copyToClipboard(_ text: String) { + let pasteboard = NSPasteboard.general + pasteboard.clearContents() + pasteboard.setString(text, forType: .string) + logger.debug("Text copied to clipboard") + } + + private func isSecureInputActive() -> Bool { + // TODO: Implement IsSecureEventInputEnabled() check in Phase 3 + return false + } +} \ No newline at end of file diff --git a/Sources/CoreModels/ModelManager.swift b/Sources/CoreModels/ModelManager.swift new file mode 100644 index 0000000..9ae9f7d --- /dev/null +++ b/Sources/CoreModels/ModelManager.swift @@ -0,0 +1,70 @@ +import Foundation +import CoreUtils + +public struct 
ModelInfo: Codable, Identifiable { + public let id = UUID() + public let name: String + public let family: String + public let format: String + public let sizeMB: Int + public let languages: [String] + public let recommendedBackend: String + public let qualityTier: String + public let license: String + public let sha256: String + public let downloadURL: String + public let notes: String + + enum CodingKeys: String, CodingKey { + case name, family, format, languages, license, sha256, notes + case sizeMB = "size_mb" + case recommendedBackend = "recommended_backend" + case qualityTier = "quality_tier" + case downloadURL = "download_url" + } +} + +public class ModelManager: ObservableObject { + private let logger = Logger(category: "ModelManager") + + @Published public private(set) var availableModels: [ModelInfo] = [] + @Published public private(set) var downloadedModels: [ModelInfo] = [] + @Published public private(set) var activeModel: ModelInfo? + + private let modelsDirectory: URL + + public init() { + let appSupport = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! + modelsDirectory = appSupport.appendingPathComponent("MenuWhisper/Models") + + try? 
FileManager.default.createDirectory(at: modelsDirectory, withIntermediateDirectories: true) + loadModelCatalog() + refreshDownloadedModels() + } + + public func downloadModel(_ model: ModelInfo) async throws { + logger.info("Starting download for model: \(model.name)") + // TODO: Implement model download with progress tracking and SHA256 verification in Phase 2 + } + + public func deleteModel(_ model: ModelInfo) throws { + logger.info("Deleting model: \(model.name)") + // TODO: Implement model deletion in Phase 2 + } + + public func setActiveModel(_ model: ModelInfo) { + logger.info("Setting active model: \(model.name)") + activeModel = model + // TODO: Persist active model selection in Phase 2 + } + + private func loadModelCatalog() { + // TODO: Load curated model catalog from bundled JSON in Phase 2 + logger.info("Loading model catalog") + } + + private func refreshDownloadedModels() { + // TODO: Scan models directory and populate downloadedModels in Phase 2 + logger.info("Refreshing downloaded models") + } +} \ No newline at end of file diff --git a/Sources/CorePermissions/PermissionManager.swift b/Sources/CorePermissions/PermissionManager.swift new file mode 100644 index 0000000..ece6620 --- /dev/null +++ b/Sources/CorePermissions/PermissionManager.swift @@ -0,0 +1,111 @@ +import Foundation +import AVFoundation +import AppKit +import CoreUtils + +public enum PermissionType: CaseIterable { + case microphone + case accessibility + case inputMonitoring +} + +public enum PermissionStatus { + case notDetermined + case granted + case denied + case restricted +} + +public class PermissionManager: ObservableObject { + private let logger = Logger(category: "PermissionManager") + + @Published public private(set) var microphoneStatus: PermissionStatus = .notDetermined + @Published public private(set) var accessibilityStatus: PermissionStatus = .notDetermined + @Published public private(set) var inputMonitoringStatus: PermissionStatus = .notDetermined + + public init() { + 
refreshAllPermissions() + } + + /// Returns the microphone authorization, showing the system prompt only while it is still undetermined. + /// The result is also published to `microphoneStatus` on the main actor before returning. + public func requestMicrophonePermission() async -> PermissionStatus { + logger.info("Requesting microphone permission") + let status: PermissionStatus + switch AVCaptureDevice.authorizationStatus(for: .audio) { + case .authorized: + status = .granted + case .denied: + status = .denied + case .restricted: + status = .restricted + case .notDetermined: + // Suspends until the user answers the system prompt. + status = await AVCaptureDevice.requestAccess(for: .audio) ? .granted : .denied + @unknown default: + status = .notDetermined + } + // Publish on every path, not only after a prompt. + await MainActor.run { self.microphoneStatus = status } + return status + } + + public func requestAccessibilityPermission() { + logger.info("Requesting accessibility permission") + // TODO: Implement accessibility permission request in Phase 1 + // This typically involves guiding the user to System Settings + } + + public func requestInputMonitoringPermission() { + logger.info("Requesting input monitoring permission") + // TODO: Implement input monitoring permission request in Phase 1 + // This typically involves guiding the user to System Settings + } + + public func openSystemSettings(for permission: PermissionType) { + logger.info("Opening system settings for permission: \(permission)") + + let urlString: String + switch permission { + case .microphone: + urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_Microphone" + case .accessibility: + urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility" + case .inputMonitoring: + urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_ListenEvent" + } + + if let url = URL(string: urlString) { + NSWorkspace.shared.open(url) + } + } + + private func refreshAllPermissions() { + refreshMicrophonePermission() + refreshAccessibilityPermission() + refreshInputMonitoringPermission() + } + + /// Mirrors the current microphone authorization into `microphoneStatus` without prompting the user. + private 
func refreshMicrophonePermission() { + switch AVCaptureDevice.authorizationStatus(for: .audio) { + // `.restricted` gets its own status instead of being folded into `.denied`. + case .authorized: microphoneStatus = .granted + case .denied: microphoneStatus = .denied + case .restricted: microphoneStatus = .restricted + case .notDetermined: microphoneStatus = .notDetermined + // Cases added by future SDKs are treated as not yet determined. + @unknown default: microphoneStatus = .notDetermined + } + } + + private func refreshAccessibilityPermission() { + // TODO: Implement accessibility permission check in Phase 1 + accessibilityStatus = .notDetermined + } + + private func refreshInputMonitoringPermission() { + // TODO: Implement input monitoring permission check in Phase 1 + inputMonitoringStatus = .notDetermined + } +} \ No newline at end of file diff --git a/Sources/CoreSTT/STTEngine.swift b/Sources/CoreSTT/STTEngine.swift new file mode 100644 index 0000000..ae83914 --- /dev/null +++ b/Sources/CoreSTT/STTEngine.swift @@ -0,0 +1,32 @@ +import Foundation +import CoreUtils + +public protocol STTEngine { + func transcribe(audioData: Data, language: String?) async throws -> String + func isModelLoaded() -> Bool + func loadModel(at path: URL) async throws + func unloadModel() +} + +public enum STTError: Error, LocalizedError { + case modelNotFound + case modelLoadFailed(String) + case transcriptionFailed(String) + case unsupportedFormat + case invalidAudioData + + public var errorDescription: String? 
{ + switch self { + case .modelNotFound: + return NSLocalizedString("error.model.not_found", comment: "Model not found error") + case .modelLoadFailed(let reason): + return NSLocalizedString("error.model.load_failed", comment: "Model load failed error") + ": \(reason)" + case .transcriptionFailed(let reason): + return NSLocalizedString("error.transcription.failed", comment: "Transcription failed error") + ": \(reason)" + case .unsupportedFormat: + return "Unsupported audio format" + case .invalidAudioData: + return "Invalid audio data" + } + } +} \ No newline at end of file diff --git a/Sources/CoreSTT/WhisperCPP/WhisperCPPEngine.swift b/Sources/CoreSTT/WhisperCPP/WhisperCPPEngine.swift new file mode 100644 index 0000000..870cf1a --- /dev/null +++ b/Sources/CoreSTT/WhisperCPP/WhisperCPPEngine.swift @@ -0,0 +1,35 @@ +import Foundation +import CoreUtils + +public class WhisperCPPEngine: STTEngine { + private let logger = Logger(category: "WhisperCPPEngine") + private var modelPath: URL? + private var isLoaded = false + + public init() { + // WhisperCPP integration will be implemented in Phase 2 + } + + public func transcribe(audioData: Data, language: String?) 
async throws -> String { + logger.info("Transcribing audio data") + // TODO: Implement whisper.cpp integration in Phase 2 + throw STTError.transcriptionFailed("Not implemented yet") + } + + public func isModelLoaded() -> Bool { + return isLoaded + } + + public func loadModel(at path: URL) async throws { + logger.info("Loading model at path: \(path.path)") + self.modelPath = path + // TODO: Implement model loading in Phase 2 + isLoaded = true + } + + public func unloadModel() { + logger.info("Unloading model") + modelPath = nil + isLoaded = false + } +} \ No newline at end of file diff --git a/Sources/CoreSettings/Settings.swift b/Sources/CoreSettings/Settings.swift new file mode 100644 index 0000000..be04b8a --- /dev/null +++ b/Sources/CoreSettings/Settings.swift @@ -0,0 +1,149 @@ +import Foundation +import CoreUtils + +public enum HotkeyMode: String, CaseIterable, Codable { + case pushToTalk = "push_to_talk" + case toggle = "toggle" + + public var displayName: String { + switch self { + case .pushToTalk: + return NSLocalizedString("preferences.general.mode.push_to_talk", comment: "Push to talk mode") + case .toggle: + return NSLocalizedString("preferences.general.mode.toggle", comment: "Toggle mode") + } + } +} + +public struct HotkeyConfig: Codable { + public let keyCode: UInt32 + public let modifiers: UInt32 + + public init(keyCode: UInt32, modifiers: UInt32) { + self.keyCode = keyCode + self.modifiers = modifiers + } + + // Default to ⌘⇧V + public static let `default` = HotkeyConfig(keyCode: 9, modifiers: 768) // V key with Cmd+Shift +} + +public class Settings: ObservableObject { + private let logger = Logger(category: "Settings") + private let userDefaults = UserDefaults.standard + + // General Settings + @Published public var hotkey: HotkeyConfig { + didSet { saveHotkey() } + } + + @Published public var hotkeyMode: HotkeyMode { + didSet { saveHotkeyMode() } + } + + @Published public var playSounds: Bool { + didSet { userDefaults.set(playSounds, forKey: 
"playSounds") } + } + + @Published public var dictationTimeLimit: TimeInterval { + didSet { userDefaults.set(dictationTimeLimit, forKey: "dictationTimeLimit") } + } + + // Model Settings + @Published public var activeModelName: String? { + didSet { userDefaults.set(activeModelName, forKey: "activeModelName") } + } + + @Published public var forcedLanguage: String? { + didSet { userDefaults.set(forcedLanguage, forKey: "forcedLanguage") } + } + + // Insertion Settings + @Published public var insertionMethod: String { + didSet { userDefaults.set(insertionMethod, forKey: "insertionMethod") } + } + + @Published public var showPreview: Bool { + didSet { userDefaults.set(showPreview, forKey: "showPreview") } + } + + public init() { + // Load settings from UserDefaults + self.hotkey = Settings.loadHotkey() + self.hotkeyMode = HotkeyMode(rawValue: userDefaults.string(forKey: "hotkeyMode") ?? "") ?? .pushToTalk + self.playSounds = userDefaults.object(forKey: "playSounds") as? Bool ?? false + self.dictationTimeLimit = userDefaults.object(forKey: "dictationTimeLimit") as? TimeInterval ?? 600 // 10 minutes + self.activeModelName = userDefaults.string(forKey: "activeModelName") + self.forcedLanguage = userDefaults.string(forKey: "forcedLanguage") + self.insertionMethod = userDefaults.string(forKey: "insertionMethod") ?? "paste" + self.showPreview = userDefaults.object(forKey: "showPreview") as? Bool ?? 
false + + logger.info("Settings initialized") + } + + public func exportSettings() throws -> Data { + let settingsDict: [String: Any] = [ + "hotkeyKeyCode": hotkey.keyCode, + "hotkeyModifiers": hotkey.modifiers, + "hotkeyMode": hotkeyMode.rawValue, + "playSounds": playSounds, + "dictationTimeLimit": dictationTimeLimit, + "activeModelName": activeModelName as Any, + "forcedLanguage": forcedLanguage as Any, + "insertionMethod": insertionMethod, + "showPreview": showPreview + ] + + return try JSONSerialization.data(withJSONObject: settingsDict, options: .prettyPrinted) + } + + /// Applies settings from JSON as written by `exportSettings()`; throws if the data is not a JSON object. + public func importSettings(from data: Data) throws { + guard let settingsDict = try JSONSerialization.jsonObject(with: data) as? [String: Any] else { throw CocoaError(.fileReadCorruptFile) } + if let keyCode = settingsDict["hotkeyKeyCode"] as? UInt32, + let modifiers = settingsDict["hotkeyModifiers"] as? UInt32 { + hotkey = HotkeyConfig(keyCode: keyCode, modifiers: modifiers) + } + + if let modeString = settingsDict["hotkeyMode"] as? String, + let mode = HotkeyMode(rawValue: modeString) { + hotkeyMode = mode + } + + if let sounds = settingsDict["playSounds"] as? Bool { + playSounds = sounds + } + + if let timeLimit = settingsDict["dictationTimeLimit"] as? TimeInterval { + dictationTimeLimit = timeLimit + } + + activeModelName = settingsDict["activeModelName"] as? String + forcedLanguage = settingsDict["forcedLanguage"] as? String + + if let method = settingsDict["insertionMethod"] as? String { + insertionMethod = method + } + + if let preview = settingsDict["showPreview"] as? Bool { + showPreview = preview + } + + logger.info("Settings imported successfully") + } + + private static func loadHotkey() -> HotkeyConfig { + let keyCode = UserDefaults.standard.object(forKey: "hotkeyKeyCode") as? UInt32 ?? HotkeyConfig.default.keyCode + let modifiers = UserDefaults.standard.object(forKey: "hotkeyModifiers") as? UInt32 ?? 
HotkeyConfig.default.modifiers + return HotkeyConfig(keyCode: keyCode, modifiers: modifiers) + } + + private func saveHotkey() { + userDefaults.set(hotkey.keyCode, forKey: "hotkeyKeyCode") + userDefaults.set(hotkey.modifiers, forKey: "hotkeyModifiers") + } + + private func saveHotkeyMode() { + userDefaults.set(hotkeyMode.rawValue, forKey: "hotkeyMode") + } +} \ No newline at end of file diff --git a/Sources/CoreUtils/AppState.swift b/Sources/CoreUtils/AppState.swift new file mode 100644 index 0000000..5dee8d9 --- /dev/null +++ b/Sources/CoreUtils/AppState.swift @@ -0,0 +1,24 @@ +import Foundation + +public enum AppState: String, CaseIterable { + case idle = "idle" + case listening = "listening" + case processing = "processing" + case injecting = "injecting" + case error = "error" + + public var displayName: String { + switch self { + case .idle: + return NSLocalizedString("menubar.idle", comment: "Idle state") + case .listening: + return NSLocalizedString("menubar.listening", comment: "Listening state") + case .processing: + return NSLocalizedString("menubar.processing", comment: "Processing state") + case .injecting: + return "Injecting" // Not shown in menu bar + case .error: + return "Error" // Not shown in menu bar + } + } +} \ No newline at end of file diff --git a/Sources/CoreUtils/Logger.swift b/Sources/CoreUtils/Logger.swift new file mode 100644 index 0000000..4f25faa --- /dev/null +++ b/Sources/CoreUtils/Logger.swift @@ -0,0 +1,51 @@ +import Foundation +import os.log + +public enum LogLevel: String, CaseIterable { + case debug = "DEBUG" + case info = "INFO" + case warning = "WARNING" + case error = "ERROR" +} + +public class Logger { + private let osLog: OSLog + private let category: String + + public init(category: String) { + self.category = category + self.osLog = OSLog(subsystem: "com.menuwhisper.app", category: category) + } + + public func debug(_ message: String, file: String = #file, function: String = #function, line: Int = #line) { + log(level: 
.debug, message: message, file: file, function: function, line: line) + } + + public func info(_ message: String, file: String = #file, function: String = #function, line: Int = #line) { + log(level: .info, message: message, file: file, function: function, line: line) + } + + public func warning(_ message: String, file: String = #file, function: String = #function, line: Int = #line) { + log(level: .warning, message: message, file: file, function: function, line: line) + } + + public func error(_ message: String, file: String = #file, function: String = #function, line: Int = #line) { + log(level: .error, message: message, file: file, function: function, line: line) + } + + private func log(level: LogLevel, message: String, file: String, function: String, line: Int) { + let fileName = URL(fileURLWithPath: file).lastPathComponent + let logMessage = "[\(category)] \(message) (\(fileName):\(function):\(line))" + + switch level { + case .debug: + os_log("%{public}@", log: osLog, type: .debug, logMessage) + case .info: + os_log("%{public}@", log: osLog, type: .info, logMessage) + case .warning: + os_log("%{public}@", log: osLog, type: .default, logMessage) + case .error: + os_log("%{public}@", log: osLog, type: .error, logMessage) + } + } +} \ No newline at end of file diff --git a/TECHSPEC.md b/TECHSPEC.md new file mode 100644 index 0000000..fccbb7d --- /dev/null +++ b/TECHSPEC.md @@ -0,0 +1,335 @@ +# Technical Definition — “Menu-Whisper” (macOS, Swift, Offline STT) + +## 0) Owner Decisions (Locked) +- **Platform:** Apple Silicon only (M1/M2/M3), macOS 13+. +- **STT backends:** Start with **whisper.cpp (Metal)** for simplicity; add **Core ML** backend later. +- **Models:** Do **not** auto-download. On first run, user **chooses & downloads** a model. +- **VAD:** Post-MVP. +- **Insertion behavior:** Configurable; **direct insertion** is default (no preview). +- **Default hotkey:** **⌘⇧V** (user-configurable). 
+- **Punctuation:** Let the model handle punctuation automatically (no spoken commands). +- **Privacy/Connectivity:** 100% local at runtime; model downloads only when the user explicitly requests. **No telemetry**. +- **Distribution:** **.app/.dmg** (signed + notarized), outside the Mac App Store initially. +- **UI languages:** **ES/EN**. +- **Low-power mode:** Still allow downloads if the user starts them. +- **License:** **MIT**. +- **Per-dictation limit:** **10 minutes** by default (configurable). + +--- + +## 1) Goal +A **menu bar** app for macOS that performs **offline speech-to-text** using Whisper-family models and **inserts the transcribed text** into whichever app currently has focus. Shows a minimal **HUD** while listening and processing. No internet required during normal operation. + +--- + +## 2) MVP Scope +- Persistent **menu bar** item (NSStatusItem / `MenuBarExtra`). +- **Global hotkey** (push-to-talk and toggle modes). +- **HUD** (centered NSPanel + SwiftUI): + - “Listening” with audio-level animation (RMS/peak). + - “Processing” with a spinner/animation. +- **Offline STT** with **whisper.cpp** (GGUF models; Metal acceleration on Apple Silicon). +- **Model Manager**: curated list, manual download with progress + SHA256 check, user selection. +- **Text injection**: + - Preferred: **Clipboard + ⌘V** paste. + - Fallback: **simulated typing** via CGEvent. + - If **Secure Input** is active, **do not inject**; show notice and keep text on clipboard. +- **Preferences**: hotkey & mode, model & language, insertion method, HUD styling, sounds, dictation limit. +- **Permissions onboarding**: Microphone, Accessibility, Input Monitoring. + +--- + +## 3) Functional Requirements + +### 3.1 Capture +- Prompt for permissions on first use. +- Global hotkey (default ⌘⇧V). +- **Push-to-talk**: start on key down, stop on key up. +- **Toggle**: press to start, press again to stop. +- Per-dictation limit (default 10 min, range 10 s–30 min). 
+ +### 3.2 HUD / UX +- Non-activating, centered **NSPanel** (~320×160), no focus stealing. +- **Listening**: bar-style audio visualization driven by live RMS/peak. +- **Processing**: spinner + “Transcribing…” label. +- **Esc** to cancel. +- Optional start/stop sounds (user-toggleable). + +### 3.3 STT +- Backend A (MVP): **whisper.cpp** with **GGUF** and **Metal**. +- Language: auto-detect or forced (persisted). +- Basic text normalization; punctuation from the model. +- UTF-8 output; standard replacements (quotes, dashes, etc.). + +### 3.4 Injection +- Preferred method: **NSPasteboard** + **CGEvent** to send ⌘V. +- Fallback: **CGEventCreateKeyboardEvent** (character-by-character), respecting active keyboard layout. +- **Secure Input**: detect with `IsSecureEventInputEnabled()`; if enabled, **do not inject**. Show a non-intrusive notice and leave the text on the clipboard. + +### 3.5 Preferences +- **General:** hotkey + mode (push/toggle), sounds, HUD options. +- **Models:** catalog, download, select active model, language, local storage path. +- **Insertion:** direct vs preview (preview **off** by default), paste vs type. +- **Advanced:** limits, performance knobs (threads/batch), **local** logs opt-in. + +--- + +## 4) Non-Functional Requirements +- **Offline** execution after models are installed. +- **Latency target** (M1 + “small” model): < 4 s for 10 s of audio. +- **Memory target:** ~1.5–2.5 GB with “small”. +- **Privacy:** audio and text never leave the device. +- **Accessibility:** sufficient contrast; VoiceOver labels; focus never stolen by HUD. + +--- + +## 5) Architecture (High-Level) +- **App (SwiftUI)** with AppKit bridges for NSStatusItem and NSPanel. +- **Shortcut Manager** (Carbon `RegisterEventHotKey` or HotKey/MASShortcut). +- **Audio**: AVAudioEngine (downsample to 16 kHz mono, 16-bit PCM). +- **STT Engine**: + - **whisper.cpp** (C/C++ via SPM/CMake) with Metal. + - **Core ML backend** (e.g., WhisperKit / custom) in a later phase. 
+- **Model Manager**: curated catalog, downloads (progress + SHA256), selection, caching. +- **Text Injection**: pasteboard + CGEvent; typing fallback; Secure Input detection. +- **Permissions Manager**: guided flows to System Settings panes. +- **Settings**: UserDefaults + JSON export/import. +- **Packaging**: .app + .dmg (signed & notarized). + +--- + +## 6) Main Flow +1. User presses global hotkey. +2. Check permissions; guide if missing. +3. Show HUD → **Listening**; start capture. +4. Stop (key up/toggle/timeout). +5. HUD → **Processing**; run STT in background. +6. On result → (optional preview) → **insert** (paste) or **fallback** (type). If Secure Input, **do not inject**; keep in clipboard + show notice. +7. Close HUD → **Idle**. + +--- + +## 7) Finite State Machine (FSM) +- **Idle** → (Hotkey) → **Listening** +- **Listening** → (Stop/Timeout) → **Processing** +- **Processing** → (Done) → **Injecting** +- **Injecting** → (Done) → **Idle** +- Any → (Error) → **ErrorModal** → **Idle** + +--- + +## 8) Model Management (Manual Downloads) +**Goal:** Offer a clear list of **free** Whisper-family models (names, sizes, languages, recommended backend) with one-click downloads. No automatic downloads. + +### 8.1 OpenAI Whisper (official weights) +- Families: **tiny**, **base**, **small**, **medium**, **large-v2**, **large-v3** (multilingual; some `.en` variants). +- Usable with **whisper.cpp** via **GGUF** (community conversions widely available). + +### 8.2 Whisper for whisper.cpp (converted GGUF) +- Community-maintained conversions for whisper.cpp (GGUF), optimized for CPU/GPU Metal on macOS. + +### 8.3 Faster-Whisper (CTranslate2) +- Optimized variants (tiny/base/small/medium/large-v2/large-v3). Useful if a CT2-based or Core-ML-assisted backend is added later. + +### 8.4 Distil-Whisper (distilled) +- Distilled models (e.g., **distil-large-v2/v3/v3.5**, **distil-small.en**), significantly smaller/faster with near-large accuracy. 
+ +> **UI must show:** model file size, languages, license, **RAM estimate**, and a warning if a large model is selected on lower-memory machines. + +**Optional JSON Schema for catalog entries (for the app’s first-run picker):** + +```json +{ + "name": "whisper-small", + "family": "OpenAI-Whisper", + "format": "gguf", + "size_mb": 466, + "languages": ["multilingual"], + "recommended_backend": "whisper.cpp", + "quality_tier": "small", + "license": "MIT", + "sha256": "…", + "download_url": "…", + "notes": "Good balance of speed/accuracy on M1/M2." +} +``` + +--- + +## 9) Security & Permissions + +* **Info.plist:** `NSMicrophoneUsageDescription`. +* **Accessibility & Input Monitoring:** required for CGEvent; provide clear step-by-step guidance and deep-links. +* **Secure Input:** check `IsSecureEventInputEnabled()`; **never** attempt to bypass. Provide help text to identify apps that enable it (password fields, 2FA prompts, etc.). + +--- + +## 10) Performance + +* Lazy-load and reuse model (warm cache). +* Real-time downsampling to 16 kHz mono; chunked streaming into backend. +* Configurable threads; prefer **Metal** path on Apple Silicon. +* “Fast path” tweaks for short clips (<15 s). + +--- + +## 11) Logging & Privacy + +* **No remote telemetry.** +* Local logs **opt-in** (timings, errors only). Never store audio/text unless user explicitly enables a debug flag. +* “Wipe local data” button (models remain unless the user removes them). + +--- + +## 12) Internationalization + +* UI in **Spanish** and **English** (Localizable.strings). +* STT multilingual; language auto or forced per user preference. + +--- + +## 13) Testing (Minimum) + +* macOS 13/14/15 on M1/M2/M3. +* Injection works in Safari, Chrome, Notes, VS Code, Terminal, iTerm2, Mail. +* **Secure Input**: correctly detected; no injection; clipboard + notice. +* Meet latency target with **small** model on M1. +* Model download & selection flows (simulate network errors). 
+ +--- + +## 14) Phased Plan (AI-Deliverables) + +### Phase 0 — Scaffolding (MVP-0) + +**Goal:** Base project + menubar. +**Deliverables:** + +* SwiftUI app with `MenuBarExtra`, microphone icon, “Idle” state. +* `ARCHITECTURE.md` describing modules (Audio/STT/Injection/Models/Permissions/Settings). +* Build scripts and signing/notarization templates. + **DoD:** Compiles; menu bar item visible; SPM structure ready. + +--- + +### Phase 1 — Hotkey + HUD + Audio (MVP-1) + +**Goal:** Listening UX without real STT. +**Deliverables:** + +* Global hotkey (default ⌘⇧V) with **push** and **toggle**. +* NSPanel HUD (Listening/Processing) + **real** RMS bars from AVAudioEngine. +* Per-dictation limit (default 10 min). + **DoD:** Live meter responds to mic; correct state transitions. + +--- + +### Phase 2 — STT via whisper.cpp (MVP-2) + +**Goal:** Real offline transcription. +**Deliverables:** + +* **whisper.cpp** module (C/C++), background inference with **Metal**. +* **Model Manager** (curated list, download with SHA256, selection). +* Language auto/forced; basic normalization. + **DoD:** 10-second clip → coherent ES/EN text offline; meets timing targets. + +--- + +### Phase 3 — Robust Insertion (MVP-3) + +**Goal:** Reliable insertion into focused app. +**Deliverables:** + +* Paste (clipboard + ⌘V) and typing fallback. +* **Secure Input** detection; safe behavior (no injection, clipboard + notice). + **DoD:** Works across target apps; correct Secure Input handling. + +--- + +### Phase 4 — Preferences + UX Polish (MVP-4) + +**Goal:** Complete options & stability. +**Deliverables:** + +* Full Preferences (hotkey, modes, model, language, insertion, HUD, sounds). +* Optional preview dialog (off by default). +* Config export/import (JSON). + **DoD:** All settings persist and are honored. + +--- + +### Phase 5 — Distribution (MVP-5) + +**Goal:** Installable package. +**Deliverables:** + +* Error handling; permission prompts & help (incl. Secure Input troubleshooting). 
+* **.dmg** (signed + notarized) and install guide. +* **USER\_GUIDE.md** + **TROUBLESHOOTING.md**. + **DoD:** Clean install on test machines; distribution checklist passed. + +--- + +### Phase 6 — Core ML Backend (Post-MVP) + +**Goal:** Second backend. +**Deliverables:** + +* **Core ML** integration (e.g., WhisperKit or custom conversion). +* Backend selector (whisper.cpp/Core ML) in Preferences; local benchmarks table. + **DoD:** Feature parity and stability; documented pros/cons. + +--- + +## 15) Mini-Prompts for the Builder AI (per Phase) + +* **P0:** “Create macOS 13+ SwiftUI menubar app (`MenuBarExtra`), microphone icon, SPM layout with modules in `ARCHITECTURE.md`.” +* **P1:** “Add global hotkey (push & toggle) with `RegisterEventHotKey`; NSPanel HUD with RMS bars from AVAudioEngine; 10-minute dictation limit.” +* **P2:** “Integrate **whisper.cpp** (Metal); add Model Manager (curated list, SHA256-verified downloads, selection); language auto/forced; transcribe WAV 16 kHz mono.” +* **P3:** “Implement insertion: pasteboard+⌘V and CGEvent typing fallback; detect `IsSecureEventInputEnabled()` and avoid injection.” +* **P4:** “Implement full Preferences, optional preview, JSON export/import; UX polish and messages.” +* **P5:** “Signing + notarization; produce .dmg; write USER\_GUIDE and TROUBLESHOOTING (with Secure Input section).” +* **P6:** “Add Core ML backend (WhisperKit/custom), backend selector, and local benchmarks.” + +--- + +## 16) Suggested Repo Layout + +``` +MenuWhisper/ + Sources/ + App/ # SwiftUI + AppKit bridges + Core/ + Audio/ # AVAudioEngine capture + meters + STT/ + WhisperCPP/ # C/C++ wrapper + Metal path + CoreML/ # post-MVP + Models/ # catalog, downloads, hashes + Injection/ # clipboard, CGEvent typing, secure input checks + Permissions/ + Settings/ + Utils/ + Resources/ # icons, sounds, localizations + Docs/ # ARCHITECTURE.md, USER_GUIDE.md, TROUBLESHOOTING.md + Scripts/ # build, sign, notarize + Tests/ # unit + integration +``` + +--- + 
+## 17) Risks & Mitigations + +* **Hotkey collision (⌘⇧V)** with “Paste and Match Style” in some apps → make it discoverable & easily rebindable; warn on conflict. +* **Secure Input** blocks injection → inform the user, keep text on clipboard, provide help to identify the app enabling it. +* **RAM/latency** with large models → recommend **small/base** by default; show RAM/latency hints in the model picker. +* **Keyboard layouts** → prefer paste; if typing, map using the active layout. + +--- + +## 18) Global MVP Definition of Done + +* A 30–90 s dictation yields accurate ES/EN text **offline** and inserts correctly in common apps. +* Secure Input is correctly detected and handled. +* Model download/selection is robust and user-driven. +* Shippable **.dmg** (signed + notarized) and clear docs included. diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..5a33d82 --- /dev/null +++ b/TODO.md @@ -0,0 +1,223 @@ +```markdown +# TODO — Menu-Whisper (macOS, Swift, Offline STT) + +This file tracks the tasks needed to deliver the app in **phases** with clear acceptance checks. +Conventions: +- `[ ]` = to do, `[x]` = done +- **AC** = Acceptance Criteria +- All features must work **offline** after models are installed. + +--- + +## Global / Project-Wide + +- [x] Set project license to **MIT** and add `LICENSE` file. +- [x] Add `README.md` with high-level summary, build requirements (Xcode, macOS 13+), Apple Silicon-only note. +- [x] Add `Docs/ARCHITECTURE.md` skeleton (to be filled in Phase 0). +- [x] Create base **localization** scaffolding (`en.lproj`, `es.lproj`) with `Localizable.strings`. +- [x] Add SwiftPM structure with separate targets for `App`, `Core/*` modules. +- [x] Prepare optional tooling: + - [x] SwiftFormat / SwiftLint config (opt-in). + - [x] GitHub Actions macOS runner for **build-only** CI (optional). + +--- + +## Phase 0 — Scaffolding (MVP-0) + +**Goal:** Base project + menu bar item; structure and docs. 
+ +### Tasks +- [x] Create SwiftUI macOS app (macOS 13+) with `MenuBarExtra` / `NSStatusItem`. +- [x] Add placeholder mic icon (template asset). +- [x] Create module targets: + - [x] `Core/Audio` + - [x] `Core/STT` (with subfolders `WhisperCPP` and `CoreML` (stub)) + - [x] `Core/Models` + - [x] `Core/Injection` + - [x] `Core/Permissions` + - [x] `Core/Settings` + - [x] `Core/Utils` +- [x] Wire a minimal state machine: `Idle` state shown in menubar menu. +- [x] Add scripts: + - [x] `Scripts/build.sh` (SPM/Xcodebuild) + - [x] `Scripts/notarize.sh` (stub with placeholders for later) +- [x] Write `Docs/ARCHITECTURE.md` (modules, data flow, FSM diagram). + +### AC +- [x] Project compiles and shows a **menu bar** icon with a basic menu. +- [x] Repo has clear structure and architecture doc. + +--- + +## Phase 1 — Hotkey + HUD + Audio (MVP-1) + +**Goal:** Listening UX without real STT. + +### Tasks +- [ ] Implement **global hotkey** manager: + - [ ] Default **⌘⇧V** (configurable later). + - [ ] Support **push-to-talk** (start on key down, stop on key up). + - [ ] Support **toggle** (press to start, press to stop). +- [ ] Create **HUD** as non-activating centered `NSPanel`: + - [ ] State **Listening** with **RMS/peak bars** animation (SwiftUI view). + - [ ] State **Processing** with spinner/label. + - [ ] Dismiss/cancel with **Esc**. +- [ ] Implement **AVAudioEngine** capture: + - [ ] Tap on input bus; compute RMS/peak for visualization. + - [ ] Resample path ready for 16 kHz mono PCM (no STT yet). +- [ ] Add dictation **time limit** (default **10 min**, configurable later). +- [ ] Optional **sounds** for start/stop (toggle in settings later). +- [ ] Permissions onboarding: + - [ ] Request **Microphone** permission with Info.plist string. + - [ ] Show guide for **Accessibility** and **Input Monitoring** (no hard gating yet). + +### AC +- [ ] Hotkey works in both modes (push/toggle) across desktop & full-screen apps. 
+- [ ] HUD appears centered; **Listening** shows live bars; **Processing** shows spinner. +- [ ] Cancel (Esc) reliably stops listening and hides HUD. + +--- + +## Phase 2 — STT via whisper.cpp (MVP-2) + +**Goal:** Real offline transcription (Apple Silicon + Metal). + +### Tasks +- [ ] Add **whisper.cpp** integration: + - [ ] Vendor/SwiftPM/Wrapper target for C/C++. + - [ ] Build with **Metal** path enabled on Apple Silicon. + - [ ] Define `STTEngine` protocol and `WhisperCPPEngine` implementation. +- [ ] Audio pipeline: + - [ ] Convert captured audio to **16 kHz mono** 16-bit PCM. + - [ ] Chunking/streaming into STT worker; end-of-dictation triggers transcription. +- [ ] **Model Manager** (backend + minimal UI): + - [ ] Bundle a **curated JSON catalog** (name, size, languages, license, URL, SHA256). + - [ ] Download via `URLSession` with progress + resume support. + - [ ] Validate **SHA256**; store under `~/Library/Application Support/MenuWhisper/Models`. + - [ ] Allow **select active model**; persist selection. + - [ ] Language: **auto** or **forced** (persist). +- [ ] Text normalization pass (basic replacements; punctuation from model). +- [ ] Error handling (network failures, disk full, missing model). +- [ ] Performance knobs (threads, GPU toggle if exposed by backend). + +### AC +- [ ] A **10 s** clip produces coherent **ES/EN** text **offline**. +- [ ] Latency target: **< 4 s** additional for 10 s clip on M1 with **small** model. +- [ ] Memory: ~**1.5–2.5 GB** with small model without leaks. +- [ ] Model download: progress UI + SHA256 verification + selection works. + +--- + +## Phase 3 — Robust Text Insertion (MVP-3) + +**Goal:** Insert text into focused app safely; handle Secure Input. + +### Tasks +- [ ] Implement **Paste** method: + - [ ] Put text on **NSPasteboard** (general). + - [ ] Send **⌘V** via CGEvent to focused app. +- [ ] Implement **Typing** fallback: + - [ ] Generate per-character **CGEvent**; respect active keyboard layout.
+ - [ ] Handle `\n`, `\t`, and common Unicode safely. +- [ ] Detect **Secure Input**: + - [ ] Use `IsSecureEventInputEnabled()` (or accepted API) check before injection. + - [ ] If enabled: **do not inject**; keep text on clipboard; show non-blocking notice. +- [ ] Add preference for **insertion method** (Paste preferred) + fallback strategy. +- [ ] Add **Permissions** helpers for Accessibility/Input Monitoring (deep links). +- [ ] Compatibility tests: Safari, Chrome, Notes, VS Code, Terminal, iTerm2, Mail. + +### AC +- [ ] Text reliably appears in the currently focused app via Paste. +- [ ] If Paste is blocked, Typing fallback works (except in Secure Input). +- [ ] When **Secure Input** is active: no injection occurs; clipboard contains the text; user is informed. + +--- + +## Phase 4 — Preferences + UX Polish (MVP-4) + +**Goal:** Complete options, localization, and stability. + +### Tasks +- [ ] Full **Preferences** window: + - [ ] Hotkey recorder (change ⌘⇧V if needed). + - [ ] Mode: Push-to-talk / Toggle. + - [ ] Model picker: list, **download**, **delete**, **set active**, show size/language/license. + - [ ] Language: Auto / Forced (dropdown). + - [ ] Insertion: **Direct** (default) vs **Preview**; Paste vs Typing preference. + - [ ] HUD: opacity/size, show/hide sounds toggles. + - [ ] Dictation limit: editable (default 10 min). + - [ ] Advanced: threads/batch; **local logs opt-in**. + - [ ] **Export/Import** settings (JSON). +- [ ] Implement **Preview** dialog (off by default): shows transcribed text with **Insert** / **Cancel**. +- [ ] Expand **localization** (ES/EN) for all UI strings. +- [ ] Onboarding & help views (permissions, Secure Input explanation). +- [ ] Persist all settings in `UserDefaults`; validate on load; migrate if needed. +- [ ] UX polish: icons, animation timing, keyboard navigation, VoiceOver labels. +- [ ] Optional: internal **timing instrumentation** (guarded by logs opt-in).
+ +### AC +- [ ] All preferences persist and take effect without relaunch. +- [ ] Preview (when enabled) allows quick edit & insertion. +- [ ] ES/EN localization passes a manual spot-check. + +--- + +## Phase 5 — Distribution (MVP-5) + +**Goal:** Shippable, signed/notarized .dmg, user docs. + +### Tasks +- [ ] Hardened runtime, entitlements, Info.plist: + - [ ] `NSMicrophoneUsageDescription` + - [ ] Review for any additional required entitlements. +- [ ] **Code signing** with Developer ID; set team identifiers. +- [ ] **Notarization** using `notarytool`; **staple** on success. +- [ ] Build **.app** and create **.dmg**: + - [ ] DMG background, /Applications symlink, icon. +- [ ] Write **Docs/USER_GUIDE.md** (first run, downloading models, dictation flow). +- [ ] Write **Docs/TROUBLESHOOTING.md** (permissions, Secure Input, model space/RAM issues). +- [ ] QA matrix: + - [ ] macOS **13/14/15**, Apple Silicon **M1/M2/M3**. + - [ ] Target apps list (insertion works). + - [ ] Offline check (network disabled). +- [ ] Prepare **VERSIONING** notes and changelog (semantic-ish). + +### AC +- [ ] Signed & **notarized** .dmg installs cleanly. +- [ ] App functions **entirely offline** post-model download. +- [ ] Guides are complete and reference all common pitfalls. + +--- + +## Phase 6 — Core ML Backend (Post-MVP) + +**Goal:** Second STT backend and selector. + +### Tasks +- [ ] Evaluate **Core ML** path (e.g., WhisperKit or custom Core ML models). +- [ ] Implement `STTEngineCoreML` conforming to `STTEngine` protocol. +- [ ] Backend **selector** in Preferences; runtime switching. +- [ ] Ensure **feature parity** (language settings, output normalization). +- [ ] **Benchmarks**: produce local latency/memory table across small/base/medium. +- [ ] Errors & fallbacks (if model missing, surface helpful guidance). + +### AC +- [ ] Both backends run on Apple Silicon; user can switch backends. +- [ ] Comparable outputs; documented pros/cons and performance data. 
+ +--- + +## Backlog / Post-MVP Options + +- [ ] **VAD (WebRTC)**: auto-stop on silence with thresholds. +- [ ] **Continuous dictation** with smart segmentation. +- [ ] **Noise suppression** and AGC in the audio pipeline. +- [ ] **Login item** (auto-launch at login). +- [ ] **Sparkle** or custom updater (if desirable outside App Store). +- [ ] **Settings profiles** (per-language/model presets). +- [ ] **In-app model catalog refresh** (remote JSON update). +- [ ] **Advanced insertion rules** (per-app behavior). +- [ ] **Analytics viewer** for local logs (no telemetry). + +--- +``` diff --git a/Tests/CoreAudioTests/AudioEngineTests.swift b/Tests/CoreAudioTests/AudioEngineTests.swift new file mode 100644 index 0000000..c91416e --- /dev/null +++ b/Tests/CoreAudioTests/AudioEngineTests.swift @@ -0,0 +1,10 @@ +import XCTest +@testable import MenuWhisperAudio /* NOTE(review): this module is named MenuWhisperAudio while the sibling test files import Core-prefixed modules; presumably chosen to avoid clashing with Apple's CoreAudio framework. Confirm against Package.swift. */ + +/** Smoke test for AudioEngine: a newly constructed engine must report isCapturing as false. */ final class AudioEngineTests: XCTestCase { + func testAudioEngineInitialization() { + let engine = AudioEngine() + XCTAssertNotNil(engine) /* NOTE(review): this assertion cannot fail if AudioEngine.init is non-failable; confirm. */ + XCTAssertFalse(engine.isCapturing) + } +} \ No newline at end of file diff --git a/Tests/CoreInjectionTests/TextInjectorTests.swift b/Tests/CoreInjectionTests/TextInjectorTests.swift new file mode 100644 index 0000000..d117cab --- /dev/null +++ b/Tests/CoreInjectionTests/TextInjectorTests.swift @@ -0,0 +1,9 @@ +import XCTest +@testable import CoreInjection + +/** Construction smoke test for TextInjector. The only assertion is a non-nil check on the new instance. NOTE(review): that check cannot fail if the initializer is non-failable; confirm and consider asserting observable behavior. */ final class TextInjectorTests: XCTestCase { + func testTextInjectorInitialization() { + let injector = TextInjector() + XCTAssertNotNil(injector) + } +} \ No newline at end of file diff --git a/Tests/CoreModelsTests/ModelManagerTests.swift b/Tests/CoreModelsTests/ModelManagerTests.swift new file mode 100644 index 0000000..fb98010 --- /dev/null +++ b/Tests/CoreModelsTests/ModelManagerTests.swift @@ -0,0 +1,10 @@ +import XCTest +@testable import CoreModels + +/** Smoke test for ModelManager: a newly constructed manager must expose an empty availableModels collection. */ final class ModelManagerTests: XCTestCase { + func testModelManagerInitialization() { + let manager = ModelManager() + XCTAssertNotNil(manager) + 
XCTAssertEqual(manager.availableModels.count, 0) /* A fresh manager is expected to list no models. */ + } +} \ No newline at end of file diff --git a/Tests/CorePermissionsTests/PermissionManagerTests.swift b/Tests/CorePermissionsTests/PermissionManagerTests.swift new file mode 100644 index 0000000..6d10a0c --- /dev/null +++ b/Tests/CorePermissionsTests/PermissionManagerTests.swift @@ -0,0 +1,9 @@ +import XCTest +@testable import CorePermissions + +/** Construction smoke test for PermissionManager. The only assertion is a non-nil check on the new instance. NOTE(review): that check cannot fail if the initializer is non-failable; confirm and consider asserting observable behavior. */ final class PermissionManagerTests: XCTestCase { + func testPermissionManagerInitialization() { + let manager = PermissionManager() + XCTAssertNotNil(manager) + } +} \ No newline at end of file diff --git a/Tests/CoreSTTTests/STTEngineTests.swift b/Tests/CoreSTTTests/STTEngineTests.swift new file mode 100644 index 0000000..0694682 --- /dev/null +++ b/Tests/CoreSTTTests/STTEngineTests.swift @@ -0,0 +1,10 @@ +import XCTest +@testable import CoreSTT + +/** Smoke test for WhisperCPPEngine: a newly constructed engine must report that no model is loaded. */ final class STTEngineTests: XCTestCase { + func testWhisperCPPEngineInitialization() { + let engine = WhisperCPPEngine() + XCTAssertNotNil(engine) + XCTAssertFalse(engine.isModelLoaded()) /* No model has been loaded by this test, so the engine must report false. */ + } +} \ No newline at end of file diff --git a/Tests/CoreSettingsTests/SettingsTests.swift b/Tests/CoreSettingsTests/SettingsTests.swift new file mode 100644 index 0000000..e34160f --- /dev/null +++ b/Tests/CoreSettingsTests/SettingsTests.swift @@ -0,0 +1,10 @@ +import XCTest +@testable import CoreSettings + +/** Smoke test for Settings: a newly constructed instance must have hotkeyMode equal to pushToTalk. NOTE(review): if Settings reads persisted values on init, this result may depend on stored state; confirm. */ final class SettingsTests: XCTestCase { + func testSettingsInitialization() { + let settings = Settings() + XCTAssertNotNil(settings) + XCTAssertEqual(settings.hotkeyMode, .pushToTalk) + } +} \ No newline at end of file diff --git a/Tests/CoreUtilsTests/LoggerTests.swift b/Tests/CoreUtilsTests/LoggerTests.swift new file mode 100644 index 0000000..10b7422 --- /dev/null +++ b/Tests/CoreUtilsTests/LoggerTests.swift @@ -0,0 +1,17 @@ +import XCTest +@testable import CoreUtils + +/** Smoke tests for Logger. Neither test makes an assertion: each only constructs a Logger with category Test and calls its logging methods, so a test passes whenever those calls return. */ final class LoggerTests: XCTestCase { + /* Constructs a logger and emits one info-level message. */ func testLoggerInitialization() { + let logger = Logger(category: "Test") + logger.info("Test message") + } + + /* Calls debug, info, warning and error once each on the same logger. */ func 
testLoggerLevels() { + let logger = Logger(category: "Test") + logger.debug("Debug message") + logger.info("Info message") + logger.warning("Warning message") + logger.error("Error message") + } +} \ No newline at end of file