Initial commit
This commit is contained in:
commit
1db16227b2
31 changed files with 2175 additions and 0 deletions
57
.github/workflows/build.yml
vendored
Normal file
57
.github/workflows/build.yml
vendored
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
name: Build
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [ main, develop ]
|
||||||
|
pull_request:
|
||||||
|
branches: [ main ]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
name: Build Menu-Whisper
|
||||||
|
runs-on: macos-13
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Select Xcode version
|
||||||
|
run: sudo xcode-select -s /Applications/Xcode_15.0.app/Contents/Developer
|
||||||
|
|
||||||
|
- name: Show Xcode version
|
||||||
|
run: xcodebuild -version
|
||||||
|
|
||||||
|
- name: Show Swift version
|
||||||
|
run: swift --version
|
||||||
|
|
||||||
|
- name: Cache Swift Package Manager
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: .build
|
||||||
|
key: ${{ runner.os }}-spm-${{ hashFiles('**/Package.resolved') }}
|
||||||
|
restore-keys: |
|
||||||
|
${{ runner.os }}-spm-
|
||||||
|
|
||||||
|
- name: Build with Swift Package Manager
|
||||||
|
run: swift build -c release
|
||||||
|
|
||||||
|
- name: Run tests
|
||||||
|
run: swift test
|
||||||
|
|
||||||
|
- name: Check code formatting (SwiftFormat)
|
||||||
|
run: |
|
||||||
|
# Run SwiftFormat only if it is already installed on the runner (this step does not install it)
|
||||||
|
if command -v swiftformat >/dev/null 2>&1; then
|
||||||
|
swiftformat --lint .
|
||||||
|
else
|
||||||
|
echo "SwiftFormat not available, skipping format check"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Run SwiftLint
|
||||||
|
run: |
|
||||||
|
# Run SwiftLint only if it is already installed on the runner (this step does not install it)
|
||||||
|
if command -v swiftlint >/dev/null 2>&1; then
|
||||||
|
swiftlint
|
||||||
|
else
|
||||||
|
echo "SwiftLint not available, skipping lint check"
|
||||||
|
fi
|
||||||
37
.gitignore
vendored
Normal file
37
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
# Build artifacts
|
||||||
|
.build/
|
||||||
|
build/
|
||||||
|
DerivedData/
|
||||||
|
|
||||||
|
# Swift Package Manager
|
||||||
|
.swiftpm/
|
||||||
|
Package.resolved
|
||||||
|
|
||||||
|
# Xcode
|
||||||
|
*.xcodeproj/
|
||||||
|
*.xcworkspace/
|
||||||
|
xcuserdata/
|
||||||
|
*.xccheckout
|
||||||
|
*.moved-aside
|
||||||
|
|
||||||
|
# macOS
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Temporary files
|
||||||
|
*.tmp
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
.env
|
||||||
|
.env.local
|
||||||
41
.swiftformat
Normal file
41
.swiftformat
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
# SwiftFormat Configuration for Menu-Whisper
|
||||||
|
|
||||||
|
# Indentation
|
||||||
|
--indent 4
|
||||||
|
--indentcase false
|
||||||
|
--smarttabs enabled
|
||||||
|
|
||||||
|
# Spacing
|
||||||
|
--spaces 4
|
||||||
|
--trailingspace ignore
|
||||||
|
--semicolons never
|
||||||
|
|
||||||
|
# Line breaks
|
||||||
|
--maxwidth 120
|
||||||
|
--linebreaks lf
|
||||||
|
--wraparguments preserve
|
||||||
|
--wraptypealiases preserve
|
||||||
|
--wrapparameters preserve
|
||||||
|
--wrapcollections preserve
|
||||||
|
|
||||||
|
# Braces
|
||||||
|
--allman false
|
||||||
|
--elseposition same-line
|
||||||
|
|
||||||
|
# Unused arguments
|
||||||
|
--stripunusedargs closure-only
|
||||||
|
|
||||||
|
# Imports
|
||||||
|
--importgrouping testable-bottom
|
||||||
|
|
||||||
|
# Other formatting options
|
||||||
|
--redundanttype inferred
|
||||||
|
--closingparen balanced
|
||||||
|
--commas inline
|
||||||
|
--trimwhitespace always
|
||||||
|
--insertlines enabled
|
||||||
|
--removelines enabled
|
||||||
|
--emptybraces no-space
|
||||||
|
|
||||||
|
# Disable certain rules that conflict with team preferences
|
||||||
|
--disable redundantSelf
|
||||||
92
.swiftlint.yml
Normal file
92
.swiftlint.yml
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
# SwiftLint Configuration for Menu-Whisper
|
||||||
|
|
||||||
|
# Paths to include/exclude
|
||||||
|
included:
|
||||||
|
- Sources
|
||||||
|
- Tests
|
||||||
|
|
||||||
|
excluded:
|
||||||
|
- Pods
|
||||||
|
- .build
|
||||||
|
- DerivedData
|
||||||
|
|
||||||
|
# Rules configuration
|
||||||
|
disabled_rules:
|
||||||
|
- trailing_comma
|
||||||
|
- todo
|
||||||
|
- force_cast
|
||||||
|
- force_try
|
||||||
|
|
||||||
|
opt_in_rules:
|
||||||
|
- empty_count
|
||||||
|
- empty_string
|
||||||
|
- contains_over_first_not_nil
|
||||||
|
- closure_spacing
|
||||||
|
- multiline_function_chains
|
||||||
|
- multiline_literal_brackets
|
||||||
|
- multiline_parameters
|
||||||
|
- operator_usage_whitespace
|
||||||
|
- overridden_super_call
|
||||||
|
- private_outlet
|
||||||
|
- prohibited_super_call
|
||||||
|
- redundant_nil_coalescing
|
||||||
|
- switch_case_alignment
|
||||||
|
- unneeded_parentheses_in_closure_argument
|
||||||
|
- vertical_parameter_alignment_on_call
|
||||||
|
|
||||||
|
# Line length
|
||||||
|
line_length:
|
||||||
|
warning: 120
|
||||||
|
error: 140
|
||||||
|
|
||||||
|
# File length
|
||||||
|
file_length:
|
||||||
|
warning: 500
|
||||||
|
error: 800
|
||||||
|
|
||||||
|
# Function length
|
||||||
|
function_body_length:
|
||||||
|
warning: 50
|
||||||
|
error: 100
|
||||||
|
|
||||||
|
# Type body length
|
||||||
|
type_body_length:
|
||||||
|
warning: 300
|
||||||
|
error: 400
|
||||||
|
|
||||||
|
# Cyclomatic complexity
|
||||||
|
cyclomatic_complexity:
|
||||||
|
warning: 10
|
||||||
|
error: 20
|
||||||
|
|
||||||
|
# Nesting depth
|
||||||
|
nesting:
|
||||||
|
type_level: 3
|
||||||
|
statement_level: 5
|
||||||
|
|
||||||
|
# Large tuple
|
||||||
|
large_tuple:
|
||||||
|
warning: 3
|
||||||
|
error: 4
|
||||||
|
|
||||||
|
# Identifier names
|
||||||
|
identifier_name:
|
||||||
|
min_length:
|
||||||
|
warning: 2
|
||||||
|
excluded:
|
||||||
|
- id
|
||||||
|
- URL
|
||||||
|
- url
|
||||||
|
- x
|
||||||
|
- y
|
||||||
|
- z
|
||||||
|
|
||||||
|
# Custom rules
|
||||||
|
custom_rules:
|
||||||
|
no_print:
|
||||||
|
name: "No Print Statements"
|
||||||
|
regex: "print\\("
|
||||||
|
message: "Use Logger instead of print statements"
|
||||||
|
severity: warning
|
||||||
|
|
||||||
|
reporter: "xcode"
|
||||||
243
Docs/ARCHITECTURE.md
Normal file
243
Docs/ARCHITECTURE.md
Normal file
|
|
@ -0,0 +1,243 @@
|
||||||
|
# Architecture — Menu-Whisper
|
||||||
|
|
||||||
|
This document describes the high-level architecture and module organization for Menu-Whisper, a macOS offline speech-to-text application.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Menu-Whisper follows a modular architecture with clear separation of concerns between UI, audio processing, speech recognition, text injection, and system integration components.
|
||||||
|
|
||||||
|
## System Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────┐
|
||||||
|
│ App Layer │
|
||||||
|
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
|
||||||
|
│ │ MenuBarExtra │ │ HUD Panel │ │ Preferences │ │
|
||||||
|
│ │ (SwiftUI) │ │ (SwiftUI) │ │ (SwiftUI) │ │
|
||||||
|
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
|
||||||
|
└─────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────┐
|
||||||
|
│ Core Modules │
|
||||||
|
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
|
||||||
|
│ │ Audio │ │ STT │ │ Injection │ │
|
||||||
|
│ │ AVAudioEngine │ │ whisper.cpp │ │ Clipboard │ │
|
||||||
|
│ │ RMS/Peak │ │ Core ML │ │ Typing │ │
|
||||||
|
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
|
||||||
|
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
|
||||||
|
│ │ Models │ │ Permissions │ │ Settings │ │
|
||||||
|
│ │ Management │ │ Microphone │ │ UserDefaults│ │
|
||||||
|
│ │ Downloads │ │ Accessibility │ │ JSON Export │ │
|
||||||
|
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
|
||||||
|
└─────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────┐
|
||||||
|
│ System Integration │
|
||||||
|
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
|
||||||
|
│ │ Global Hotkeys │ │ Secure Input │ │ Utils │ │
|
||||||
|
│ │ Carbon │ │ Detection │ │ Helpers │ │
|
||||||
|
│ │ RegisterHotKey │ │ CGEvent API │ │ │ │
|
||||||
|
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
|
||||||
|
└─────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Module Descriptions
|
||||||
|
|
||||||
|
### App Layer
|
||||||
|
- **MenuBarExtra**: SwiftUI-based menu bar interface using `MenuBarExtra` for macOS 13+
|
||||||
|
- **HUD Panel**: Non-activating NSPanel for "Listening" and "Processing" states
|
||||||
|
- **Preferences**: Settings window with model management, hotkey configuration, etc.
|
||||||
|
|
||||||
|
### Core Modules
|
||||||
|
|
||||||
|
#### Core/Audio
|
||||||
|
**Purpose**: Audio capture and real-time processing
|
||||||
|
- AVAudioEngine integration for microphone input
|
||||||
|
- Real-time RMS/peak computation for visual feedback
|
||||||
|
- Audio format conversion (16kHz mono PCM for STT)
|
||||||
|
- Dictation time limits and session management
|
||||||
|
|
||||||
|
#### Core/STT
|
||||||
|
**Purpose**: Speech-to-text processing with multiple backends
|
||||||
|
- **WhisperCPP**: Primary backend using whisper.cpp with Metal acceleration
|
||||||
|
- **CoreML**: Future backend for Core ML models (Phase 6)
|
||||||
|
- `STTEngine` protocol for backend abstraction
|
||||||
|
- Language detection and text normalization
|
||||||
|
|
||||||
|
#### Core/Models
|
||||||
|
**Purpose**: Model catalog, downloads, and management
|
||||||
|
- Curated model catalog (JSON-based)
|
||||||
|
- Download management with progress tracking
|
||||||
|
- SHA256 verification and integrity checks
|
||||||
|
- Local storage in `~/Library/Application Support/MenuWhisper/Models`
|
||||||
|
- Model selection and metadata management
|
||||||
|
|
||||||
|
#### Core/Injection
|
||||||
|
**Purpose**: Text insertion into focused applications
|
||||||
|
- Clipboard-based insertion (preferred method)
|
||||||
|
- Character-by-character typing fallback
|
||||||
|
- Secure Input detection and handling
|
||||||
|
- Cross-application compatibility layer
|
||||||
|
|
||||||
|
#### Core/Permissions
|
||||||
|
**Purpose**: System permission management and onboarding
|
||||||
|
- Microphone access (requested via `AVCaptureDevice`; `AVAudioSession` is an iOS API and is not available to native macOS apps)
|
||||||
|
- Accessibility permissions for text injection
|
||||||
|
- Input Monitoring permissions for global hotkeys
|
||||||
|
- Permission status checking and guidance flows
|
||||||
|
|
||||||
|
#### Core/Settings
|
||||||
|
**Purpose**: User preferences and configuration persistence
|
||||||
|
- UserDefaults-based storage
|
||||||
|
- JSON export/import functionality
|
||||||
|
- Settings validation and migration
|
||||||
|
- Configuration change notifications
|
||||||
|
|
||||||
|
### System Integration
|
||||||
|
|
||||||
|
#### Global Hotkeys
|
||||||
|
- Carbon framework integration (`RegisterEventHotKey`)
|
||||||
|
- Push-to-talk and toggle modes
|
||||||
|
- Hotkey conflict detection and user guidance
|
||||||
|
- Cross-application hotkey handling
|
||||||
|
|
||||||
|
#### Secure Input Detection
|
||||||
|
- `IsSecureEventInputEnabled()` monitoring
|
||||||
|
- Safe fallback behavior (clipboard-only)
|
||||||
|
- User notification for secure contexts
|
||||||
|
|
||||||
|
#### Utils
|
||||||
|
- Shared utilities and helper functions
|
||||||
|
- Logging infrastructure (opt-in local logs)
|
||||||
|
- Error handling and user feedback
|
||||||
|
|
||||||
|
## Data Flow
|
||||||
|
|
||||||
|
### Main Operational Flow
|
||||||
|
```
|
||||||
|
User Hotkey → Audio Capture → STT Processing → Text Injection
|
||||||
|
▲ │ │ │
|
||||||
|
│ ▼ ▼ ▼
|
||||||
|
Hotkey Mgr Audio Buffer Model Engine Injection Mgr
|
||||||
|
│ RMS/Peak whisper.cpp Clipboard/Type
|
||||||
|
│ │ │ │
|
||||||
|
▼ ▼ ▼ ▼
|
||||||
|
HUD UI Visual Feedback Processing UI Target App
|
||||||
|
```
|
||||||
|
|
||||||
|
### State Management
|
||||||
|
The application follows a finite state machine pattern:
|
||||||
|
- **Idle**: Waiting for user input
|
||||||
|
- **Listening**: Capturing audio with visual feedback
|
||||||
|
- **Processing**: Running STT inference
|
||||||
|
- **Injecting**: Inserting text into target application
|
||||||
|
- **Error**: Handling and displaying errors
|
||||||
|
|
||||||
|
## Finite State Machine
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────┐
|
||||||
|
│ Idle │◄─────────────┐
|
||||||
|
└─────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ Hotkey Press │ Success/Error
|
||||||
|
▼ │
|
||||||
|
┌─────────────┐ │
|
||||||
|
│ Listening │ │
|
||||||
|
└─────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ Stop/Timeout │
|
||||||
|
▼ │
|
||||||
|
┌─────────────┐ │
|
||||||
|
│ Processing │ │
|
||||||
|
└─────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ STT Complete │
|
||||||
|
▼ │
|
||||||
|
┌─────────────┐ │
|
||||||
|
│ Injecting │──────────────┘
|
||||||
|
└─────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Technology Stack
|
||||||
|
|
||||||
|
### Core Technologies
|
||||||
|
- **Swift 5.9+**: Primary development language
|
||||||
|
- **SwiftUI**: User interface framework
|
||||||
|
- **AppKit**: macOS-specific UI components (NSStatusItem, NSPanel)
|
||||||
|
- **AVFoundation**: Audio capture and processing
|
||||||
|
- **Carbon**: Global hotkey registration
|
||||||
|
|
||||||
|
### External Dependencies
|
||||||
|
- **whisper.cpp**: C/C++ speech recognition engine with Metal support
|
||||||
|
- **Swift Package Manager**: Dependency management and build system
|
||||||
|
|
||||||
|
### Platform Integration
|
||||||
|
- **UserDefaults**: Settings persistence
|
||||||
|
- **NSPasteboard**: Clipboard operations
|
||||||
|
- **CGEvent**: Low-level input simulation
|
||||||
|
- **URLSession**: Model downloads
|
||||||
|
|
||||||
|
## Build System
|
||||||
|
|
||||||
|
The project uses Swift Package Manager with modular targets:
|
||||||
|
|
||||||
|
```
|
||||||
|
MenuWhisper/
|
||||||
|
├── Package.swift # SPM configuration
|
||||||
|
├── Sources/
|
||||||
|
│ ├── App/ # Main application target
|
||||||
|
│ ├── CoreAudio/ # Audio processing module
|
||||||
|
│ ├── CoreSTT/ # Speech-to-text engines
|
||||||
|
│ ├── CoreModels/ # Model management
|
||||||
|
│ ├── CoreInjection/ # Text insertion
|
||||||
|
│ ├── CorePermissions/ # System permissions
|
||||||
|
│ ├── CoreSettings/ # User preferences
|
||||||
|
│ └── CoreUtils/ # Shared utilities
|
||||||
|
├── Resources/ # Assets, localizations
|
||||||
|
└── Tests/ # Unit and integration tests
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
### Privacy
|
||||||
|
- All audio processing occurs locally
|
||||||
|
- No telemetry or data collection
|
||||||
|
- Optional local logging with user consent
|
||||||
|
|
||||||
|
### System Security
|
||||||
|
- Respects Secure Input contexts
|
||||||
|
- Requires explicit user permission grants
|
||||||
|
- Code signing and notarization for distribution
|
||||||
|
|
||||||
|
### Input Safety
|
||||||
|
- Validates all user inputs
|
||||||
|
- Safe handling of special characters in typing mode
|
||||||
|
- Proper escaping for different keyboard layouts
|
||||||
|
|
||||||
|
## Performance Characteristics
|
||||||
|
|
||||||
|
### Target Metrics
|
||||||
|
- **Latency**: <4s additional processing time for 10s audio (M1 + small model)
|
||||||
|
- **Memory**: ~1.5-2.5GB with small model
|
||||||
|
- **Model Loading**: Lazy loading with warm cache
|
||||||
|
- **UI Responsiveness**: Non-blocking background processing
|
||||||
|
|
||||||
|
### Optimization Strategies
|
||||||
|
- Metal acceleration for STT inference
|
||||||
|
- Efficient audio buffering and streaming
|
||||||
|
- Model reuse across dictation sessions
|
||||||
|
- Configurable threading for CPU-intensive operations
|
||||||
|
|
||||||
|
## Future Extensibility
|
||||||
|
|
||||||
|
The modular architecture supports future enhancements:
|
||||||
|
- Additional STT backends (Core ML, cloud services)
|
||||||
|
- Voice Activity Detection (VAD)
|
||||||
|
- Advanced audio preprocessing
|
||||||
|
- Custom insertion rules per application
|
||||||
|
- Plugin architecture for text processing
|
||||||
|
|
||||||
|
This architecture provides a solid foundation for the MVP while maintaining flexibility for future feature additions and platform evolution.
|
||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2024 Menu-Whisper
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
123
Package.swift
Normal file
123
Package.swift
Normal file
|
|
@ -0,0 +1,123 @@
|
||||||
|
// swift-tools-version: 5.9
|
||||||
|
import PackageDescription
|
||||||
|
|
||||||
|
let package = Package(
|
||||||
|
name: "MenuWhisper",
|
||||||
|
platforms: [
|
||||||
|
.macOS(.v13)
|
||||||
|
],
|
||||||
|
products: [
|
||||||
|
.executable(
|
||||||
|
name: "MenuWhisper",
|
||||||
|
targets: ["App"]
|
||||||
|
)
|
||||||
|
],
|
||||||
|
dependencies: [
|
||||||
|
// Add external dependencies here as needed
|
||||||
|
// Example: .package(url: "...", from: "1.0.0")
|
||||||
|
],
|
||||||
|
targets: [
|
||||||
|
// Main Application Target
|
||||||
|
.executableTarget(
|
||||||
|
name: "App",
|
||||||
|
dependencies: [
|
||||||
|
"MenuWhisperAudio",
|
||||||
|
"CoreSTT",
|
||||||
|
"CoreModels",
|
||||||
|
"CoreInjection",
|
||||||
|
"CorePermissions",
|
||||||
|
"CoreSettings",
|
||||||
|
"CoreUtils"
|
||||||
|
],
|
||||||
|
path: "Sources/App",
|
||||||
|
resources: [
|
||||||
|
.copy("../../Resources")
|
||||||
|
]
|
||||||
|
),
|
||||||
|
|
||||||
|
// Core Module Targets
|
||||||
|
.target(
|
||||||
|
name: "MenuWhisperAudio",
|
||||||
|
dependencies: ["CoreUtils"],
|
||||||
|
path: "Sources/CoreAudio"
|
||||||
|
),
|
||||||
|
|
||||||
|
.target(
|
||||||
|
name: "CoreSTT",
|
||||||
|
dependencies: ["CoreUtils", "CoreModels", "MenuWhisperAudio"],
|
||||||
|
path: "Sources/CoreSTT"
|
||||||
|
),
|
||||||
|
|
||||||
|
.target(
|
||||||
|
name: "CoreModels",
|
||||||
|
dependencies: ["CoreUtils"],
|
||||||
|
path: "Sources/CoreModels"
|
||||||
|
),
|
||||||
|
|
||||||
|
.target(
|
||||||
|
name: "CoreInjection",
|
||||||
|
dependencies: ["CoreUtils"],
|
||||||
|
path: "Sources/CoreInjection"
|
||||||
|
),
|
||||||
|
|
||||||
|
.target(
|
||||||
|
name: "CorePermissions",
|
||||||
|
dependencies: ["CoreUtils"],
|
||||||
|
path: "Sources/CorePermissions"
|
||||||
|
),
|
||||||
|
|
||||||
|
.target(
|
||||||
|
name: "CoreSettings",
|
||||||
|
dependencies: ["CoreUtils"],
|
||||||
|
path: "Sources/CoreSettings"
|
||||||
|
),
|
||||||
|
|
||||||
|
.target(
|
||||||
|
name: "CoreUtils",
|
||||||
|
path: "Sources/CoreUtils"
|
||||||
|
),
|
||||||
|
|
||||||
|
// Test Targets
|
||||||
|
.testTarget(
|
||||||
|
name: "MenuWhisperAudioTests",
|
||||||
|
dependencies: ["MenuWhisperAudio"],
|
||||||
|
path: "Tests/CoreAudioTests"
|
||||||
|
),
|
||||||
|
|
||||||
|
.testTarget(
|
||||||
|
name: "CoreSTTTests",
|
||||||
|
dependencies: ["CoreSTT"],
|
||||||
|
path: "Tests/CoreSTTTests"
|
||||||
|
),
|
||||||
|
|
||||||
|
.testTarget(
|
||||||
|
name: "CoreModelsTests",
|
||||||
|
dependencies: ["CoreModels"],
|
||||||
|
path: "Tests/CoreModelsTests"
|
||||||
|
),
|
||||||
|
|
||||||
|
.testTarget(
|
||||||
|
name: "CoreInjectionTests",
|
||||||
|
dependencies: ["CoreInjection"],
|
||||||
|
path: "Tests/CoreInjectionTests"
|
||||||
|
),
|
||||||
|
|
||||||
|
.testTarget(
|
||||||
|
name: "CorePermissionsTests",
|
||||||
|
dependencies: ["CorePermissions"],
|
||||||
|
path: "Tests/CorePermissionsTests"
|
||||||
|
),
|
||||||
|
|
||||||
|
.testTarget(
|
||||||
|
name: "CoreSettingsTests",
|
||||||
|
dependencies: ["CoreSettings"],
|
||||||
|
path: "Tests/CoreSettingsTests"
|
||||||
|
),
|
||||||
|
|
||||||
|
.testTarget(
|
||||||
|
name: "CoreUtilsTests",
|
||||||
|
dependencies: ["CoreUtils"],
|
||||||
|
path: "Tests/CoreUtilsTests"
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
96
README.md
Normal file
96
README.md
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
# Menu-Whisper
|
||||||
|
|
||||||
|
A macOS menu bar application that provides offline speech-to-text transcription using Whisper-family models and automatically inserts the transcribed text into the currently focused application.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Menu-Whisper is designed to be a privacy-focused, offline-first speech recognition tool for macOS. It runs entirely locally on Apple Silicon machines, requiring no internet connection during normal operation (only for initial model downloads).
|
||||||
|
|
||||||
|
### Key Features
|
||||||
|
|
||||||
|
- **100% Offline Operation**: Audio and text never leave your device
|
||||||
|
- **Apple Silicon Optimized**: Built specifically for M1/M2/M3 processors with Metal acceleration
|
||||||
|
- **Global Hotkey Support**: Default ⌘⇧V (configurable)
|
||||||
|
- **Smart Text Insertion**: Clipboard paste with typing fallback
|
||||||
|
- **Secure Input Detection**: Respects password fields and secure contexts
|
||||||
|
- **Multiple Models**: Support for various Whisper model sizes and variants
|
||||||
|
- **Multilingual**: Spanish and English interface and recognition
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- **macOS**: 13.0 (Ventura) or later
|
||||||
|
- **Hardware**: Apple Silicon (M1, M2, or M3 processor) - Intel Macs are not supported
|
||||||
|
- **Xcode**: 15.0+ for building from source
|
||||||
|
- **Permissions**: Microphone, Accessibility, and Input Monitoring access
|
||||||
|
|
||||||
|
## Build Requirements
|
||||||
|
|
||||||
|
### Development Environment
|
||||||
|
- macOS 13+ with Xcode 15.0+
|
||||||
|
- Swift 5.9+
|
||||||
|
- Swift Package Manager (included with Xcode)
|
||||||
|
|
||||||
|
### System Dependencies
|
||||||
|
- AVFoundation framework (audio capture)
|
||||||
|
- Carbon framework (global hotkeys)
|
||||||
|
- AppKit/SwiftUI (UI components)
|
||||||
|
|
||||||
|
### Third-party Dependencies
|
||||||
|
- whisper.cpp (C/C++ library for speech recognition with Metal support)
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
**Note**: This project is currently in development. Pre-built binaries will be available as signed and notarized .dmg files once complete.
|
||||||
|
|
||||||
|
### Building from Source
|
||||||
|
|
||||||
|
1. Clone the repository:
|
||||||
|
```bash
|
||||||
|
git clone <repository-url>
|
||||||
|
cd tellme
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Build a release binary with Swift Package Manager:
|
||||||
|
```bash
|
||||||
|
swift build -c release
|
||||||
|
```
|
||||||
|
|
||||||
|
3. For development, open `Package.swift` in Xcode.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
The application is structured with modular components:
|
||||||
|
- **App**: SwiftUI interface with AppKit bridges
|
||||||
|
- **Core/Audio**: AVAudioEngine capture and processing
|
||||||
|
- **Core/STT**: Speech-to-text engines (whisper.cpp, future Core ML)
|
||||||
|
- **Core/Models**: Model management and downloads
|
||||||
|
- **Core/Injection**: Text insertion with secure input handling
|
||||||
|
- **Core/Permissions**: System permission management
|
||||||
|
- **Core/Settings**: User preferences and configuration
|
||||||
|
|
||||||
|
## Privacy & Security
|
||||||
|
|
||||||
|
- **No Telemetry**: Zero data collection or remote analytics
|
||||||
|
- **Local Processing**: All audio processing happens on-device
|
||||||
|
- **Secure Input Respect**: Automatically detects and respects secure input contexts
|
||||||
|
- **Permission-Based**: Requires explicit user consent for system access
|
||||||
|
|
||||||
|
## Development Status
|
||||||
|
|
||||||
|
This project is currently in active development following a phased approach:
|
||||||
|
- Phase 0: Project scaffolding ⬅️ **Current**
|
||||||
|
- Phase 1: Hotkey + HUD + Audio capture
|
||||||
|
- Phase 2: STT integration with whisper.cpp
|
||||||
|
- Phase 3: Text insertion system
|
||||||
|
- Phase 4: Preferences and UX polish
|
||||||
|
- Phase 5: Distribution and packaging
|
||||||
|
|
||||||
|
See `TODO.md` for detailed development progress and `TECHSPEC.md` for complete technical specifications.
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT License - see [LICENSE](LICENSE) for details.
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
This project follows a structured development approach with clear phases and acceptance criteria. Please refer to the technical specification and TODO list before contributing.
|
||||||
77
Resources/Localizations/en.lproj/Localizable.strings
Normal file
77
Resources/Localizations/en.lproj/Localizable.strings
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
/* Menu-Whisper - English Localization */
|
||||||
|
|
||||||
|
/* General */
|
||||||
|
"app.name" = "Menu-Whisper";
|
||||||
|
"general.ok" = "OK";
|
||||||
|
"general.cancel" = "Cancel";
|
||||||
|
"general.continue" = "Continue";
|
||||||
|
"general.settings" = "Settings";
|
||||||
|
"general.quit" = "Quit";
|
||||||
|
|
||||||
|
/* Menu Bar */
|
||||||
|
"menubar.idle" = "Idle";
|
||||||
|
"menubar.listening" = "Listening";
|
||||||
|
"menubar.processing" = "Processing";
|
||||||
|
"menubar.preferences" = "Preferences...";
|
||||||
|
"menubar.quit" = "Quit Menu-Whisper";
|
||||||
|
|
||||||
|
/* HUD States */
|
||||||
|
"hud.listening" = "Listening...";
|
||||||
|
"hud.processing" = "Transcribing...";
|
||||||
|
"hud.cancel" = "Press Esc to cancel";
|
||||||
|
|
||||||
|
/* Permissions */
|
||||||
|
"permissions.microphone.title" = "Microphone Access Required";
|
||||||
|
"permissions.microphone.message" = "Menu-Whisper needs access to your microphone to perform speech-to-text transcription.";
|
||||||
|
"permissions.accessibility.title" = "Accessibility Access Required";
|
||||||
|
"permissions.accessibility.message" = "Menu-Whisper needs Accessibility access to insert transcribed text into applications.";
|
||||||
|
"permissions.input_monitoring.title" = "Input Monitoring Required";
|
||||||
|
"permissions.input_monitoring.message" = "Menu-Whisper needs Input Monitoring access to register global hotkeys.";
|
||||||
|
"permissions.open_settings" = "Open System Settings";
|
||||||
|
|
||||||
|
/* Preferences Window */
|
||||||
|
"preferences.title" = "Menu-Whisper Preferences";
|
||||||
|
"preferences.general" = "General";
|
||||||
|
"preferences.models" = "Models";
|
||||||
|
"preferences.hotkeys" = "Hotkeys";
|
||||||
|
"preferences.insertion" = "Text Insertion";
|
||||||
|
"preferences.advanced" = "Advanced";
|
||||||
|
|
||||||
|
/* General Preferences */
|
||||||
|
"preferences.general.hotkey" = "Global Hotkey:";
|
||||||
|
"preferences.general.mode" = "Activation Mode:";
|
||||||
|
"preferences.general.mode.push_to_talk" = "Push-to-talk";
|
||||||
|
"preferences.general.mode.toggle" = "Toggle";
|
||||||
|
"preferences.general.sounds" = "Play sounds for start/stop";
|
||||||
|
"preferences.general.limit" = "Dictation time limit (minutes):";
|
||||||
|
|
||||||
|
/* Model Preferences */
|
||||||
|
"preferences.models.title" = "Speech Recognition Models";
|
||||||
|
"preferences.models.active" = "Active Model:";
|
||||||
|
"preferences.models.language" = "Language:";
|
||||||
|
"preferences.models.language.auto" = "Auto-detect";
|
||||||
|
"preferences.models.download" = "Download";
|
||||||
|
"preferences.models.delete" = "Delete";
|
||||||
|
"preferences.models.size" = "Size:";
|
||||||
|
"preferences.models.languages" = "Languages:";
|
||||||
|
|
||||||
|
/* Insertion Preferences */
|
||||||
|
"preferences.insertion.method" = "Insertion Method:";
|
||||||
|
"preferences.insertion.method.paste" = "Paste (⌘V)";
|
||||||
|
"preferences.insertion.method.type" = "Type characters";
|
||||||
|
"preferences.insertion.preview" = "Show preview before inserting";
|
||||||
|
"preferences.insertion.secure_input" = "Secure Input Detected";
|
||||||
|
"preferences.insertion.secure_input.message" = "Text insertion is disabled in secure contexts. Text has been copied to clipboard.";
|
||||||
|
|
||||||
|
/* Errors */
|
||||||
|
"error.audio.failed" = "Failed to access microphone";
|
||||||
|
"error.model.not_found" = "Speech recognition model not found";
|
||||||
|
"error.model.load_failed" = "Failed to load speech recognition model";
|
||||||
|
"error.transcription.failed" = "Speech transcription failed";
|
||||||
|
"error.download.failed" = "Model download failed";
|
||||||
|
"error.download.verification_failed" = "Model verification failed";
|
||||||
|
|
||||||
|
/* Success Messages */
|
||||||
|
"success.model.downloaded" = "Model downloaded successfully";
|
||||||
|
"success.settings.exported" = "Settings exported successfully";
|
||||||
|
"success.settings.imported" = "Settings imported successfully";
|
||||||
77
Resources/Localizations/es.lproj/Localizable.strings
Normal file
77
Resources/Localizations/es.lproj/Localizable.strings
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
/* Menu-Whisper - Spanish Localization */
|
||||||
|
|
||||||
|
/* General */
|
||||||
|
"app.name" = "Menu-Whisper";
|
||||||
|
"general.ok" = "Aceptar";
|
||||||
|
"general.cancel" = "Cancelar";
|
||||||
|
"general.continue" = "Continuar";
|
||||||
|
"general.settings" = "Configuración";
|
||||||
|
"general.quit" = "Salir";
|
||||||
|
|
||||||
|
/* Menu Bar */
|
||||||
|
"menubar.idle" = "Inactivo";
|
||||||
|
"menubar.listening" = "Escuchando";
|
||||||
|
"menubar.processing" = "Procesando";
|
||||||
|
"menubar.preferences" = "Preferencias...";
|
||||||
|
"menubar.quit" = "Salir de Menu-Whisper";
|
||||||
|
|
||||||
|
/* HUD States */
|
||||||
|
"hud.listening" = "Escuchando...";
|
||||||
|
"hud.processing" = "Transcribiendo...";
|
||||||
|
"hud.cancel" = "Presiona Esc para cancelar";
|
||||||
|
|
||||||
|
/* Permissions */
|
||||||
|
"permissions.microphone.title" = "Acceso al Micrófono Requerido";
|
||||||
|
"permissions.microphone.message" = "Menu-Whisper necesita acceso a tu micrófono para realizar la transcripción de voz a texto.";
|
||||||
|
"permissions.accessibility.title" = "Acceso de Accesibilidad Requerido";
|
||||||
|
"permissions.accessibility.message" = "Menu-Whisper necesita acceso de Accesibilidad para insertar texto transcrito en aplicaciones.";
|
||||||
|
"permissions.input_monitoring.title" = "Monitoreo de Entrada Requerido";
|
||||||
|
"permissions.input_monitoring.message" = "Menu-Whisper necesita acceso de Monitoreo de Entrada para registrar atajos de teclado globales.";
|
||||||
|
"permissions.open_settings" = "Abrir Configuración del Sistema";
|
||||||
|
|
||||||
|
/* Preferences Window */
|
||||||
|
"preferences.title" = "Preferencias de Menu-Whisper";
|
||||||
|
"preferences.general" = "General";
|
||||||
|
"preferences.models" = "Modelos";
|
||||||
|
"preferences.hotkeys" = "Atajos";
|
||||||
|
"preferences.insertion" = "Inserción de Texto";
|
||||||
|
"preferences.advanced" = "Avanzado";
|
||||||
|
|
||||||
|
/* General Preferences */
|
||||||
|
"preferences.general.hotkey" = "Atajo Global:";
|
||||||
|
"preferences.general.mode" = "Modo de Activación:";
|
||||||
|
"preferences.general.mode.push_to_talk" = "Presionar para hablar";
|
||||||
|
"preferences.general.mode.toggle" = "Alternar";
|
||||||
|
"preferences.general.sounds" = "Reproducir sonidos al iniciar/detener";
|
||||||
|
"preferences.general.limit" = "Límite de tiempo de dictado (minutos):";
|
||||||
|
|
||||||
|
/* Model Preferences */
|
||||||
|
"preferences.models.title" = "Modelos de Reconocimiento de Voz";
|
||||||
|
"preferences.models.active" = "Modelo Activo:";
|
||||||
|
"preferences.models.language" = "Idioma:";
|
||||||
|
"preferences.models.language.auto" = "Detección automática";
|
||||||
|
"preferences.models.download" = "Descargar";
|
||||||
|
"preferences.models.delete" = "Eliminar";
|
||||||
|
"preferences.models.size" = "Tamaño:";
|
||||||
|
"preferences.models.languages" = "Idiomas:";
|
||||||
|
|
||||||
|
/* Insertion Preferences */
|
||||||
|
"preferences.insertion.method" = "Método de Inserción:";
|
||||||
|
"preferences.insertion.method.paste" = "Pegar (⌘V)";
|
||||||
|
"preferences.insertion.method.type" = "Escribir caracteres";
|
||||||
|
"preferences.insertion.preview" = "Mostrar vista previa antes de insertar";
|
||||||
|
"preferences.insertion.secure_input" = "Entrada Segura Detectada";
|
||||||
|
"preferences.insertion.secure_input.message" = "La inserción de texto está deshabilitada en contextos seguros. El texto se ha copiado al portapapeles.";
|
||||||
|
|
||||||
|
/* Errors */
|
||||||
|
"error.audio.failed" = "Error al acceder al micrófono";
|
||||||
|
"error.model.not_found" = "Modelo de reconocimiento de voz no encontrado";
|
||||||
|
"error.model.load_failed" = "Error al cargar el modelo de reconocimiento de voz";
|
||||||
|
"error.transcription.failed" = "Error en la transcripción de voz";
|
||||||
|
"error.download.failed" = "Error en la descarga del modelo";
|
||||||
|
"error.download.verification_failed" = "Error en la verificación del modelo";
|
||||||
|
|
||||||
|
/* Success Messages */
|
||||||
|
"success.model.downloaded" = "Modelo descargado exitosamente";
|
||||||
|
"success.settings.exported" = "Configuración exportada exitosamente";
|
||||||
|
"success.settings.imported" = "Configuración importada exitosamente";
|
||||||
38
Scripts/build.sh
Executable file
38
Scripts/build.sh
Executable file
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/bin/bash

# Build script for Menu-Whisper.
# Cleans the package, builds it in release mode, runs the tests, and then runs
# SwiftFormat / SwiftLint when those tools are installed.

# -e: abort on the first failing command.
# -u: treat use of an unset variable as an error.
# -o pipefail: a failure anywhere in a pipeline fails the whole pipeline.
set -euo pipefail

# Always operate from the repository root (the parent of Scripts/), so the
# `swift` commands work no matter which directory the script is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")/.."

echo "🔨 Building Menu-Whisper..."

# Clean previous build
echo "🧹 Cleaning previous build..."
swift package clean

# Build in release mode
echo "⚡ Building in release mode..."
swift build -c release

# Run tests
echo "🧪 Running tests..."
swift test

# Check if SwiftFormat is available and run it
if command -v swiftformat >/dev/null 2>&1; then
    echo "📝 Checking code formatting..."
    swiftformat --lint .
else
    echo "⚠️ SwiftFormat not available, skipping format check"
fi

# Check if SwiftLint is available and run it
if command -v swiftlint >/dev/null 2>&1; then
    echo "🔍 Running SwiftLint..."
    swiftlint
else
    echo "⚠️ SwiftLint not available, skipping lint check"
fi

echo "✅ Build completed successfully!"
|
||||||
35
Scripts/notarize.sh
Executable file
35
Scripts/notarize.sh
Executable file
|
|
@ -0,0 +1,35 @@
|
||||||
|
#!/bin/bash

# Notarization script for Menu-Whisper.
# Placeholder only: prints the planned steps and exits with a failure status.
# The real implementation is scheduled for Phase 5.

set -e

# The quoted heredoc delimiter disables expansion, so the text is printed verbatim.
cat <<'EOF'
🍎 Menu-Whisper Notarization Script
📋 This script will handle code signing and notarization for distribution

⚠️ This is a placeholder script - implementation pending Phase 5

📝 Steps that will be implemented:
 1. Code signing with Developer ID
 2. Creating .app bundle
 3. Notarization with Apple
 4. Stapling notarization ticket
 5. Creating .dmg for distribution

🔧 Usage (when implemented):
 ./Scripts/notarize.sh [--developer-id YOUR_TEAM_ID]

EOF

# Placeholder for future implementation
# TODO: Implement in Phase 5
# - Set up code signing identity
# - Configure entitlements
# - Build .app bundle
# - Submit for notarization
# - Wait for approval
# - Staple ticket
# - Create DMG

echo "❌ Not implemented yet - use in Phase 5"
exit 1
|
||||||
18
Sources/App/main.swift
Normal file
18
Sources/App/main.swift
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
import SwiftUI
|
||||||
|
|
||||||
|
/// Menu bar scene for Menu-Whisper.
///
/// This type is declared in `main.swift`, which Swift always compiles as
/// top-level code. The `@main` attribute is rejected in a module that contains
/// top-level code, so the app is started by the explicit `main()` call below
/// instead of by `@main`.
struct MenuWhisperApp: App {
    var body: some Scene {
        MenuBarExtra("Menu-Whisper", systemImage: "mic") {
            Text("Menu-Whisper")
            Text("Idle")
            Divider()
            Button("Preferences...") {
                // TODO: Open preferences
            }
            Button("Quit") {
                NSApplication.shared.terminate(nil)
            }
        }
    }
}

// Explicit entry point: `App.main()` sets up and runs the application.
MenuWhisperApp.main()
|
||||||
42
Sources/CoreAudio/AudioEngine.swift
Normal file
42
Sources/CoreAudio/AudioEngine.swift
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
import Foundation
|
||||||
|
import AVFoundation
|
||||||
|
import CoreUtils
|
||||||
|
|
||||||
|
/// Callbacks an `AudioEngine` can deliver to its (weakly held) delegate.
///
/// Class-bound (`AnyObject`) so the engine can store the delegate weakly.
public protocol AudioEngineDelegate: AnyObject {
    /// Reports a new input level value.
    /// NOTE(review): the scale/unit of `level` is not defined in this file — confirm once
    /// the level computation is implemented.
    func audioEngine(_ engine: AudioEngine, didUpdateLevel level: Float)

    /// Delivers captured audio as raw `Data`.
    /// NOTE(review): sample format is not defined in this file — confirm with the capture code.
    func audioEngine(_ engine: AudioEngine, didCaptureAudio data: Data)

    /// Sent by `AudioEngine.startCapture()` after it marks itself as capturing.
    func audioEngineDidStartCapture(_ engine: AudioEngine)

    /// Sent by `AudioEngine.stopCapture()` after it marks itself as not capturing.
    func audioEngineDidStopCapture(_ engine: AudioEngine)
}
|
||||||
|
|
||||||
|
/// Observable wrapper around an `AVAudioEngine` used for microphone capture.
///
/// The capture pipeline itself is still a stub: start/stop only log, flip
/// `isCapturing`, and notify the delegate.
public class AudioEngine: ObservableObject {

    // MARK: - State

    private let logger = Logger(category: "AudioEngine")
    private let audioEngine = AVAudioEngine()

    /// Receiver of capture lifecycle callbacks; held weakly.
    public weak var delegate: AudioEngineDelegate?

    /// `true` between `startCapture()` and `stopCapture()`.
    @Published public private(set) var isCapturing = false

    /// Most recent input level; never updated by the current stub implementation.
    @Published public private(set) var currentLevel: Float = 0.0

    // MARK: - Lifecycle

    public init() {
        // Audio engine initialization will be completed in Phase 1
    }

    // MARK: - Capture control

    /// Marks the engine as capturing and informs the delegate.
    ///
    /// Declared `throws` for the future implementation; the current body never throws.
    public func startCapture() throws {
        logger.info("Starting audio capture")
        // TODO: Implement in Phase 1
        isCapturing = true
        delegate?.audioEngineDidStartCapture(self)
    }

    /// Marks the engine as no longer capturing and informs the delegate.
    public func stopCapture() {
        logger.info("Stopping audio capture")
        // TODO: Implement in Phase 1
        isCapturing = false
        delegate?.audioEngineDidStopCapture(self)
    }

    // MARK: - Processing

    /// Placeholder for per-buffer processing; currently does nothing.
    private func processAudioBuffer(_ buffer: AVAudioPCMBuffer) {
        // TODO: Implement RMS calculation and audio processing in Phase 1
    }
}
|
||||||
73
Sources/CoreInjection/TextInjector.swift
Normal file
73
Sources/CoreInjection/TextInjector.swift
Normal file
|
|
@ -0,0 +1,73 @@
|
||||||
|
import Foundation
|
||||||
|
import AppKit
|
||||||
|
import CoreUtils
|
||||||
|
|
||||||
|
/// Strategy `TextInjector` uses to deliver text.
public enum InjectionMethod {
    /// Deliver via the clipboard (paste).
    /// Deliver by typing.
    case paste, typing
}
|
||||||
|
|
||||||
|
/// Failures that `TextInjector` can report.
public enum InjectionError: Error, LocalizedError {
    /// Secure input was detected, so text was not injected.
    case secureInputActive
    /// Accessibility permission is missing.
    case accessibilityPermissionRequired
    /// Injection failed for the attached reason.
    case injectionFailed(String)

    /// User-facing description. The first two cases reuse existing localization keys;
    /// `injectionFailed` is a fixed English prefix plus the reason.
    public var errorDescription: String? {
        let description: String
        switch self {
        case .injectionFailed(let reason):
            description = "Text injection failed: \(reason)"
        case .accessibilityPermissionRequired:
            description = NSLocalizedString("permissions.accessibility.message", comment: "Accessibility permission message")
        case .secureInputActive:
            description = NSLocalizedString("preferences.insertion.secure_input.message", comment: "Secure input message")
        }
        return description
    }
}
|
||||||
|
|
||||||
|
/// Delivers transcribed text to the focused application.
///
/// Only the clipboard copy is implemented so far; key-event synthesis and the
/// secure-input check are still TODOs.
public class TextInjector {
    private let logger = Logger(category: "TextInjector")

    public init() {}

    /// Injects `text` using the requested method.
    ///
    /// - Parameters:
    ///   - text: Text to deliver.
    ///   - method: Delivery strategy; defaults to `.paste`.
    /// - Throws: `InjectionError.secureInputActive` when secure input is detected
    ///   (the text is copied to the clipboard first), or whatever the selected
    ///   injection routine throws.
    public func injectText(_ text: String, method: InjectionMethod = .paste) throws {
        logger.info("Injecting text using method: \(method)")

        // Secure input: leave the text on the clipboard and refuse to inject.
        guard !isSecureInputActive() else {
            copyToClipboard(text)
            throw InjectionError.secureInputActive
        }

        switch method {
        case .typing:
            try injectViaTyping(text)
        case .paste:
            try injectViaPaste(text)
        }
    }

    /// Paste path: currently only places the text on the clipboard.
    private func injectViaPaste(_ text: String) throws {
        logger.debug("Injecting text via paste method")
        // TODO: Implement paste injection (clipboard + ⌘V) in Phase 3
        copyToClipboard(text)
        // TODO: Send ⌘V via CGEvent
    }

    /// Typing path: not implemented yet; only logs.
    private func injectViaTyping(_ text: String) throws {
        logger.debug("Injecting text via typing method")
        // TODO: Implement character-by-character typing via CGEvent in Phase 3
    }

    /// Replaces the general pasteboard's contents with `text`.
    private func copyToClipboard(_ text: String) {
        let board = NSPasteboard.general
        board.clearContents()
        board.setString(text, forType: .string)
        logger.debug("Text copied to clipboard")
    }

    /// Stub: always reports that secure input is not active.
    private func isSecureInputActive() -> Bool {
        // TODO: Implement IsSecureEventInputEnabled() check in Phase 3
        return false
    }
}
|
||||||
70
Sources/CoreModels/ModelManager.swift
Normal file
70
Sources/CoreModels/ModelManager.swift
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
import Foundation
|
||||||
|
import CoreUtils
|
||||||
|
|
||||||
|
/// Catalog entry describing one speech-recognition model.
///
/// Decoded from JSON with snake_case keys (see `CodingKeys`).
public struct ModelInfo: Codable, Identifiable {
    /// Identity for `Identifiable`. Generated per instance and not part of the
    /// encoded form (it is absent from `CodingKeys`), so every decoded or
    /// constructed value receives a fresh UUID.
    public let id = UUID()
    public let name: String
    public let family: String
    public let format: String
    /// Size in megabytes (JSON key `size_mb`).
    public let sizeMB: Int
    public let languages: [String]
    public let recommendedBackend: String
    public let qualityTier: String
    public let license: String
    public let sha256: String
    /// Download location as a string (JSON key `download_url`).
    public let downloadURL: String
    public let notes: String

    enum CodingKeys: String, CodingKey {
        case name, family, format, languages, license, sha256, notes
        case sizeMB = "size_mb"
        case recommendedBackend = "recommended_backend"
        case qualityTier = "quality_tier"
        case downloadURL = "download_url"
    }

    /// Public memberwise initializer.
    ///
    /// Has the same labels and order as the compiler-synthesized memberwise
    /// initializer (which is only `internal`), so values can also be built
    /// outside this module, e.g. in tests or previews.
    public init(
        name: String,
        family: String,
        format: String,
        sizeMB: Int,
        languages: [String],
        recommendedBackend: String,
        qualityTier: String,
        license: String,
        sha256: String,
        downloadURL: String,
        notes: String
    ) {
        self.name = name
        self.family = family
        self.format = format
        self.sizeMB = sizeMB
        self.languages = languages
        self.recommendedBackend = recommendedBackend
        self.qualityTier = qualityTier
        self.license = license
        self.sha256 = sha256
        self.downloadURL = downloadURL
        self.notes = notes
    }
}
|
||||||
|
|
||||||
|
/// Tracks the model catalog, the downloaded models, and the active model.
///
/// Catalog loading, download, deletion, and persistence are still stubs that
/// only log; `setActiveModel(_:)` is the only operation that changes state.
public class ModelManager: ObservableObject {
    private let logger = Logger(category: "ModelManager")

    @Published public private(set) var availableModels: [ModelInfo] = []
    @Published public private(set) var downloadedModels: [ModelInfo] = []
    @Published public private(set) var activeModel: ModelInfo?

    /// `<Application Support>/MenuWhisper/Models` in the user domain.
    private let modelsDirectory: URL

    /// Resolves and creates the models directory, then runs the (stubbed)
    /// catalog load and downloaded-model refresh.
    public init() {
        let fileManager = FileManager.default

        // Avoid force-unwrapping: if the lookup returns nothing, fall back to the
        // conventional per-user location under the home directory.
        let appSupport = fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask).first
            ?? fileManager.homeDirectoryForCurrentUser.appendingPathComponent("Library/Application Support")
        modelsDirectory = appSupport.appendingPathComponent("MenuWhisper/Models")

        // Creation stays best-effort, but a failure is now logged instead of
        // being silently discarded.
        do {
            try fileManager.createDirectory(at: modelsDirectory, withIntermediateDirectories: true)
        } catch {
            logger.error("Failed to create models directory at \(modelsDirectory.path): \(error.localizedDescription)")
        }

        loadModelCatalog()
        refreshDownloadedModels()
    }

    /// Stub: logs the request; no download is performed yet.
    public func downloadModel(_ model: ModelInfo) async throws {
        logger.info("Starting download for model: \(model.name)")
        // TODO: Implement model download with progress tracking and SHA256 verification in Phase 2
    }

    /// Stub: logs the request; nothing is deleted yet.
    public func deleteModel(_ model: ModelInfo) throws {
        logger.info("Deleting model: \(model.name)")
        // TODO: Implement model deletion in Phase 2
    }

    /// Makes `model` the active model (in memory only for now).
    public func setActiveModel(_ model: ModelInfo) {
        logger.info("Setting active model: \(model.name)")
        activeModel = model
        // TODO: Persist active model selection in Phase 2
    }

    /// Stub: logs only; `availableModels` is left unchanged.
    private func loadModelCatalog() {
        // TODO: Load curated model catalog from bundled JSON in Phase 2
        logger.info("Loading model catalog")
    }

    /// Stub: logs only; `downloadedModels` is left unchanged.
    private func refreshDownloadedModels() {
        // TODO: Scan models directory and populate downloadedModels in Phase 2
        logger.info("Refreshing downloaded models")
    }
}
|
||||||
111
Sources/CorePermissions/PermissionManager.swift
Normal file
111
Sources/CorePermissions/PermissionManager.swift
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
import Foundation
|
||||||
|
import AVFoundation
|
||||||
|
import AppKit
|
||||||
|
import CoreUtils
|
||||||
|
|
||||||
|
/// System permissions the app deals with, in `allCases` order.
public enum PermissionType: CaseIterable {
    case microphone, accessibility, inputMonitoring
}
|
||||||
|
|
||||||
|
/// Authorization state of a single permission.
public enum PermissionStatus {
    case notDetermined, granted, denied, restricted
}
|
||||||
|
|
||||||
|
/// Publishes the app's permission states and offers request / settings helpers.
///
/// Only the microphone permission is actually queried; the accessibility and
/// input-monitoring checks and requests are still stubs.
public class PermissionManager: ObservableObject {
    private let logger = Logger(category: "PermissionManager")

    @Published public private(set) var microphoneStatus: PermissionStatus = .notDetermined
    @Published public private(set) var accessibilityStatus: PermissionStatus = .notDetermined
    @Published public private(set) var inputMonitoringStatus: PermissionStatus = .notDetermined

    /// Reads the current permission states once at construction.
    public init() {
        refreshAllPermissions()
    }

    /// Returns the microphone permission, prompting the user if it has not been
    /// decided yet.
    ///
    /// `microphoneStatus` is updated on the main actor for every outcome, and a
    /// restricted authorization is reported as `.restricted` rather than being
    /// folded into `.denied`.
    ///
    /// - Returns: The resulting permission status.
    public func requestMicrophonePermission() async -> PermissionStatus {
        logger.info("Requesting microphone permission")

        let authorization = AVCaptureDevice.authorizationStatus(for: .audio)
        let status: PermissionStatus
        if authorization == .notDetermined {
            // Only an undecided permission triggers the system prompt.
            let granted = await AVCaptureDevice.requestAccess(for: .audio)
            status = granted ? .granted : .denied
        } else {
            status = Self.permissionStatus(for: authorization)
        }

        // Published properties drive UI, so mutate them on the main actor.
        await MainActor.run { self.microphoneStatus = status }
        return status
    }

    /// Stub: logs only.
    public func requestAccessibilityPermission() {
        logger.info("Requesting accessibility permission")
        // TODO: Implement accessibility permission request in Phase 1
        // This typically involves guiding the user to System Settings
    }

    /// Stub: logs only.
    public func requestInputMonitoringPermission() {
        logger.info("Requesting input monitoring permission")
        // TODO: Implement input monitoring permission request in Phase 1
        // This typically involves guiding the user to System Settings
    }

    /// Opens the privacy pane that corresponds to `permission`.
    public func openSystemSettings(for permission: PermissionType) {
        logger.info("Opening system settings for permission: \(permission)")

        let urlString: String
        switch permission {
        case .microphone:
            urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_Microphone"
        case .accessibility:
            urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility"
        case .inputMonitoring:
            urlString = "x-apple.systempreferences:com.apple.preference.security?Privacy_ListenEvent"
        }

        guard let url = URL(string: urlString) else { return }
        NSWorkspace.shared.open(url)
    }

    /// Refreshes every published status.
    private func refreshAllPermissions() {
        refreshMicrophonePermission()
        refreshAccessibilityPermission()
        refreshInputMonitoringPermission()
    }

    /// Mirrors the current capture authorization into `microphoneStatus`.
    private func refreshMicrophonePermission() {
        microphoneStatus = Self.permissionStatus(for: AVCaptureDevice.authorizationStatus(for: .audio))
    }

    /// Stub: always `.notDetermined`.
    private func refreshAccessibilityPermission() {
        // TODO: Implement accessibility permission check in Phase 1
        accessibilityStatus = .notDetermined
    }

    /// Stub: always `.notDetermined`.
    private func refreshInputMonitoringPermission() {
        // TODO: Implement input monitoring permission check in Phase 1
        inputMonitoringStatus = .notDetermined
    }

    /// Single mapping from the AVFoundation authorization value to the app's
    /// status type, shared by the request and refresh paths.
    private static func permissionStatus(for authorization: AVAuthorizationStatus) -> PermissionStatus {
        switch authorization {
        case .authorized:
            return .granted
        case .denied:
            return .denied
        case .restricted:
            return .restricted
        case .notDetermined:
            return .notDetermined
        @unknown default:
            // Unknown future values are treated as undecided.
            return .notDetermined
        }
    }
}
|
||||||
32
Sources/CoreSTT/STTEngine.swift
Normal file
32
Sources/CoreSTT/STTEngine.swift
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
import Foundation
|
||||||
|
import CoreUtils
|
||||||
|
|
||||||
|
/// Interface of a speech-to-text backend.
public protocol STTEngine {
    /// Transcribes `audioData` to text.
    /// - Parameters:
    ///   - audioData: Audio to transcribe.
    ///     NOTE(review): the expected sample format is not specified here — confirm with implementers.
    ///   - language: Optional language hint; presumably `nil` leaves the choice to the engine — confirm.
    /// - Returns: The transcribed text.
    func transcribe(audioData: Data, language: String?) async throws -> String

    /// Whether a model is currently loaded.
    func isModelLoaded() -> Bool

    /// Loads the model stored at `path`.
    func loadModel(at path: URL) async throws

    /// Releases the currently loaded model.
    func unloadModel()
}
|
||||||
|
|
||||||
|
/// Errors reported by speech-to-text engines.
public enum STTError: Error, LocalizedError {
    case modelNotFound
    case modelLoadFailed(String)
    case transcriptionFailed(String)
    case unsupportedFormat
    case invalidAudioData

    /// User-facing description. Model and transcription errors use localized
    /// strings (with the reason appended after ": "); the two format errors are
    /// fixed English text.
    public var errorDescription: String? {
        switch self {
        case .invalidAudioData:
            return "Invalid audio data"
        case .unsupportedFormat:
            return "Unsupported audio format"
        case .modelNotFound:
            return NSLocalizedString("error.model.not_found", comment: "Model not found error")
        case .modelLoadFailed(let reason):
            let prefix = NSLocalizedString("error.model.load_failed", comment: "Model load failed error")
            return prefix + ": \(reason)"
        case .transcriptionFailed(let reason):
            let prefix = NSLocalizedString("error.transcription.failed", comment: "Transcription failed error")
            return prefix + ": \(reason)"
        }
    }
}
|
||||||
35
Sources/CoreSTT/WhisperCPP/WhisperCPPEngine.swift
Normal file
35
Sources/CoreSTT/WhisperCPP/WhisperCPPEngine.swift
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
import Foundation
|
||||||
|
import CoreUtils
|
||||||
|
|
||||||
|
/// `STTEngine` placeholder for the whisper.cpp backend.
///
/// No model is actually loaded yet: `loadModel(at:)` only records the path and
/// sets a flag, and `transcribe` always throws.
public class WhisperCPPEngine: STTEngine {
    private let logger = Logger(category: "WhisperCPPEngine")

    /// Location passed to the last `loadModel(at:)`, cleared by `unloadModel()`.
    private var modelPath: URL?
    private var isLoaded = false

    public init() {
        // WhisperCPP integration will be implemented in Phase 2
    }

    // MARK: - STTEngine

    /// Not implemented: always throws `STTError.transcriptionFailed`.
    public func transcribe(audioData: Data, language: String?) async throws -> String {
        logger.info("Transcribing audio data")
        // TODO: Implement whisper.cpp integration in Phase 2
        throw STTError.transcriptionFailed("Not implemented yet")
    }

    /// Reports the flag set by `loadModel(at:)` / `unloadModel()`.
    public func isModelLoaded() -> Bool {
        isLoaded
    }

    /// Records `path` and marks the engine as loaded; the current body never throws.
    public func loadModel(at path: URL) async throws {
        logger.info("Loading model at path: \(path.path)")
        modelPath = path
        // TODO: Implement model loading in Phase 2
        isLoaded = true
    }

    /// Forgets the model path and marks the engine as unloaded.
    public func unloadModel() {
        logger.info("Unloading model")
        modelPath = nil
        isLoaded = false
    }
}
|
||||||
149
Sources/CoreSettings/Settings.swift
Normal file
149
Sources/CoreSettings/Settings.swift
Normal file
|
|
@ -0,0 +1,149 @@
|
||||||
|
import Foundation
|
||||||
|
import CoreUtils
|
||||||
|
|
||||||
|
/// How the global hotkey triggers dictation. Raw values are the persisted form.
public enum HotkeyMode: String, CaseIterable, Codable {
    case pushToTalk = "push_to_talk"
    case toggle // raw value is "toggle" (implicit, same as the case name)

    /// Localized label for this mode.
    public var displayName: String {
        switch self {
        case .toggle:
            return NSLocalizedString("preferences.general.mode.toggle", comment: "Toggle mode")
        case .pushToTalk:
            return NSLocalizedString("preferences.general.mode.push_to_talk", comment: "Push to talk mode")
        }
    }
}
|
||||||
|
|
||||||
|
/// A hotkey expressed as a key code plus a modifier mask.
public struct HotkeyConfig: Codable {
    public let keyCode: UInt32
    public let modifiers: UInt32

    public init(keyCode: UInt32, modifiers: UInt32) {
        self.keyCode = keyCode
        self.modifiers = modifiers
    }

    /// Default hotkey: ⌘⇧V (V key with Cmd+Shift).
    /// NOTE(review): 9 / 768 presumably follow the Carbon virtual key code and
    /// modifier mask conventions — confirm against the hotkey registration code.
    public static let `default`: HotkeyConfig = {
        let vKeyCode: UInt32 = 9
        let commandShiftMask: UInt32 = 768
        return HotkeyConfig(keyCode: vKeyCode, modifiers: commandShiftMask)
    }()
}
|
||||||
|
|
||||||
|
/// User preferences backed by `UserDefaults.standard`.
///
/// Every published property writes itself back to UserDefaults from its
/// `didSet`. Settings can also be exported to / imported from JSON.
public class Settings: ObservableObject {
    /// Keys shared by UserDefaults and the JSON export format. The raw strings
    /// are the persisted format and must not change.
    private enum Key {
        static let hotkeyKeyCode = "hotkeyKeyCode"
        static let hotkeyModifiers = "hotkeyModifiers"
        static let hotkeyMode = "hotkeyMode"
        static let playSounds = "playSounds"
        static let dictationTimeLimit = "dictationTimeLimit"
        static let activeModelName = "activeModelName"
        static let forcedLanguage = "forcedLanguage"
        static let insertionMethod = "insertionMethod"
        static let showPreview = "showPreview"
    }

    private let logger = Logger(category: "Settings")
    private let userDefaults = UserDefaults.standard

    // MARK: - General Settings

    @Published public var hotkey: HotkeyConfig {
        didSet { saveHotkey() }
    }

    @Published public var hotkeyMode: HotkeyMode {
        didSet { saveHotkeyMode() }
    }

    @Published public var playSounds: Bool {
        didSet { userDefaults.set(playSounds, forKey: Key.playSounds) }
    }

    /// Per-dictation limit in seconds (defaults to 600).
    @Published public var dictationTimeLimit: TimeInterval {
        didSet { userDefaults.set(dictationTimeLimit, forKey: Key.dictationTimeLimit) }
    }

    // MARK: - Model Settings

    @Published public var activeModelName: String? {
        didSet { userDefaults.set(activeModelName, forKey: Key.activeModelName) }
    }

    @Published public var forcedLanguage: String? {
        didSet { userDefaults.set(forcedLanguage, forKey: Key.forcedLanguage) }
    }

    // MARK: - Insertion Settings

    @Published public var insertionMethod: String {
        didSet { userDefaults.set(insertionMethod, forKey: Key.insertionMethod) }
    }

    @Published public var showPreview: Bool {
        didSet { userDefaults.set(showPreview, forKey: Key.showPreview) }
    }

    // MARK: - Lifecycle

    /// Loads every setting from UserDefaults, falling back to defaults for
    /// missing or unreadable values.
    public init() {
        // Local handle so no instance member is read before initialization completes.
        let defaults = UserDefaults.standard

        self.hotkey = Settings.loadHotkey()
        self.hotkeyMode = HotkeyMode(rawValue: defaults.string(forKey: Key.hotkeyMode) ?? "") ?? .pushToTalk
        self.playSounds = defaults.object(forKey: Key.playSounds) as? Bool ?? false
        self.dictationTimeLimit = defaults.object(forKey: Key.dictationTimeLimit) as? TimeInterval ?? 600 // 10 minutes
        self.activeModelName = defaults.string(forKey: Key.activeModelName)
        self.forcedLanguage = defaults.string(forKey: Key.forcedLanguage)
        self.insertionMethod = defaults.string(forKey: Key.insertionMethod) ?? "paste"
        self.showPreview = defaults.object(forKey: Key.showPreview) as? Bool ?? false

        logger.info("Settings initialized")
    }

    // MARK: - Import / Export

    /// Serializes the current settings as pretty-printed JSON.
    /// - Throws: Any error raised by `JSONSerialization`.
    public func exportSettings() throws -> Data {
        let settingsDict: [String: Any] = [
            Key.hotkeyKeyCode: hotkey.keyCode,
            Key.hotkeyModifiers: hotkey.modifiers,
            Key.hotkeyMode: hotkeyMode.rawValue,
            Key.playSounds: playSounds,
            Key.dictationTimeLimit: dictationTimeLimit,
            Key.activeModelName: activeModelName as Any,
            Key.forcedLanguage: forcedLanguage as Any,
            Key.insertionMethod: insertionMethod,
            Key.showPreview: showPreview
        ]

        return try JSONSerialization.data(withJSONObject: settingsDict, options: .prettyPrinted)
    }

    /// Applies settings from JSON produced by `exportSettings()`.
    ///
    /// Entries that are missing or have the wrong type are skipped, except
    /// `activeModelName` and `forcedLanguage`, which are always overwritten
    /// (a missing or null entry clears them).
    ///
    /// - Throws: Any `JSONSerialization` parsing error, or
    ///   `DecodingError.dataCorrupted` when the top-level JSON value is not an
    ///   object. In both cases no setting is modified.
    public func importSettings(from data: Data) throws {
        // Reject non-object JSON up front; treating it as an empty dictionary
        // would clear the optional settings and report a successful import.
        guard let settingsDict = try JSONSerialization.jsonObject(with: data) as? [String: Any] else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Settings data must contain a JSON object at the top level"
                )
            )
        }

        if let keyCode = settingsDict[Key.hotkeyKeyCode] as? UInt32,
           let modifiers = settingsDict[Key.hotkeyModifiers] as? UInt32 {
            hotkey = HotkeyConfig(keyCode: keyCode, modifiers: modifiers)
        }

        if let modeString = settingsDict[Key.hotkeyMode] as? String,
           let mode = HotkeyMode(rawValue: modeString) {
            hotkeyMode = mode
        }

        if let sounds = settingsDict[Key.playSounds] as? Bool {
            playSounds = sounds
        }

        if let timeLimit = settingsDict[Key.dictationTimeLimit] as? TimeInterval {
            dictationTimeLimit = timeLimit
        }

        activeModelName = settingsDict[Key.activeModelName] as? String
        forcedLanguage = settingsDict[Key.forcedLanguage] as? String

        if let method = settingsDict[Key.insertionMethod] as? String {
            insertionMethod = method
        }

        if let preview = settingsDict[Key.showPreview] as? Bool {
            showPreview = preview
        }

        logger.info("Settings imported successfully")
    }

    // MARK: - Persistence helpers

    /// Reads the stored hotkey, using `HotkeyConfig.default` for missing parts.
    private static func loadHotkey() -> HotkeyConfig {
        let defaults = UserDefaults.standard
        let keyCode = defaults.object(forKey: Key.hotkeyKeyCode) as? UInt32 ?? HotkeyConfig.default.keyCode
        let modifiers = defaults.object(forKey: Key.hotkeyModifiers) as? UInt32 ?? HotkeyConfig.default.modifiers
        return HotkeyConfig(keyCode: keyCode, modifiers: modifiers)
    }

    private func saveHotkey() {
        userDefaults.set(hotkey.keyCode, forKey: Key.hotkeyKeyCode)
        userDefaults.set(hotkey.modifiers, forKey: Key.hotkeyModifiers)
    }

    private func saveHotkeyMode() {
        userDefaults.set(hotkeyMode.rawValue, forKey: Key.hotkeyMode)
    }
}
|
||||||
24
Sources/CoreUtils/AppState.swift
Normal file
24
Sources/CoreUtils/AppState.swift
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// High-level application state. Raw values equal the case names.
public enum AppState: String, CaseIterable {
    case idle, listening, processing, injecting, error

    /// Label for the state. `idle`, `listening`, and `processing` are localized;
    /// the other two are fixed English text.
    public var displayName: String {
        switch self {
        case .injecting:
            return "Injecting" // Not shown in menu bar
        case .error:
            return "Error" // Not shown in menu bar
        case .idle:
            return NSLocalizedString("menubar.idle", comment: "Idle state")
        case .listening:
            return NSLocalizedString("menubar.listening", comment: "Listening state")
        case .processing:
            return NSLocalizedString("menubar.processing", comment: "Processing state")
        }
    }
}
|
||||||
51
Sources/CoreUtils/Logger.swift
Normal file
51
Sources/CoreUtils/Logger.swift
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
import Foundation
|
||||||
|
import os.log
|
||||||
|
|
||||||
|
/// Severity levels understood by `Logger`; raw values are upper-case names.
public enum LogLevel: String, CaseIterable {
    case debug = "DEBUG", info = "INFO", warning = "WARNING", error = "ERROR"
}
|
||||||
|
|
||||||
|
/// Thin wrapper over `os_log` that tags each message with a category and the
/// call site.
public class Logger {
    private let osLog: OSLog
    private let category: String

    /// Creates a logger in the `com.menuwhisper.app` subsystem.
    /// - Parameter category: Category name; also prefixed to every message.
    public init(category: String) {
        self.category = category
        self.osLog = OSLog(subsystem: "com.menuwhisper.app", category: category)
    }

    /// Logs `message` at debug level. The defaulted parameters capture the call site.
    public func debug(_ message: String, file: String = #file, function: String = #function, line: Int = #line) {
        log(level: .debug, message: message, file: file, function: function, line: line)
    }

    /// Logs `message` at info level. The defaulted parameters capture the call site.
    public func info(_ message: String, file: String = #file, function: String = #function, line: Int = #line) {
        log(level: .info, message: message, file: file, function: function, line: line)
    }

    /// Logs `message` as a warning. The defaulted parameters capture the call site.
    public func warning(_ message: String, file: String = #file, function: String = #function, line: Int = #line) {
        log(level: .warning, message: message, file: file, function: function, line: line)
    }

    /// Logs `message` at error level. The defaulted parameters capture the call site.
    public func error(_ message: String, file: String = #file, function: String = #function, line: Int = #line) {
        log(level: .error, message: message, file: file, function: function, line: line)
    }

    /// Formats the entry as `[category] message (file:function:line)` and emits
    /// it publicly with the `OSLogType` matching `level`.
    private func log(level: LogLevel, message: String, file: String, function: String, line: Int) {
        let sourceFile = URL(fileURLWithPath: file).lastPathComponent
        let entry = "[\(category)] \(message) (\(sourceFile):\(function):\(line))"

        let osType: OSLogType
        switch level {
        case .debug:
            osType = .debug
        case .info:
            osType = .info
        case .warning:
            // Warnings are emitted with the default log type.
            osType = .default
        case .error:
            osType = .error
        }

        os_log("%{public}@", log: osLog, type: osType, entry)
    }
}
|
||||||
335
TECHSPEC.md
Normal file
335
TECHSPEC.md
Normal file
|
|
@ -0,0 +1,335 @@
|
||||||
|
# Technical Definition — “Menu-Whisper” (macOS, Swift, Offline STT)
|
||||||
|
|
||||||
|
## 0) Owner Decisions (Locked)
|
||||||
|
- **Platform:** Apple Silicon only (M1/M2/M3), macOS 13+.
|
||||||
|
- **STT backends:** Start with **whisper.cpp (Metal)** for simplicity; add **Core ML** backend later.
|
||||||
|
- **Models:** Do **not** auto-download. On first run, user **chooses & downloads** a model.
|
||||||
|
- **VAD:** Post-MVP.
|
||||||
|
- **Insertion behavior:** Configurable; **direct insertion** is default (no preview).
|
||||||
|
- **Default hotkey:** **⌘⇧V** (user-configurable).
|
||||||
|
- **Punctuation:** Let the model handle punctuation automatically (no spoken commands).
|
||||||
|
- **Privacy/Connectivity:** 100% local at runtime; model downloads only when the user explicitly requests. **No telemetry**.
|
||||||
|
- **Distribution:** **.app/.dmg** (signed + notarized), outside the Mac App Store initially.
|
||||||
|
- **UI languages:** **ES/EN**.
|
||||||
|
- **Low-power mode:** Still allow downloads if the user starts them.
|
||||||
|
- **License:** **MIT**.
|
||||||
|
- **Per-dictation limit:** **10 minutes** by default (configurable).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1) Goal
|
||||||
|
A **menu bar** app for macOS that performs **offline speech-to-text** using Whisper-family models and **inserts the transcribed text** into whichever app currently has focus. Shows a minimal **HUD** while listening and processing. No internet required during normal operation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2) MVP Scope
|
||||||
|
- Persistent **menu bar** item (NSStatusItem / `MenuBarExtra`).
|
||||||
|
- **Global hotkey** (push-to-talk and toggle modes).
|
||||||
|
- **HUD** (centered NSPanel + SwiftUI):
|
||||||
|
- “Listening” with audio-level animation (RMS/peak).
|
||||||
|
- “Processing” with a spinner/animation.
|
||||||
|
- **Offline STT** with **whisper.cpp** (GGUF models; Metal acceleration on Apple Silicon).
|
||||||
|
- **Model Manager**: curated list, manual download with progress + SHA256 check, user selection.
|
||||||
|
- **Text injection**:
|
||||||
|
- Preferred: **Clipboard + ⌘V** paste.
|
||||||
|
- Fallback: **simulated typing** via CGEvent.
|
||||||
|
- If **Secure Input** is active, **do not inject**; show notice and keep text on clipboard.
|
||||||
|
- **Preferences**: hotkey & mode, model & language, insertion method, HUD styling, sounds, dictation limit.
|
||||||
|
- **Permissions onboarding**: Microphone, Accessibility, Input Monitoring.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3) Functional Requirements
|
||||||
|
|
||||||
|
### 3.1 Capture
|
||||||
|
- Prompt for permissions on first use.
|
||||||
|
- Global hotkey (default ⌘⇧V).
|
||||||
|
- **Push-to-talk**: start on key down, stop on key up.
|
||||||
|
- **Toggle**: press to start, press again to stop.
|
||||||
|
- Per-dictation limit (default 10 min, range 10 s–30 min).
|
||||||
|
|
||||||
|
### 3.2 HUD / UX
|
||||||
|
- Non-activating, centered **NSPanel** (~320×160), no focus stealing.
|
||||||
|
- **Listening**: bar-style audio visualization driven by live RMS/peak.
|
||||||
|
- **Processing**: spinner + “Transcribing…” label.
|
||||||
|
- **Esc** to cancel.
|
||||||
|
- Optional start/stop sounds (user-toggleable).
|
||||||
|
|
||||||
|
### 3.3 STT
|
||||||
|
- Backend A (MVP): **whisper.cpp** with **GGUF** and **Metal**.
|
||||||
|
- Language: auto-detect or forced (persisted).
|
||||||
|
- Basic text normalization; punctuation from the model.
|
||||||
|
- UTF-8 output; standard replacements (quotes, dashes, etc.).
|
||||||
|
|
||||||
|
### 3.4 Injection
|
||||||
|
- Preferred method: **NSPasteboard** + **CGEvent** to send ⌘V.
|
||||||
|
- Fallback: **CGEventCreateKeyboardEvent** (character-by-character), respecting active keyboard layout.
|
||||||
|
- **Secure Input**: detect with `IsSecureEventInputEnabled()`; if enabled, **do not inject**. Show a non-intrusive notice and leave the text on the clipboard.
|
||||||
|
|
||||||
|
### 3.5 Preferences
|
||||||
|
- **General:** hotkey + mode (push/toggle), sounds, HUD options.
|
||||||
|
- **Models:** catalog, download, select active model, language, local storage path.
|
||||||
|
- **Insertion:** direct vs preview (preview **off** by default), paste vs type.
|
||||||
|
- **Advanced:** limits, performance knobs (threads/batch), **local** logs opt-in.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4) Non-Functional Requirements
|
||||||
|
- **Offline** execution after models are installed.
|
||||||
|
- **Latency target** (M1 + “small” model): < 4 s for 10 s of audio.
|
||||||
|
- **Memory target:** ~1.5–2.5 GB with “small”.
|
||||||
|
- **Privacy:** audio and text never leave the device.
|
||||||
|
- **Accessibility:** sufficient contrast; VoiceOver labels; focus never stolen by HUD.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5) Architecture (High-Level)
|
||||||
|
- **App (SwiftUI)** with AppKit bridges for NSStatusItem and NSPanel.
|
||||||
|
- **Shortcut Manager** (Carbon `RegisterEventHotKey` or HotKey/MASShortcut).
|
||||||
|
- **Audio**: AVAudioEngine (downsample to 16 kHz mono, 16-bit PCM).
|
||||||
|
- **STT Engine**:
|
||||||
|
- **whisper.cpp** (C/C++ via SPM/CMake) with Metal.
|
||||||
|
- **Core ML backend** (e.g., WhisperKit / custom) in a later phase.
|
||||||
|
- **Model Manager**: curated catalog, downloads (progress + SHA256), selection, caching.
|
||||||
|
- **Text Injection**: pasteboard + CGEvent; typing fallback; Secure Input detection.
|
||||||
|
- **Permissions Manager**: guided flows to System Settings panes.
|
||||||
|
- **Settings**: UserDefaults + JSON export/import.
|
||||||
|
- **Packaging**: .app + .dmg (signed & notarized).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6) Main Flow
|
||||||
|
1. User presses global hotkey.
|
||||||
|
2. Check permissions; guide if missing.
|
||||||
|
3. Show HUD → **Listening**; start capture.
|
||||||
|
4. Stop (key up/toggle/timeout).
|
||||||
|
5. HUD → **Processing**; run STT in background.
|
||||||
|
6. On result → (optional preview) → **insert** (paste) or **fallback** (type). If Secure Input, **do not inject**; keep in clipboard + show notice.
|
||||||
|
7. Close HUD → **Idle**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7) Finite State Machine (FSM)
|
||||||
|
- **Idle** → (Hotkey) → **Listening**
|
||||||
|
- **Listening** → (Stop/Timeout) → **Processing**
|
||||||
|
- **Processing** → (Done) → **Injecting**
|
||||||
|
- **Injecting** → (Done) → **Idle**
|
||||||
|
- Any → (Error) → **ErrorModal** → **Idle**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8) Model Management (Manual Downloads)
|
||||||
|
**Goal:** Offer a clear list of **free** Whisper-family models (names, sizes, languages, recommended backend) with one-click downloads. No automatic downloads.
|
||||||
|
|
||||||
|
### 8.1 OpenAI Whisper (official weights)
|
||||||
|
- Families: **tiny**, **base**, **small**, **medium**, **large-v2**, **large-v3** (multilingual; some `.en` variants).
|
||||||
|
- Usable with **whisper.cpp** via **GGUF** (community conversions widely available).
|
||||||
|
|
||||||
|
### 8.2 Whisper for whisper.cpp (converted GGUF)
|
||||||
|
- Community-maintained conversions for whisper.cpp (GGUF), optimized for CPU/GPU Metal on macOS.
|
||||||
|
|
||||||
|
### 8.3 Faster-Whisper (CTranslate2)
|
||||||
|
- Optimized variants (tiny/base/small/medium/large-v2/large-v3). Useful if a CT2-based or Core-ML-assisted backend is added later.
|
||||||
|
|
||||||
|
### 8.4 Distil-Whisper (distilled)
|
||||||
|
- Distilled models (e.g., **distil-large-v2/v3/v3.5**, **distil-small.en**), significantly smaller/faster with near-large accuracy.
|
||||||
|
|
||||||
|
> **UI must show:** model file size, languages, license, **RAM estimate**, and a warning if a large model is selected on lower-memory machines.
|
||||||
|
|
||||||
|
**Optional JSON Schema for catalog entries (for the app’s first-run picker):**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"name": "whisper-small",
|
||||||
|
"family": "OpenAI-Whisper",
|
||||||
|
"format": "gguf",
|
||||||
|
"size_mb": 466,
|
||||||
|
"languages": ["multilingual"],
|
||||||
|
"recommended_backend": "whisper.cpp",
|
||||||
|
"quality_tier": "small",
|
||||||
|
"license": "MIT",
|
||||||
|
"sha256": "…",
|
||||||
|
"download_url": "…",
|
||||||
|
"notes": "Good balance of speed/accuracy on M1/M2."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9) Security & Permissions
|
||||||
|
|
||||||
|
* **Info.plist:** `NSMicrophoneUsageDescription`.
|
||||||
|
* **Accessibility & Input Monitoring:** required for CGEvent; provide clear step-by-step guidance and deep-links.
|
||||||
|
* **Secure Input:** check `IsSecureEventInputEnabled()`; **never** attempt to bypass. Provide help text to identify apps that enable it (password fields, 2FA prompts, etc.).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10) Performance
|
||||||
|
|
||||||
|
* Lazy-load and reuse model (warm cache).
|
||||||
|
* Real-time downsampling to 16 kHz mono; chunked streaming into backend.
|
||||||
|
* Configurable threads; prefer **Metal** path on Apple Silicon.
|
||||||
|
* “Fast path” tweaks for short clips (<15 s).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11) Logging & Privacy
|
||||||
|
|
||||||
|
* **No remote telemetry.**
|
||||||
|
* Local logs **opt-in** (timings, errors only). Never store audio/text unless user explicitly enables a debug flag.
|
||||||
|
* “Wipe local data” button (models remain unless the user removes them).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12) Internationalization
|
||||||
|
|
||||||
|
* UI in **Spanish** and **English** (Localizable.strings).
|
||||||
|
* STT multilingual; language auto or forced per user preference.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 13) Testing (Minimum)
|
||||||
|
|
||||||
|
* macOS 13/14/15 on M1/M2/M3.
|
||||||
|
* Injection works in Safari, Chrome, Notes, VS Code, Terminal, iTerm2, Mail.
|
||||||
|
* **Secure Input**: correctly detected; no injection; clipboard + notice.
|
||||||
|
* Meet latency target with **small** model on M1.
|
||||||
|
* Model download & selection flows (simulate network errors).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 14) Phased Plan (AI-Deliverables)
|
||||||
|
|
||||||
|
### Phase 0 — Scaffolding (MVP-0)
|
||||||
|
|
||||||
|
**Goal:** Base project + menubar.
|
||||||
|
**Deliverables:**
|
||||||
|
|
||||||
|
* SwiftUI app with `MenuBarExtra`, microphone icon, “Idle” state.
|
||||||
|
* `ARCHITECTURE.md` describing modules (Audio/STT/Injection/Models/Permissions/Settings).
|
||||||
|
* Build scripts and signing/notarization templates.
|
||||||
|
**DoD:** Compiles; menu bar item visible; SPM structure ready.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 1 — Hotkey + HUD + Audio (MVP-1)
|
||||||
|
|
||||||
|
**Goal:** Listening UX without real STT.
|
||||||
|
**Deliverables:**
|
||||||
|
|
||||||
|
* Global hotkey (default ⌘⇧V) with **push** and **toggle**.
|
||||||
|
* NSPanel HUD (Listening/Processing) + **real** RMS bars from AVAudioEngine.
|
||||||
|
* Per-dictation limit (default 10 min).
|
||||||
|
**DoD:** Live meter responds to mic; correct state transitions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2 — STT via whisper.cpp (MVP-2)
|
||||||
|
|
||||||
|
**Goal:** Real offline transcription.
|
||||||
|
**Deliverables:**
|
||||||
|
|
||||||
|
* **whisper.cpp** module (C/C++), background inference with **Metal**.
|
||||||
|
* **Model Manager** (curated list, download with SHA256, selection).
|
||||||
|
* Language auto/forced; basic normalization.
|
||||||
|
**DoD:** 10-second clip → coherent ES/EN text offline; meets timing targets.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 3 — Robust Insertion (MVP-3)
|
||||||
|
|
||||||
|
**Goal:** Reliable insertion into focused app.
|
||||||
|
**Deliverables:**
|
||||||
|
|
||||||
|
* Paste (clipboard + ⌘V) and typing fallback.
|
||||||
|
* **Secure Input** detection; safe behavior (no injection, clipboard + notice).
|
||||||
|
**DoD:** Works across target apps; correct Secure Input handling.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 4 — Preferences + UX Polish (MVP-4)
|
||||||
|
|
||||||
|
**Goal:** Complete options & stability.
|
||||||
|
**Deliverables:**
|
||||||
|
|
||||||
|
* Full Preferences (hotkey, modes, model, language, insertion, HUD, sounds).
|
||||||
|
* Optional preview dialog (off by default).
|
||||||
|
* Config export/import (JSON).
|
||||||
|
**DoD:** All settings persist and are honored.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 5 — Distribution (MVP-5)
|
||||||
|
|
||||||
|
**Goal:** Installable package.
|
||||||
|
**Deliverables:**
|
||||||
|
|
||||||
|
* Error handling; permission prompts & help (incl. Secure Input troubleshooting).
|
||||||
|
* **.dmg** (signed + notarized) and install guide.
|
||||||
|
* **USER\_GUIDE.md** + **TROUBLESHOOTING.md**.
|
||||||
|
**DoD:** Clean install on test machines; distribution checklist passed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 6 — Core ML Backend (Post-MVP)
|
||||||
|
|
||||||
|
**Goal:** Second backend.
|
||||||
|
**Deliverables:**
|
||||||
|
|
||||||
|
* **Core ML** integration (e.g., WhisperKit or custom conversion).
|
||||||
|
* Backend selector (whisper.cpp/Core ML) in Preferences; local benchmarks table.
|
||||||
|
**DoD:** Feature parity and stability; documented pros/cons.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 15) Mini-Prompts for the Builder AI (per Phase)
|
||||||
|
|
||||||
|
* **P0:** “Create macOS 13+ SwiftUI menubar app (`MenuBarExtra`), microphone icon, SPM layout with modules in `ARCHITECTURE.md`.”
|
||||||
|
* **P1:** “Add global hotkey (push & toggle) with `RegisterEventHotKey`; NSPanel HUD with RMS bars from AVAudioEngine; 10-minute dictation limit.”
|
||||||
|
* **P2:** “Integrate **whisper.cpp** (Metal); add Model Manager (curated list, SHA256-verified downloads, selection); language auto/forced; transcribe WAV 16 kHz mono.”
|
||||||
|
* **P3:** “Implement insertion: pasteboard+⌘V and CGEvent typing fallback; detect `IsSecureEventInputEnabled()` and avoid injection.”
|
||||||
|
* **P4:** “Implement full Preferences, optional preview, JSON export/import; UX polish and messages.”
|
||||||
|
* **P5:** “Signing + notarization; produce .dmg; write USER\_GUIDE and TROUBLESHOOTING (with Secure Input section).”
|
||||||
|
* **P6:** “Add Core ML backend (WhisperKit/custom), backend selector, and local benchmarks.”
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 16) Suggested Repo Layout
|
||||||
|
|
||||||
|
```
|
||||||
|
MenuWhisper/
|
||||||
|
Sources/
|
||||||
|
App/ # SwiftUI + AppKit bridges
|
||||||
|
Core/
|
||||||
|
Audio/ # AVAudioEngine capture + meters
|
||||||
|
STT/
|
||||||
|
WhisperCPP/ # C/C++ wrapper + Metal path
|
||||||
|
CoreML/ # post-MVP
|
||||||
|
Models/ # catalog, downloads, hashes
|
||||||
|
Injection/ # clipboard, CGEvent typing, secure input checks
|
||||||
|
Permissions/
|
||||||
|
Settings/
|
||||||
|
Utils/
|
||||||
|
Resources/ # icons, sounds, localizations
|
||||||
|
Docs/ # ARCHITECTURE.md, USER_GUIDE.md, TROUBLESHOOTING.md
|
||||||
|
Scripts/ # build, sign, notarize
|
||||||
|
Tests/ # unit + integration
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 17) Risks & Mitigations
|
||||||
|
|
||||||
|
* **Hotkey collision (⌘⇧V)** with “Paste and Match Style” in some apps → make it discoverable & easily rebindable; warn on conflict.
|
||||||
|
* **Secure Input** blocks injection → inform the user, keep text on clipboard, provide help to identify the app enabling it.
|
||||||
|
* **RAM/latency** with large models → recommend **small/base** by default; show RAM/latency hints in the model picker.
|
||||||
|
* **Keyboard layouts** → prefer paste; if typing, map using the active layout.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 18) Global MVP Definition of Done
|
||||||
|
|
||||||
|
* A 30–90 s dictation yields accurate ES/EN text **offline** and inserts correctly in common apps.
|
||||||
|
* Secure Input is correctly detected and handled.
|
||||||
|
* Model download/selection is robust and user-driven.
|
||||||
|
* Shippable **.dmg** (signed + notarized) and clear docs included.
|
||||||
223
TODO.md
Normal file
223
TODO.md
Normal file
|
|
@ -0,0 +1,223 @@
|
||||||
|
```markdown
|
||||||
|
# TODO — Menu-Whisper (macOS, Swift, Offline STT)
|
||||||
|
|
||||||
|
This file tracks the tasks needed to deliver the app in **phases** with clear acceptance checks.
|
||||||
|
Conventions:
|
||||||
|
- `[ ]` = to do, `[x]` = done
|
||||||
|
- **AC** = Acceptance Criteria
|
||||||
|
- All features must work **offline** after models are installed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Global / Project-Wide
|
||||||
|
|
||||||
|
- [x] Set project license to **MIT** and add `LICENSE` file.
|
||||||
|
- [x] Add `README.md` with high-level summary, build requirements (Xcode, macOS 13+), Apple Silicon-only note.
|
||||||
|
- [x] Add `Docs/ARCHITECTURE.md` skeleton (to be filled in Phase 0).
|
||||||
|
- [x] Create base **localization** scaffolding (`en.lproj`, `es.lproj`) with `Localizable.strings`.
|
||||||
|
- [x] Add SwiftPM structure with separate targets for `App`, `Core/*` modules.
|
||||||
|
- [x] Prepare optional tooling:
|
||||||
|
- [x] SwiftFormat / SwiftLint config (opt-in).
|
||||||
|
- [x] GitHub Actions macOS runner for **build-only** CI (optional).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 0 — Scaffolding (MVP-0)
|
||||||
|
|
||||||
|
**Goal:** Base project + menu bar item; structure and docs.
|
||||||
|
|
||||||
|
### Tasks
|
||||||
|
- [x] Create SwiftUI macOS app (macOS 13+) with `MenuBarExtra` / `NSStatusItem`.
|
||||||
|
- [x] Add placeholder mic icon (template asset).
|
||||||
|
- [x] Create module targets:
|
||||||
|
- [x] `Core/Audio`
|
||||||
|
- [x] `Core/STT` (with subfolders `WhisperCPP` and a stub `CoreML`)
|
||||||
|
- [x] `Core/Models`
|
||||||
|
- [x] `Core/Injection`
|
||||||
|
- [x] `Core/Permissions`
|
||||||
|
- [x] `Core/Settings`
|
||||||
|
- [x] `Core/Utils`
|
||||||
|
- [x] Wire a minimal state machine: `Idle` state shown in menubar menu.
|
||||||
|
- [x] Add scripts:
|
||||||
|
- [x] `Scripts/build.sh` (SPM/Xcodebuild)
|
||||||
|
- [x] `Scripts/notarize.sh` (stub with placeholders for later)
|
||||||
|
- [x] Write `Docs/ARCHITECTURE.md` (modules, data flow, FSM diagram).
|
||||||
|
|
||||||
|
### AC
|
||||||
|
- [x] Project compiles and shows a **menu bar** icon with a basic menu.
|
||||||
|
- [x] Repo has clear structure and architecture doc.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 1 — Hotkey + HUD + Audio (MVP-1)
|
||||||
|
|
||||||
|
**Goal:** Listening UX without real STT.
|
||||||
|
|
||||||
|
### Tasks
|
||||||
|
- [ ] Implement **global hotkey** manager:
|
||||||
|
- [ ] Default **⌘⇧V** (configurable later).
|
||||||
|
- [ ] Support **push-to-talk** (start on key down, stop on key up).
|
||||||
|
- [ ] Support **toggle** (press to start, press to stop).
|
||||||
|
- [ ] Create **HUD** as non-activating centered `NSPanel`:
|
||||||
|
- [ ] State **Listening** with **RMS/peak bars** animation (SwiftUI view).
|
||||||
|
- [ ] State **Processing** with spinner/label.
|
||||||
|
- [ ] Dismiss/cancel with **Esc**.
|
||||||
|
- [ ] Implement **AVAudioEngine** capture:
|
||||||
|
- [ ] Tap on input bus; compute RMS/peak for visualization.
|
||||||
|
- [ ] Resample path ready for 16 kHz mono PCM (no STT yet).
|
||||||
|
- [ ] Add dictation **time limit** (default **10 min**, configurable later).
|
||||||
|
- [ ] Optional **sounds** for start/stop (toggle in settings later).
|
||||||
|
- [ ] Permissions onboarding:
|
||||||
|
- [ ] Request **Microphone** permission with Info.plist string.
|
||||||
|
- [ ] Show guide for **Accessibility** and **Input Monitoring** (no hard gating yet).
|
||||||
|
|
||||||
|
### AC
|
||||||
|
- [ ] Hotkey works in both modes (push/toggle) across desktop & full-screen apps.
|
||||||
|
- [ ] HUD appears centered; **Listening** shows live bars; **Processing** shows spinner.
|
||||||
|
- [ ] Cancel (Esc) reliably stops listening and hides HUD.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 2 — STT via whisper.cpp (MVP-2)
|
||||||
|
|
||||||
|
**Goal:** Real offline transcription (Apple Silicon + Metal).
|
||||||
|
|
||||||
|
### Tasks
|
||||||
|
- [ ] Add **whisper.cpp** integration:
|
||||||
|
- [ ] Vendor/SwiftPM/Wrapper target for C/C++.
|
||||||
|
- [ ] Build with **Metal** path enabled on Apple Silicon.
|
||||||
|
- [ ] Define `STTEngine` protocol and `WhisperCPPEngine` implementation.
|
||||||
|
- [ ] Audio pipeline:
|
||||||
|
- [ ] Convert captured audio to **16 kHz mono** 16-bit PCM.
|
||||||
|
- [ ] Chunking/streaming into STT worker; end-of-dictation triggers transcription.
|
||||||
|
- [ ] **Model Manager** (backend + minimal UI):
|
||||||
|
- [ ] Bundle a **curated JSON catalog** (name, size, languages, license, URL, SHA256).
|
||||||
|
- [ ] Download via `URLSession` with progress + resume support.
|
||||||
|
- [ ] Validate **SHA256**; store under `~/Library/Application Support/MenuWhisper/Models`.
|
||||||
|
- [ ] Allow **select active model**; persist selection.
|
||||||
|
- [ ] Language: **auto** or **forced** (persist).
|
||||||
|
- [ ] Text normalization pass (basic replacements; punctuation from model).
|
||||||
|
- [ ] Error handling (network failures, disk full, missing model).
|
||||||
|
- [ ] Performance knobs (threads, GPU toggle if exposed by backend).
|
||||||
|
|
||||||
|
### AC
|
||||||
|
- [ ] A **10 s** clip produces coherent **ES/EN** text **offline**.
|
||||||
|
- [ ] Latency target: **< 4 s** additional for 10 s clip on M1 with **small** model.
|
||||||
|
- [ ] Memory: ~**1.5–2.5 GB** with small model without leaks.
|
||||||
|
- [ ] Model download: progress UI + SHA256 verification + selection works.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 3 — Robust Text Insertion (MVP-3)
|
||||||
|
|
||||||
|
**Goal:** Insert text into focused app safely; handle Secure Input.
|
||||||
|
|
||||||
|
### Tasks
|
||||||
|
- [ ] Implement **Paste** method:
|
||||||
|
- [ ] Put text on **NSPasteboard** (general).
|
||||||
|
- [ ] Send **⌘V** via CGEvent to focused app.
|
||||||
|
- [ ] Implement **Typing** fallback:
|
||||||
|
- [ ] Generate per-character **CGEvent**; respect active keyboard layout.
|
||||||
|
- [ ] Handle `\n`, `\t`, and common unicode safely.
|
||||||
|
- [ ] Detect **Secure Input**:
|
||||||
|
- [ ] Use `IsSecureEventInputEnabled()` (or accepted API) check before injection.
|
||||||
|
- [ ] If enabled: **do not inject**; keep text on clipboard; show non-blocking notice.
|
||||||
|
- [ ] Add preference for **insertion method** (Paste preferred) + fallback strategy.
|
||||||
|
- [ ] Add **Permissions** helpers for Accessibility/Input Monitoring (deep links).
|
||||||
|
- [ ] Compatibility tests: Safari, Chrome, Notes, VS Code, Terminal, iTerm2, Mail.
|
||||||
|
|
||||||
|
### AC
|
||||||
|
- [ ] Text reliably appears in the currently focused app via Paste.
|
||||||
|
- [ ] If Paste is blocked, Typing fallback works (except in Secure Input).
|
||||||
|
- [ ] When **Secure Input** is active: no injection occurs; clipboard contains the text; user is informed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 4 — Preferences + UX Polish (MVP-4)
|
||||||
|
|
||||||
|
**Goal:** Complete options, localization, and stability.
|
||||||
|
|
||||||
|
### Tasks
|
||||||
|
- [ ] Full **Preferences** window:
|
||||||
|
- [ ] Hotkey recorder (change ⌘⇧V if needed).
|
||||||
|
- [ ] Mode: Push-to-talk / Toggle.
|
||||||
|
- [ ] Model picker: list, **download**, **delete**, **set active**, show size/language/license.
|
||||||
|
- [ ] Language: Auto / Forced (dropdown).
|
||||||
|
- [ ] Insertion: **Direct** (default) vs **Preview**; Paste vs Typing preference.
|
||||||
|
- [ ] HUD: opacity/size, show/hide sounds toggles.
|
||||||
|
- [ ] Dictation limit: editable (default 10 min).
|
||||||
|
- [ ] Advanced: threads/batch; **local logs opt-in**.
|
||||||
|
- [ ] **Export/Import** settings (JSON).
|
||||||
|
- [ ] Implement **Preview** dialog (off by default): shows transcribed text with **Insert** / **Cancel**.
|
||||||
|
- [ ] Expand **localization** (ES/EN) for all UI strings.
|
||||||
|
- [ ] Onboarding & help views (permissions, Secure Input explanation).
|
||||||
|
- [ ] Persist all settings in `UserDefaults`; validate on load; migrate if needed.
|
||||||
|
- [ ] UX polish: icons, animation timing, keyboard navigation, VoiceOver labels.
|
||||||
|
- [ ] Optional: internal **timing instrumentation** (guarded by logs opt-in).
|
||||||
|
|
||||||
|
### AC
|
||||||
|
- [ ] All preferences persist and take effect without relaunch.
|
||||||
|
- [ ] Preview (when enabled) allows quick edit & insertion.
|
||||||
|
- [ ] ES/EN localization passes a manual spot-check.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 5 — Distribution (MVP-5)
|
||||||
|
|
||||||
|
**Goal:** Shippable, signed/notarized .dmg, user docs.
|
||||||
|
|
||||||
|
### Tasks
|
||||||
|
- [ ] Hardened runtime, entitlements, Info.plist:
|
||||||
|
- [ ] `NSMicrophoneUsageDescription`
|
||||||
|
- [ ] Review for any additional required entitlements.
|
||||||
|
- [ ] **Code signing** with Developer ID; set team identifiers.
|
||||||
|
- [ ] **Notarization** using `notarytool`; **staple** on success.
|
||||||
|
- [ ] Build **.app** and create **.dmg**:
|
||||||
|
- [ ] DMG background, /Applications symlink, icon.
|
||||||
|
- [ ] Write **Docs/USER_GUIDE.md** (first run, downloading models, dictation flow).
|
||||||
|
- [ ] Write **Docs/TROUBLESHOOTING.md** (permissions, Secure Input, model space/RAM issues).
|
||||||
|
- [ ] QA matrix:
|
||||||
|
- [ ] macOS **13/14/15**, Apple Silicon **M1/M2/M3**.
|
||||||
|
- [ ] Target apps list (insertion works).
|
||||||
|
- [ ] Offline check (network disabled).
|
||||||
|
- [ ] Prepare **VERSIONING** notes and changelog (loosely following semantic versioning).
|
||||||
|
|
||||||
|
### AC
|
||||||
|
- [ ] Signed & **notarized** .dmg installs cleanly.
|
||||||
|
- [ ] App functions **entirely offline** post-model download.
|
||||||
|
- [ ] Guides are complete and reference all common pitfalls.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 6 — Core ML Backend (Post-MVP)
|
||||||
|
|
||||||
|
**Goal:** Second STT backend and selector.
|
||||||
|
|
||||||
|
### Tasks
|
||||||
|
- [ ] Evaluate **Core ML** path (e.g., WhisperKit or custom Core ML models).
|
||||||
|
- [ ] Implement `STTEngineCoreML` conforming to `STTEngine` protocol.
|
||||||
|
- [ ] Backend **selector** in Preferences; runtime switching.
|
||||||
|
- [ ] Ensure **feature parity** (language settings, output normalization).
|
||||||
|
- [ ] **Benchmarks**: produce local latency/memory table across small/base/medium.
|
||||||
|
- [ ] Errors & fallbacks (if model missing, surface helpful guidance).
|
||||||
|
|
||||||
|
### AC
|
||||||
|
- [ ] Both backends run on Apple Silicon; user can switch backends.
|
||||||
|
- [ ] Comparable outputs; documented pros/cons and performance data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Backlog / Post-MVP Options
|
||||||
|
|
||||||
|
- [ ] **VAD (WebRTC)**: auto-stop on silence with thresholds.
|
||||||
|
- [ ] **Continuous dictation** with smart segmentation.
|
||||||
|
- [ ] **Noise suppression** and AGC in the audio pipeline.
|
||||||
|
- [ ] **Login item** (auto-launch at login).
|
||||||
|
- [ ] **Sparkle** or custom updater (if desirable outside App Store).
|
||||||
|
- [ ] **Settings profiles** (per-language/model presets).
|
||||||
|
- [ ] **In-app model catalog refresh** (remote JSON update).
|
||||||
|
- [ ] **Advanced insertion rules** (per-app behavior).
|
||||||
|
- [ ] **Analytics viewer** for local logs (no telemetry).
|
||||||
|
|
||||||
|
---
|
||||||
|
```
|
||||||
10
Tests/CoreAudioTests/AudioEngineTests.swift
Normal file
10
Tests/CoreAudioTests/AudioEngineTests.swift
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
import XCTest
|
||||||
|
@testable import MenuWhisperAudio
|
||||||
|
|
||||||
|
/// Smoke tests for `AudioEngine`.
final class AudioEngineTests: XCTestCase {
    /// A freshly created engine must not report that it is capturing.
    func testAudioEngineInitialization() {
        let engine = AudioEngine()

        XCTAssertFalse(engine.isCapturing)
    }
}
|
||||||
9
Tests/CoreInjectionTests/TextInjectorTests.swift
Normal file
9
Tests/CoreInjectionTests/TextInjectorTests.swift
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
import XCTest
|
||||||
|
@testable import CoreInjection
|
||||||
|
|
||||||
|
/// Smoke tests for `TextInjector`.
final class TextInjectorTests: XCTestCase {
    /// Constructing an injector with its default initializer yields a value.
    func testTextInjectorInitialization() {
        XCTAssertNotNil(TextInjector())
    }
}
|
||||||
10
Tests/CoreModelsTests/ModelManagerTests.swift
Normal file
10
Tests/CoreModelsTests/ModelManagerTests.swift
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
import XCTest
|
||||||
|
@testable import CoreModels
|
||||||
|
|
||||||
|
/// Smoke tests for `ModelManager`.
final class ModelManagerTests: XCTestCase {
    /// A freshly created manager exposes no available models.
    func testModelManagerInitialization() {
        let manager = ModelManager()

        XCTAssertEqual(manager.availableModels.count, 0)
    }
}
|
||||||
9
Tests/CorePermissionsTests/PermissionManagerTests.swift
Normal file
9
Tests/CorePermissionsTests/PermissionManagerTests.swift
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
import XCTest
|
||||||
|
@testable import CorePermissions
|
||||||
|
|
||||||
|
/// Smoke tests for `PermissionManager`.
final class PermissionManagerTests: XCTestCase {
    /// Constructing a manager with its default initializer yields a value.
    func testPermissionManagerInitialization() {
        XCTAssertNotNil(PermissionManager())
    }
}
|
||||||
10
Tests/CoreSTTTests/STTEngineTests.swift
Normal file
10
Tests/CoreSTTTests/STTEngineTests.swift
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
import XCTest
|
||||||
|
@testable import CoreSTT
|
||||||
|
|
||||||
|
/// Smoke tests for the speech-to-text engines.
final class STTEngineTests: XCTestCase {
    /// A freshly created whisper.cpp engine reports that no model is loaded.
    func testWhisperCPPEngineInitialization() {
        let engine = WhisperCPPEngine()

        XCTAssertFalse(engine.isModelLoaded())
    }
}
|
||||||
10
Tests/CoreSettingsTests/SettingsTests.swift
Normal file
10
Tests/CoreSettingsTests/SettingsTests.swift
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
import XCTest
|
||||||
|
@testable import CoreSettings
|
||||||
|
|
||||||
|
/// Smoke tests for `Settings`.
final class SettingsTests: XCTestCase {
    /// Freshly created settings use push-to-talk as the hotkey mode.
    func testSettingsInitialization() {
        let settings = Settings()

        XCTAssertEqual(settings.hotkeyMode, .pushToTalk)
    }
}
|
||||||
17
Tests/CoreUtilsTests/LoggerTests.swift
Normal file
17
Tests/CoreUtilsTests/LoggerTests.swift
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
import XCTest
|
||||||
|
@testable import CoreUtils
|
||||||
|
|
||||||
|
/// Smoke tests for `Logger`.
///
/// These tests make no assertions; they only exercise the calls and so fail
/// solely if a call traps.
final class LoggerTests: XCTestCase {
    /// Creates a logger and emits a single info message.
    func testLoggerInitialization() {
        let sut = Logger(category: "Test")

        sut.info("Test message")
    }

    /// Emits one message through each of the four level-specific entry points.
    func testLoggerLevels() {
        let sut = Logger(category: "Test")

        sut.debug("Debug message")
        sut.info("Info message")
        sut.warning("Warning message")
        sut.error("Error message")
    }
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue