# Voxtype Configuration
#
# Location: ~/.config/voxtype/config.toml
# All settings can be overridden via CLI flags

# Transcription engine: "whisper" (default) or "parakeet"
# Whisper: whisper.cpp via whisper-rs (most compatible)
# Parakeet: NVIDIA FastConformer via ONNX Runtime (requires --features parakeet)
# engine = "whisper"

# State file for external integrations (Waybar, polybar, etc.)
# Use "auto" for default location ($XDG_RUNTIME_DIR/voxtype/state),
# a custom path, or "disabled" to turn off. The daemon writes state
# ("idle", "recording", "transcribing") to this file whenever it changes.
# Required for `voxtype record toggle` and `voxtype status` commands.
state_file = "auto"

[hotkey]
# Built-in hotkey using evdev (Linux input subsystem)
#
# Most users should leave this disabled and use compositor keybindings instead:
#   - Hyprland: bind/bindr in hyprland.conf
#   - Sway: bindsym --no-repeat/--release in config
#   - River: riverctl map / map -release
#
# Enable this if you're on X11, using a compositor without key-release support,
# or prefer a dedicated key like ScrollLock. Requires 'input' group membership.
enabled = false

# Key to hold for push-to-talk (when enabled = true)
# Common choices: SCROLLLOCK, PAUSE, RIGHTALT, F13-F24
# Use `evtest` to find key names for your keyboard
key = "SCROLLLOCK"

# Optional modifier keys that must also be held
# Example: modifiers = ["LEFTCTRL", "LEFTALT"]
modifiers = []

# Activation mode: "push_to_talk" or "toggle"
#   - push_to_talk: Hold hotkey to record, release to transcribe (default)
#   - toggle: Press hotkey once to start recording, press again to stop
# mode = "push_to_talk"

[audio]
# Audio input device ("default" uses system default)
# List devices with: pactl list sources short
device = "default"

# Sample rate in Hz (whisper expects 16000)
sample_rate = 16000

# Maximum recording duration in seconds (safety limit)
max_duration_secs = 60

# [audio.feedback]
# Enable audio feedback sounds (beeps when recording starts/stops)
# enabled = true
#
# Sound theme: "default", "subtle", "mechanical", or path to custom theme directory
# theme = "default"
#
# Volume level (0.0 to 1.0)
# volume = 0.7

[whisper]
# Execution mode: "local" (default), "remote", or "cli"
#   - local: Use whisper.cpp locally via FFI
#   - remote: Send audio to OpenAI-compatible API endpoint
#   - cli: Run whisper-cli as a subprocess (see "CLI mode settings" below)
# mode = "local"

# Model to use for transcription (local mode)
# Options: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v3, large-v3-turbo
# .en models are English-only but faster and more accurate for English
# Or provide absolute path to a custom .bin model file
model = "base.en"

# Language for transcription
# Use "en" for English, "auto" for auto-detection
# See: https://github.com/openai/whisper#available-models-and-languages
language = "en"

# Translate non-English speech to English
translate = false

# Number of CPU threads for inference (omit for auto-detection)
# threads = 4

# Load model on-demand when recording starts (true) or keep loaded (false)
# When true, model is loaded when recording starts and unloaded after transcription
# When false, model is kept in memory for faster response times (default)
on_demand_loading = false

# GPU memory isolation mode
# Enable on laptops with hybrid graphics to let dGPU sleep between transcriptions.
# gpu_isolation = true

# Context window optimization for short recordings
# Speeds up transcription for clips under 22.5 seconds. Disabled by default
# because some models (especially large-v3/turbo) may experience repetition
# loops. Enable if you want faster transcription and don't experience issues.
# context_window_optimization = true

# --- Remote mode settings (used when mode = "remote") ---
# remote_endpoint = "http://192.168.1.100:8080"  # Required
# remote_model = "whisper-1"  # Model to request from server
# remote_api_key = "sk-..."  # Or use VOXTYPE_WHISPER_API_KEY env var
# remote_timeout_secs = 30

# --- CLI mode settings (used when mode = "cli") ---
# Uses whisper-cli subprocess instead of whisper-rs FFI bindings.
# Fallback for systems where whisper-rs crashes (e.g., glibc 2.42+ on Ubuntu 25.10).
# Requires whisper-cli from whisper.cpp: https://github.com/ggerganov/whisper.cpp
# whisper_cli_path = "/usr/local/bin/whisper-cli"  # Optional, searches PATH if not set

# [parakeet]
# Parakeet configuration (only used when engine = "parakeet")
# Requires: cargo build --features parakeet
#
# Model name (from ~/.local/share/voxtype/models/) or absolute path
# model = "parakeet-tdt-0.6b-v3"
#
# Model type: "tdt" (recommended, proper punctuation) or "ctc" (faster, char-level)
# Auto-detected from model directory if not specified
# model_type = "tdt"
#
# on_demand_loading = false

[output]
# Primary output mode: "type", "clipboard", or "paste"
#   - type: Simulates keyboard input at cursor position (requires wtype or ydotool)
#   - clipboard: Copies text to clipboard (requires wl-copy)
#   - paste: Copies to clipboard then simulates paste keystroke (requires wl-copy and wtype or ydotool)
mode = "type"

# Fall back to clipboard if typing fails
fallback_to_clipboard = true

# Delay between typed characters in milliseconds
# 0 = fastest possible, increase if characters are dropped
type_delay_ms = 0

# Delay before typing starts (ms)
# Allows virtual keyboard to initialize. Some users report this helps prevent
# the first character from being dropped on text insertion. Try 100-200ms.
# Note: When using compositor integration (via `voxtype setup compositor`),
# best results come from not binding Escape in the submap. Some users have
# had success with Escape bound by increasing this delay, but the most
# consistent fix is to use F12 or another key instead.
pre_type_delay_ms = 0

# Keystroke for paste mode (when mode = "paste")
# Default is "ctrl+v". Change for environments with different paste shortcuts.
# Examples:
#   paste_keys = "ctrl+v"        # Standard (default)
#   paste_keys = "shift+insert"  # Hyprland/Omarchy universal paste
#   paste_keys = "ctrl+shift+v"  # Some terminal emulators
# paste_keys = "ctrl+v"

# Compositor integration hooks
# Use `voxtype setup compositor hyprland|sway|river` for automatic setup.
#
# pre_recording_command: Runs when recording starts. Switch to a submap/mode
#   where F12 can cancel recording/transcription.
# pre_output_command: Runs before typing output. Switch to a submap/mode that
#   blocks modifier keys from triggering compositor shortcuts during typing.
# post_output_command: Runs after typing output. Reset to normal mode.
#
# Example (Hyprland):
# pre_recording_command = "hyprctl dispatch submap voxtype_recording"
# pre_output_command = "hyprctl dispatch submap voxtype_suppress"
# post_output_command = "hyprctl dispatch submap reset"

[output.notification]
# Show notification when recording starts (hotkey pressed)
on_recording_start = false

# Show notification when recording stops (transcription beginning)
on_recording_stop = false

# Show notification with transcribed text after transcription completes
on_transcription = true

# [output.post_process]
# Pipe transcribed text through an external command for cleanup before output.
# The command receives text on stdin and outputs processed text on stdout.
# Useful for LLM-based text cleanup, grammar correction, filler word removal.
# On any failure (timeout, error), falls back to original transcription.
#
# Keep the command on a single line: a basic TOML string cannot span lines.
# command = "ollama run llama3.2:1b 'Clean up this dictation. Fix grammar, remove filler words. Output only the cleaned text:'"
# timeout_ms = 30000  # 30 second timeout (generous for LLM)

# [text]
# Text processing options (word replacements, spoken punctuation)
#
# Enable spoken punctuation conversion (e.g., say "period" to get ".")
# spoken_punctuation = false
#
# Custom word replacements (case-insensitive)
# replacements = { "vox type" = "voxtype" }

[status]
# Status display icons for Waybar/tray integrations
#
# Icon theme (or path to custom theme file):
# Font-based (require specific fonts):
#   - "emoji" - Default emoji icons (🎙️ 🎤 ⏳)
#   - "nerd-font" - Nerd Font icons (requires Nerd Font)
#   - "material" - Material Design Icons (requires MDI font)
#   - "phosphor" - Phosphor Icons (requires Phosphor font)
#   - "codicons" - VS Code icons (requires Codicons font)
#   - "omarchy" - Omarchy distro icons
# Universal (no special fonts needed):
#   - "minimal" - Simple Unicode (○ ● ◐ ×)
#   - "dots" - Geometric shapes (◯ ⬤ ◔ ◌)
#   - "arrows" - Media player style (▶ ● ↻ ■)
#   - "text" - Plain text ([MIC] [REC] [...] [OFF])
icon_theme = "emoji"

# Per-state icon overrides (optional, takes precedence over theme)
# [status.icons]
# idle = "🎙️"
# recording = "🎤"
# transcribing = "⏳"
# stopped = ""