{ "version": 2, "license": "CC-BY-4.0", "models": [ { "provider": "Deepgram", "provider_url": "https://deepgram.com", "model_id": "nova-3-monolingual", "display_name": "Nova-3 Monolingual", "price_per_minute_usd": "0.0048", "price_per_minute_batch_usd": "0.0043", "diarization_per_minute_usd": "0.002", "languages": ["en"], "streaming": true, "realtime": true, "diarization": "extra-cost", "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://deepgram.com/pricing", "notes": "Pay-as-you-go tier (English). Growth tier (volume commitment) is $0.0042/min streaming, $0.0036/min pre-recorded. Diarization add-on is captured in diarization_per_minute_usd." }, { "provider": "Deepgram", "provider_url": "https://deepgram.com", "model_id": "nova-3-multilingual", "display_name": "Nova-3 Multilingual", "price_per_minute_usd": "0.0058", "price_per_minute_batch_usd": "0.0052", "diarization_per_minute_usd": "0.002", "languages": "61+", "streaming": true, "realtime": true, "diarization": "extra-cost", "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://deepgram.com/pricing", "notes": "Pay-as-you-go tier (multilingual, ~61 languages). Growth tier is $0.0050/min streaming, $0.0043/min pre-recorded. Diarization add-on is captured in diarization_per_minute_usd." 
}, { "provider": "Deepgram", "provider_url": "https://deepgram.com", "model_id": "nova-3-medical", "display_name": "Nova-3 Medical", "price_per_minute_usd": "0.0048", "price_per_minute_batch_usd": "0.0043", "diarization_per_minute_usd": "0.002", "languages": [ "en", "en-US", "en-AU", "en-CA", "en-GB", "en-IE", "en-IN", "en-NZ" ], "streaming": true, "realtime": true, "diarization": "extra-cost", "deployment_options": ["native"], "confidence": "medium", "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://deepgram.com/learn/introducing-nova-3-medical-speech-to-text-api", "notes": "Medical-tuned Nova-3 for clinical transcription. English variants only (en, en-US, en-AU, en-CA, en-GB, en-IE, en-IN, en-NZ). Invoked via `model=nova-3-medical` in the Deepgram API. Pricing is not separately listed on the public pricing page; Deepgram's launch announcement quotes $0.0043/min pre-recorded, which matches the Nova-3 Monolingual batch rate. Streaming rate assumed equal to Nova-3 Monolingual ($0.0048/min PAYG); verify with Deepgram sales for production commitments." }, { "provider": "AssemblyAI", "provider_url": "https://www.assemblyai.com", "model_id": "universal-2", "display_name": "Universal-2", "price_per_minute_usd": "0.0025", "languages": "99+", "streaming": false, "realtime": false, "diarization": "extra-cost", "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.assemblyai.com/pricing", "notes": "Pre-recorded (file-based) only — broadest AssemblyAI language coverage. Published as $0.15/hr. Diarization add-on +$0.02/hr. For real-time use, see universal-streaming." 
}, { "provider": "AssemblyAI", "provider_url": "https://www.assemblyai.com", "model_id": "universal-streaming", "display_name": "Universal-Streaming", "price_per_minute_usd": "0.0025", "languages": ["en"], "streaming": true, "realtime": true, "diarization": "extra-cost", "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.assemblyai.com/pricing", "notes": "English-only streaming model. Published as $0.15/hr. Higher-tier streaming (Universal-3 Pro Streaming) is $0.45/hr ($0.0075/min)." }, { "provider": "AssemblyAI", "provider_url": "https://www.assemblyai.com", "model_id": "universal-3-pro", "display_name": "Universal-3 Pro", "price_per_minute_usd": "0.0035", "languages": ["en", "es", "pt", "de", "fr", "it"], "streaming": false, "realtime": false, "diarization": "extra-cost", "deployment_options": ["native"], "last_verified": "2026-05-26", "last_changed_at": "2026-05-26", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.assemblyai.com/pricing", "notes": "AssemblyAI's highest-accuracy pre-recorded model with native code-switching across EN/ES/PT/DE/FR/IT. Published as $0.21/hr. For real-time use see universal-3-pro-streaming." }, { "provider": "AssemblyAI", "provider_url": "https://www.assemblyai.com", "model_id": "universal-3-pro-streaming", "display_name": "Universal-3 Pro Streaming", "price_per_minute_usd": "0.0075", "languages": ["en", "es", "pt", "de", "fr", "it"], "streaming": true, "realtime": true, "diarization": "extra-cost", "deployment_options": ["native"], "last_verified": "2026-05-26", "last_changed_at": "2026-05-26", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.assemblyai.com/pricing", "notes": "Real-time streaming variant of Universal-3 Pro, positioned for voice agents. Multilingual with native code-switching across EN/ES/PT/DE/FR/IT. Published as $0.45/hr." 
}, { "provider": "AssemblyAI", "provider_url": "https://www.assemblyai.com", "model_id": "universal-streaming-multilingual", "display_name": "Universal-Streaming Multilingual", "price_per_minute_usd": "0.0025", "languages": ["en", "es", "pt", "de", "fr", "it"], "streaming": true, "realtime": true, "diarization": "extra-cost", "deployment_options": ["native"], "last_verified": "2026-05-26", "last_changed_at": "2026-05-26", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.assemblyai.com/pricing", "notes": "Multilingual streaming variant covering EN/ES/PT/DE/FR/IT at the same $0.15/hr rate as the English-only universal-streaming. Good balance of cost and latency for voice agents." }, { "provider": "AssemblyAI", "provider_url": "https://www.assemblyai.com", "model_id": "whisper-streaming", "display_name": "Whisper-Streaming", "price_per_minute_usd": "0.0050", "languages": "99+", "streaming": true, "realtime": true, "diarization": "extra-cost", "deployment_options": ["native"], "last_verified": "2026-05-26", "last_changed_at": "2026-05-26", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.assemblyai.com/pricing", "notes": "OpenAI Whisper served via AssemblyAI's streaming infrastructure with 99+ language coverage. Published as $0.30/hr." }, { "provider": "Cartesia", "provider_url": "https://cartesia.ai", "model_id": "ink-1", "display_name": "Ink", "price_per_minute_usd": "0.003", "languages": "42+", "streaming": true, "realtime": true, "diarization": "unsupported", "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cartesia.ai/pricing", "notes": "Cartesia's STT model. Published as $12 per 4000 minutes ($0.003/min). Same provider as the Sonic TTS model (real-time focused)." 
}, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "whisper-1", "display_name": "Whisper", "price_per_minute_usd": "0.006", "languages": "99+", "streaming": false, "realtime": false, "diarization": "unsupported", "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://openai.com/api/pricing/", "notes": "Billed to the nearest second. Pre-recorded only. openai.com pricing page returned 403 during automated fetch; rate cross-verified via OpenRouter (openai/whisper-1)." }, { "provider": "Groq", "provider_url": "https://groq.com", "model_id": "whisper-large-v3", "display_name": "Whisper V3 Large (Groq)", "price_per_minute_usd": "0.00185", "languages": "99+", "streaming": false, "realtime": false, "diarization": "unsupported", "min_billed_seconds": 10, "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://groq.com/pricing/", "notes": "Whisper V3 Large hosted on Groq. Published as $0.111/hr. Speed factor 217x realtime." }, { "provider": "Groq", "provider_url": "https://groq.com", "model_id": "whisper-large-v3-turbo", "display_name": "Whisper Large v3 Turbo (Groq)", "price_per_minute_usd": "0.000667", "languages": "99+", "streaming": false, "realtime": false, "diarization": "unsupported", "min_billed_seconds": 10, "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://groq.com/pricing/", "notes": "Whisper Large v3 Turbo hosted on Groq — faster variant. Published as $0.04/hr. Speed factor 228x realtime." 
}, { "provider": "Microsoft Azure", "provider_url": "https://azure.microsoft.com", "model_id": "azure-speech-realtime", "display_name": "Azure Speech (Real-time)", "price_per_minute_usd": "0.016667", "languages": "100+", "streaming": true, "realtime": true, "diarization": "included", "deployment_options": ["azure"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://azure.microsoft.com/en-us/pricing/details/speech/", "notes": "Azure Speech Standard (S0) real-time speech-to-text. Published as $1.00/hr pay-as-you-go (= $0.016667/min); custom real-time endpoint is $1.20/hr ($0.02/min). Commitment tiers reduce effective rate (2,000 hrs/mo $0.80/hr; 10,000 hrs/mo $0.65/hr; 50,000 hrs/mo $0.50/hr). Real-time diarization is included up to 240 min/session per Microsoft Learn quotas/limits doc. Languages: 100+ per Azure language support docs. Cross-verified via https://learn.microsoft.com/en-us/answers/questions/2155625/speech-to-text-costing-1-hr-is-crazy-no-bulk-avail." },
{ "provider": "Microsoft Azure", "provider_url": "https://azure.microsoft.com", "model_id": "azure-speech-batch", "display_name": "Azure Speech (Batch)", "price_per_minute_usd": "0.006", "languages": "100+", "streaming": false, "realtime": false, "diarization": "included", "deployment_options": ["azure"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://azure.microsoft.com/en-us/pricing/details/speech/", "notes": "Azure Speech Standard (S0) batch transcription. Published as $0.36/hr (= $0.006/min); custom batch endpoint is $0.45/hr ($0.0075/min). Fast transcription (REST API, sync) is a separate $0.66/hr ($0.011/min) tier, not modelled here. Batch diarization is included (up to 240 min/file). Cross-verified via https://learn.microsoft.com/en-us/answers/questions/2155625/speech-to-text-costing-1-hr-is-crazy-no-bulk-avail." },
{ "provider": "Google", "provider_url": "https://cloud.google.com", "model_id": "chirp_2", "display_name": "Chirp 2", "price_per_minute_usd": "0.016", "languages": "20+", "streaming": true, "realtime": true, "diarization": "included", "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cloud.google.com/speech-to-text/pricing", "notes": "Google Cloud Speech-to-Text v2 multilingual model. Standard tier $0.016/min for both real-time and batch (down from v1's $0.024/min, per https://cloud.google.com/blog/products/ai-machine-learning/google-cloud-speech-to-text-v2-api). Dynamic Batch tier (up to 24h SLA) is 75% lower at $0.004/min — not modelled as price_per_minute_batch_usd because standard batch is the same as real-time. Standard volume tiers can reduce effective rate to as low as $0.004/min. Supports StreamingRecognize (~20 languages), Recognize, and BatchRecognize (broadest language coverage) per https://docs.cloud.google.com/speech-to-text/docs/models/chirp-2. GA in us-central1, europe-west4, asia-southeast1." },
{ "provider": "Google", "provider_url": "https://cloud.google.com", "model_id": "chirp_3", "display_name": "Chirp 3", "price_per_minute_usd": "0.016", "languages": "98+", "streaming": true, "realtime": true, "diarization": "included", "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cloud.google.com/speech-to-text/pricing", "notes": "Google Cloud Speech-to-Text v2 latest-generation generative ASR model. Standard tier $0.016/min for both real-time and batch; Dynamic Batch tier (up to 24h SLA) at $0.004/min (75% off) — not modelled as price_per_minute_batch_usd because standard batch is the same as real-time. Adds automatic language detection and diarization vs Chirp 2 per https://docs.cloud.google.com/speech-to-text/docs/models/chirp-3. 98+ languages and locales (24 GA + 74 preview); supports StreamingRecognize and BatchRecognize." },
{ "provider": "Speechmatics", "provider_url": "https://www.speechmatics.com", "model_id": "enhanced", "display_name": "Speechmatics Enhanced", "price_per_minute_usd": "0.004", "languages": "55+", "streaming": true, "realtime": true, "diarization": "included", "deployment_options": ["native"], "confidence": "medium", "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.speechmatics.com/pricing", "notes": "Speechmatics offers two operating points — `enhanced` (highest accuracy) and `standard` (faster/cheaper) — selected via the `operating_point` API parameter on both Batch and Real-time APIs. Pricing page lists Pro tier from $0.24/hr ($0.004/min) on PAYG with volume discounts above 500 hrs/month; the same tier is used for both real-time and batch. Free plan includes 480 minutes/month (not modelled as `free_tier` because schema expects per-day/per-token quotas). Confidence is medium because the pricing page exposes tier names rather than per-model SKU rates; verify against contract for production. Cross-verified product structure via https://docs.speechmatics.com/."
}, { "provider": "Rev.ai", "provider_url": "https://www.rev.ai", "model_id": "whisper-fusion", "display_name": "Rev.ai Whisper Fusion", "price_per_minute_usd": "0.005", "languages": ["en"], "streaming": true, "realtime": true, "diarization": "included", "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.rev.ai/pricing", "notes": "Rev.ai's streaming transcription product, branded `Whisper Fusion` on the pricing page at $0.005/min; the parallel Whisper Large streaming tier is also $0.005/min. Free credits equivalent to 5 hours of Reverb ASR (cross-applicable across products). Reverb (batch) is modelled separately; see https://docs.rev.ai/ for the full API surface. English-primary; foreign language support is a distinct Reverb Foreign Language product line." }, { "provider": "Rev.ai", "provider_url": "https://www.rev.ai", "model_id": "reverb", "display_name": "Rev.ai Reverb", "price_per_minute_usd": "0.0033", "languages": ["en"], "streaming": false, "realtime": false, "diarization": "included", "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.rev.ai/pricing", "notes": "Rev.ai's async/batch ASR model branded `Reverb` at $0.20/hr ($0.0033/min). A `Reverb Turbo` tier exists at $0.10/hr ($0.0017/min) — not modelled as a separate row since it's a latency/quality dial on the same product; `Reverb Foreign Language` ($0.30/hr, $0.005/min, 57+ languages) is also priced separately and could be added if needed. Free credits equivalent to 5 hours of Reverb ASR. See https://docs.rev.ai/ for the async transcription API." 
}, { "provider": "Gladia", "provider_url": "https://www.gladia.io", "model_id": "solaria-1", "display_name": "Gladia Solaria-1", "price_per_minute_usd": "0.0125", "price_per_minute_batch_usd": "0.01017", "languages": "100+", "streaming": true, "realtime": true, "realtime_latency_ms": 300, "diarization": "included", "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.gladia.io/pricing", "notes": "Gladia's first-generation universal STT model `solaria-1`, supporting 100+ languages with automatic language detection and code-switching. Starter (PAYG) pricing: real-time $0.75/hr ($0.0125/min), async $0.61/hr (~$0.01017/min). Growth (committed) plan lowers real-time to $0.25/hr ($0.0042/min) and async to $0.20/hr ($0.0033/min). Sub-300ms streaming latency claimed on the pricing page. Speaker diarization and word-level timestamps included on all tiers. Starter includes 10 free hours per month. Model name confirmed via https://docs.gladia.io/." },
{ "provider": "Soniox", "provider_url": "https://soniox.com", "model_id": "stt-rt-v4", "display_name": "Soniox STT Real-time v4", "price_per_minute_usd": "0.002", "languages": "60+", "streaming": true, "realtime": true, "diarization": "included", "deployment_options": ["native"], "confidence": "medium", "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://soniox.com/pricing", "notes": "Soniox real-time STT model `stt-rt-v4`. Primary billing metric is input audio tokens at $2.00 per 1M tokens; vendor approximates ~$0.12/hour which we use as $0.002/min for comparability. Aliased from `stt-rt-v3` (deprecated 2026-02-05; removed 2026-02-28 per https://soniox.com/docs/stt/models). 60+ languages with automatic language detection. Confidence medium because per-minute is an approximation of token-based pricing; actual cost varies with audio content density." },
{ "provider": "Soniox", "provider_url": "https://soniox.com", "model_id": "stt-async-v4", "display_name": "Soniox STT Async v4", "price_per_minute_usd": "0.00167", "languages": "60+", "streaming": false, "realtime": false, "diarization": "included", "max_audio_minutes_per_file": 300, "deployment_options": ["native"], "confidence": "medium", "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://soniox.com/pricing", "notes": "Soniox async (file) STT model `stt-async-v4`. Primary billing metric is input audio tokens at $1.50 per 1M tokens; vendor approximates ~$0.10/hour which we use as $0.00167/min for comparability. Aliased from `stt-async-v3` (deprecated 2026-02-05; removed 2026-02-28 per https://soniox.com/docs/stt/models). Supports up to 5 hours of audio per request. 60+ languages with automatic language detection. Confidence medium because per-minute is an approximation of token-based pricing." } ] }