{ "version": 2, "license": "CC-BY-4.0", "models": [ { "provider": "ElevenLabs", "provider_url": "https://elevenlabs.io", "model_id": "eleven_flash_v2_5", "display_name": "Eleven Flash v2.5", "price_per_1m_chars_usd": "50.0", "voice_quality": "neural", "languages": "32+", "ssml_supported": false, "voice_cloning": true, "output_formats": [ "mp3_44100_128", "pcm_16000", "wav_44100", "opus_48000_128", "ulaw_8000", "alaw_8000" ], "time_to_first_byte_ms": 75, "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://elevenlabs.io/pricing/api", "notes": "Current low-latency flagship; eleven_turbo_v2_5 is deprecated and replaced by Flash v2.5. Pay-as-you-go rate $0.05/1K chars. ~10K voices available. Plain text input only (no SSML)." }, { "provider": "ElevenLabs", "provider_url": "https://elevenlabs.io", "model_id": "eleven_multilingual_v2", "display_name": "Eleven Multilingual v2", "price_per_1m_chars_usd": "100.0", "voice_quality": "neural", "languages": "29+", "ssml_supported": false, "voice_cloning": true, "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://elevenlabs.io/docs/overview/models", "notes": "High-quality professional model for audiobooks, video narration, and rich emotional expression. 29 languages, max 10,000 chars per request. Pay-as-you-go billed at 1 credit per character; Flash/Turbo v2.5 are billed at 0.5 credits/char (hence 2x the Flash $50/1M-chars rate). Higher latency than Flash; not recommended for real-time agents." 
}, { "provider": "ElevenLabs", "provider_url": "https://elevenlabs.io", "model_id": "eleven_v3", "display_name": "Eleven v3", "price_per_1m_chars_usd": "100.0", "voice_quality": "neural", "languages": "70+", "ssml_supported": false, "emotion_control_supported": true, "voice_cloning": true, "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://elevenlabs.io/docs/overview/models", "notes": "Most expressive ElevenLabs TTS model (GA after alpha). 70+ languages, max 5,000 chars per request. Supports inline audio tags ([whispers], [sighs], [laughs], [happily]) for emotion/delivery control instead of SSML. Higher latency than Flash/Turbo v2.5 — ElevenLabs explicitly recommends v2.5 Flash/Turbo for real-time use. Pay-as-you-go billed at 1 credit/char (same multiplier as Multilingual v2). PVCs (professional voice clones) not yet fully optimized for v3." }, { "provider": "ElevenLabs", "provider_url": "https://elevenlabs.io", "model_id": "eleven_turbo_v2_5", "display_name": "Eleven Turbo v2.5", "price_per_1m_chars_usd": "50.0", "voice_quality": "neural", "languages": "32+", "ssml_supported": false, "voice_cloning": true, "deprecated_at": "2026-05-19", "replaced_by_model_id": "eleven_flash_v2_5", "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://elevenlabs.io/docs/overview/models", "notes": "Deprecated per ElevenLabs models page — outclassed by and replaced by eleven_flash_v2_5. Still callable but not recommended for new applications. No official sunset date published; deprecated_at reflects verification date. Pay-as-you-go billed at 0.5 credits/char (same as Flash v2.5)." 
}, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "tts-1", "display_name": "TTS-1", "price_per_1m_chars_usd": "15.0", "voice_quality": "neural", "languages": ["en"], "ssml_supported": false, "voice_cloning": false, "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://openai.com/api/pricing/", "notes": "Standard quality. The newer gpt-4o-mini-tts model is also available in OpenAI's API and is tracked as a separate entry in this file." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "tts-1-hd", "display_name": "TTS-1 HD", "price_per_1m_chars_usd": "30.0", "voice_quality": "neural", "languages": ["en"], "ssml_supported": false, "voice_cloning": false, "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://openai.com/api/pricing/", "notes": "High-definition tier — 2x the price of tts-1 for higher quality output." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "gpt-4o-mini-tts", "display_name": "GPT-4o mini TTS", "price_per_1m_chars_usd": "20.0", "voice_quality": "neural", "languages": ["en"], "ssml_supported": false, "voice_cloning": false, "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/gpt-4o-mini-tts", "notes": "OpenAI's newer GPT-4o-based TTS. Native pricing is token-based — $0.60/1M text-input tokens + $12/1M audio-output tokens — not per character. OpenAI's published estimate is ~$0.015 per minute of audio; converted to ~$20/1M chars assuming ~150 WPM (~750 chars/min) for consistency with the Cartesia row. Actual $/1M chars varies with speech rate and language. 
Supports voice steering via natural-language instructions (style/emotion) instead of SSML. Latest snapshot gpt-4o-mini-tts-2025-12-15. Max input 2,000 tokens per request." }, { "provider": "Cartesia", "provider_url": "https://cartesia.ai", "model_id": "sonic-3.5", "display_name": "Sonic 3.5", "price_per_1m_chars_usd": "50.0", "voice_quality": "neural", "languages": "42+", "ssml_supported": false, "voice_cloning": true, "output_formats": [ "raw/pcm_f32le", "raw/pcm_s16le", "raw/pcm_mulaw", "raw/pcm_alaw", "wav", "mp3" ], "time_to_first_byte_ms": 90, "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cartesia.ai/pricing", "notes": "Cartesia publishes Sonic pricing as $1 per 25 minutes of audio output (~$0.04/min). Converted to ~$50/1M chars assuming ~150 WPM (~750 chars/min). Actual $/1M chars varies with speech rate; verify by sampling. IVC voice cloning included (no clone fee). 90ms TTFB. SSML tags (speed/volume/break/spell/emotion) are documented on sonic-3 but temporarily disabled on sonic-3.5 per https://docs.cartesia.ai/build-with-cartesia/sonic-3/ssml-tags (checked 2026-05-16); flip ssml_supported back to true once upstream re-enables." }, { "provider": "Cartesia", "provider_url": "https://cartesia.ai", "model_id": "sonic-2", "display_name": "Sonic 2", "price_per_1m_chars_usd": "50.0", "voice_quality": "neural", "languages": "15+", "ssml_supported": false, "voice_cloning": true, "time_to_first_byte_ms": 90, "replaced_by_model_id": "sonic-3.5", "confidence": "medium", "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.cartesia.ai/build-with-cartesia/tts-models/older-models", "notes": "Predecessor to sonic-3.5; still stable and callable, but Cartesia recommends sonic-3.5 for new builds. Latest snapshot sonic-2-2025-06-11. 
8 core stable languages (en, fr, de, es, pt, zh, ja, ko); 7 additional languages reach EOL 2026-06-01. 90ms model latency. Higher-fidelity voice cloning capability. Pricing assumed equal to sonic-3.5 (15 credits/sec of audio); confidence medium because Cartesia did not publish a separate per-model rate. Verify by sampling if cost-critical." }, { "provider": "Cartesia", "provider_url": "https://cartesia.ai", "model_id": "sonic-turbo", "display_name": "Sonic Turbo", "price_per_1m_chars_usd": "50.0", "voice_quality": "neural", "languages": "15+", "ssml_supported": false, "voice_cloning": true, "time_to_first_byte_ms": 40, "replaced_by_model_id": "sonic-3.5", "confidence": "medium", "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.cartesia.ai/build-with-cartesia/tts-models/older-models", "notes": "Lowest-latency Sonic variant (~40ms TTFB). Still stable and callable, but Cartesia recommends sonic-3.5 for new builds. Latest snapshot sonic-turbo-2025-06-04. 9 stable languages; 6 additional languages reach EOL 2026-06-01. Pricing assumed equal to sonic-3.5 (15 credits/sec of audio); confidence medium because Cartesia did not publish a separate per-model rate. Verify by sampling if cost-critical." }, { "provider": "Groq", "provider_url": "https://groq.com", "model_id": "canopy-labs-orpheus-english", "display_name": "Canopy Labs Orpheus English (Groq)", "price_per_1m_chars_usd": "22.0", "voice_quality": "neural", "languages": ["en"], "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://groq.com/pricing/", "notes": "Hosted on Groq. Output speed ~100 characters/second." 
}, { "provider": "Groq", "provider_url": "https://groq.com", "model_id": "canopy-labs-orpheus-arabic-saudi", "display_name": "Canopy Labs Orpheus Arabic Saudi (Groq)", "price_per_1m_chars_usd": "40.0", "voice_quality": "neural", "languages": ["ar-SA"], "last_verified": "2026-05-05", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://groq.com/pricing/", "notes": "Hosted on Groq. Saudi Arabic variant. Output speed ~100 characters/second." }, { "provider": "Google", "provider_url": "https://cloud.google.com", "model_id": "google-tts-studio", "display_name": "Google Cloud TTS — Studio", "price_per_1m_chars_usd": "160.0", "voice_quality": "neural", "languages": "40+", "ssml_supported": true, "voice_cloning": false, "deployment_options": ["native", "vertex"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cloud.google.com/text-to-speech/pricing", "notes": "Google's premium TTS tier for professional media production (long-form narration, advertising). Vendor price $0.000160/char = $160/1M chars; the single-speaker Studio class is GA and the multispeaker class is experimental per https://docs.cloud.google.com/text-to-speech/docs/voices. SSML supported except <mark>, <emphasis>, <prosody pitch>, and <lang>. Model_id is a Hail-coined tier slug (Google bills per-voice-tier rather than per API model name). Free tier: first 100K chars/month included (not representable as tokens_per_day in schema)." 
}, { "provider": "Google", "provider_url": "https://cloud.google.com", "model_id": "google-tts-neural2", "display_name": "Google Cloud TTS — Neural2", "price_per_1m_chars_usd": "16.0", "voice_quality": "neural", "languages": "40+", "ssml_supported": true, "voice_cloning": false, "deployment_options": ["native", "vertex"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cloud.google.com/text-to-speech/pricing", "notes": "Google's recommended general-purpose neural tier; same per-char rate as WaveNet but newer architecture. Vendor price $0.000016/char = $16/1M chars. SSML fully supported. Model_id is a Hail-coined tier slug (Google bills per-voice-tier rather than per API model name). Free tier: first 1M chars/month included (not representable as tokens_per_day in schema)." }, { "provider": "Google", "provider_url": "https://cloud.google.com", "model_id": "google-tts-wavenet", "display_name": "Google Cloud TTS — WaveNet", "price_per_1m_chars_usd": "16.0", "voice_quality": "neural", "languages": "40+", "ssml_supported": true, "voice_cloning": false, "deployment_options": ["native", "vertex"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cloud.google.com/text-to-speech/pricing", "notes": "Original neural-net voice family from DeepMind; not deprecated as of 2026-05-19 per Cloud TTS release notes (https://docs.cloud.google.com/text-to-speech/docs/release-notes) but Google recommends Neural2 for new projects at the same $16/1M chars rate. SSML fully supported. Model_id is a Hail-coined tier slug. Free tier: first 1M chars/month included." 
}, { "provider": "Google", "provider_url": "https://cloud.google.com", "model_id": "google-tts-chirp-3-hd", "display_name": "Google Cloud TTS — Chirp 3: HD", "price_per_1m_chars_usd": "30.0", "voice_quality": "neural", "languages": "30+", "ssml_supported": false, "streaming_supported": true, "voice_cloning": false, "deployment_options": ["native", "vertex"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cloud.google.com/text-to-speech/pricing", "notes": "Google's newest-generation TTS family with 30 voice styles in 30+ languages. Vendor price $0.000030/char = $30/1M chars. Per https://docs.cloud.google.com/text-to-speech/docs/chirp3-hd, Chirp 3: HD explicitly does NOT support SSML, speaking-rate adjustments, or pitch parameters; streaming synthesis IS supported. Model_id is a Hail-coined tier slug. Free tier: first 1M chars/month included." }, { "provider": "Microsoft Azure", "provider_url": "https://azure.microsoft.com", "model_id": "azure-tts-neural", "display_name": "Azure AI Speech — Neural", "price_per_1m_chars_usd": "16.0", "voice_quality": "neural", "languages": "100+", "ssml_supported": true, "streaming_supported": true, "voice_cloning": false, "deployment_options": ["azure"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://azure.microsoft.com/en-us/pricing/details/cognitive-services/speech-services/", "notes": "Azure's standard neural TTS tier (called 'Neural' on the pricing page; 'Standard voice' in docs). 500+ prebuilt voices across 100+ locales per https://learn.microsoft.com/en-us/azure/ai-services/speech-service/text-to-speech. S0 pay-as-you-go price $16/1M chars for both real-time and batch synthesis (HD, AOAI, Custom Neural Voice, and Personal Voice priced separately). Full SSML support. Chinese characters counted as 2 chars for billing. 
Free tier (F0): 500K chars/month." }, { "provider": "Microsoft Azure", "provider_url": "https://azure.microsoft.com", "model_id": "azure-tts-hd", "display_name": "Azure AI Speech — Neural HD (DragonHD)", "aliases": ["dragon-hd"], "price_per_1m_chars_usd": "22.0", "voice_quality": "neural", "voices_count": 30, "languages": ["en-US", "zh-CN", "de-DE", "es-ES", "fr-FR", "ja-JP"], "ssml_supported": false, "emotion_control_supported": true, "streaming_supported": true, "voice_cloning": false, "deployment_options": ["azure"], "confidence": "medium", "last_verified": "2026-05-19", "last_changed_at": "2026-03-01", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://azure.microsoft.com/en-us/pricing/details/cognitive-services/speech-services/", "notes": "Azure's premium HD neural tier (DragonHD architecture, 30+ GA voices). Per https://techcommunity.microsoft.com/blog/azure-ai-foundry-blog/azure-speech-%E2%80%93-neural-hd-text-to-speech-recent-voice-updates/4505380, Azure reduced Neural HD pricing to $22/1M chars effective March 2026 (down from $30/1M). Latency <300ms, real-time only. SSML support is partial (only a subset of SSML elements is supported); we mark ssml_supported=false because the elements most callers want are unsupported. Automatic emotion/sentiment detection drives delivery (emotion_control_supported=true). DragonHDOmni (700+ voices, mixed GA/preview) and DragonHDFlash (en-US/zh-CN only) are distinct models tracked separately if added later. Confidence medium because the pricing-page value was sourced via third-party recap (techcommunity blog) — verify on the live Azure pricing page before high-volume use." 
}, { "provider": "Inworld", "provider_url": "https://inworld.ai", "model_id": "inworld-tts-2", "display_name": "Inworld Realtime TTS-2", "price_per_1m_chars_usd": "35.0", "voice_quality": "neural", "languages": "100+", "streaming_supported": true, "voice_cloning": true, "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://inworld.ai/pricing", "notes": "Inworld's newest TTS model (Research Preview) with natural-language steering and 100+ language support. Base PAYG rate $35/1M chars; Developer tier $30, Growth $25, Enterprise as low as $10/1M. ~200ms latency. Instant voice cloning, custom pronunciation, timestamp alignment, and zero data retention included. The older inworld-tts-1 and inworld-tts-1-max are deprecated per https://docs.inworld.ai/tts/tts-models — migrate to inworld-tts-1.5-max or inworld-tts-1.5-mini if -2's research-preview status is a concern." }, { "provider": "Smallest.ai", "provider_url": "https://smallest.ai", "model_id": "lightning-v3.1", "display_name": "Lightning v3.1", "price_per_1m_chars_usd": "25.0", "voice_quality": "neural", "voices_count": 217, "languages": [ "en", "hi", "es", "mr", "kn", "ta", "bn", "gu", "te", "ml", "pa", "or" ], "streaming_supported": true, "voice_cloning": true, "output_formats": ["pcm", "wav", "mp3", "mulaw"], "sample_rates_hz": [8000, 16000, 24000, 44100], "time_to_first_byte_ms": 200, "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://smallest.ai/pricing", "notes": "Smallest.ai's current TTS model (44.1 kHz native, ~200ms TTFB). Vendor rate ~$0.25/10k chars = $25/1M chars. 217 voices across 12 languages: English, Hindi, Spanish, and 9 Indian languages (Marathi, Kannada, Tamil, Bengali, Gujarati, Telugu, Malayalam, Punjabi, Odia). 
Lightning v2 and lightning-large are deprecated per https://docs.smallest.ai/waves/documentation/getting-started/models — new integrations should use lightning-v3.1. WebSocket streaming for real-time/conversational use. Instant + professional voice cloning supported. On-prem available on Enterprise plan." }, { "provider": "Rime", "provider_url": "https://rime.ai", "model_id": "mistv3", "display_name": "Rime Mist v3", "price_per_1m_chars_usd": "30.0", "voice_quality": "neural", "voices_count": 94, "languages": ["en"], "streaming_supported": true, "voice_cloning": false, "time_to_first_byte_ms": 100, "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://rime.ai/pricing", "notes": "Rime's current Mist-family flagship; English-only with sub-100ms TTFB. Pricing-page rate $0.03/1K chars = $30/1M chars (the 'Mist' line on https://rime.ai/pricing). 94 voices. Custom pronunciation is on mistv2 but not yet on mistv3 per https://docs.rime.ai/api-reference/models. Coda ($0.05/1K = $50/1M, 184 voices, 6 languages incl. ES/FR/PT/DE/JA) and Arcana ($0.04/1K = $40/1M, multilingual, 94 voices) are distinct higher-tier models tracked separately if added later. Voice cloning not documented for Mist; available via Enterprise plan for custom voices." }, { "provider": "LMNT", "provider_url": "https://www.lmnt.com", "model_id": "blizzard", "display_name": "LMNT Blizzard", "price_per_1m_chars_usd": "50.0", "voice_quality": "neural", "languages": "31+", "streaming_supported": true, "voice_cloning": true, "deployment_options": ["native"], "confidence": "medium", "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.lmnt.com/pricing", "notes": "LMNT's flagship Blizzard 2.0 model (canonical model_id 'blizzard' per https://docs.lmnt.com/models/overview). 
31 languages with accent control, word timestamps, streaming, voice cloning, and speech sessions. Confidence medium because LMNT publishes plan-bundled pricing (Indie $10/mo for 200K chars + $0.05/1K overage; Pro $49/mo + $0.045/1K overage; Premium $199/mo + $0.035/1K overage) rather than a standalone PAYG per-char rate — $50/1M shown here is the Indie-tier overage rate. Free tier includes 15K characters/month with no overage rate (not representable as tokens_per_day in schema). Premium tier overage is $0.035/1K = $35/1M — large customers should benchmark on their own plan." }, { "provider": "Deepgram", "provider_url": "https://deepgram.com", "model_id": "aura-2", "display_name": "Deepgram Aura 2", "price_per_1m_chars_usd": "30.0", "voice_quality": "neural", "languages": ["en", "es", "de", "fr", "nl", "it", "ja"], "streaming_supported": true, "voice_cloning": false, "output_formats": ["wav", "mp3", "linear16", "mulaw", "alaw", "opus"], "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://deepgram.com/pricing", "notes": "Deepgram's current TTS family, addressed as 'aura-2-<voice>-<language>' (e.g., aura-2-thalia-en) per https://developers.deepgram.com/docs/tts-models. Vendor rate $0.030/1K chars = $30/1M chars on Pay-As-You-Go; Growth tier $0.027/1K = $27/1M. Voice counts by language: en 40+ (incl. Aura 1 legacy), es 15+ (Early Access), nl 8, fr 2, de 7, it 10, ja 4. Free tier ships $200 of signup credit applicable to all products (not representable as tokens_per_day in schema). Aura 1 voices remain callable but Deepgram recommends Aura 2 for new integrations." 
}, { "provider": "Resemble AI", "provider_url": "https://www.resemble.ai", "model_id": "chatterbox-turbo", "display_name": "Resemble Chatterbox Turbo", "price_per_second_usd": "0.0005", "voice_quality": "neural", "languages": ["en"], "voice_cloning": true, "deployment_options": ["native"], "confidence": "medium", "last_verified": "2026-05-26", "last_changed_at": "2026-05-26", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.resemble.ai/pricing", "notes": "Resemble bills TTS per second of generated audio at a flat $0.0005/sec on their Flex (pay-as-you-go) plan; the rate is not split per model. Chatterbox Turbo is Resemble's flagship English TTS per https://www.resemble.ai (also open-sourced at https://github.com/resemble-ai/chatterbox, 'SoTA open-source TTS'). Confidence medium because the pricing page lists the rate against the service category 'Text-to-speech' rather than naming Chatterbox Turbo specifically, and the API's exact 'model' parameter slug was not verified against live docs. Multilingual ('Chatterbox Multilingual') and dramatic-read ('DramaBox') variants are marketed separately; modeled here as the English flagship only." } ] }