{ "version": 2, "license": "CC-BY-4.0", "models": [ { "provider": "Anthropic", "provider_url": "https://www.anthropic.com", "model_id": "claude-opus-4-7", "display_name": "Claude Opus 4.7", "model_family": "Claude 4", "knowledge_cutoff": "2026-01-31", "aliases": ["anthropic.claude-opus-4-7"], "context_window": 1000000, "max_output_tokens": 128000, "input_per_mtok_usd": "5.0", "output_per_mtok_usd": "25.0", "cache_read_per_mtok_usd": "0.5", "cache_write_per_mtok_usd": "6.25", "batch_input_per_mtok_usd": "2.5", "batch_output_per_mtok_usd": "12.5", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "bedrock", "vertex"], "last_verified": "2026-05-17", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.anthropic.com/pricing", "notes": "Cache hit pricing is 0.1x base input ($0.50/MTok); 5-minute cache write is 1.25x base ($6.25/MTok); 1-hour cache write is 2x ($10/MTok). Batch API discounts both input and output by 50%. Knowledge cutoff Jan 2026. Uses a new tokenizer vs prior Claude models (may use up to 35% more tokens for identical text). Supports adaptive thinking (no extended-thinking toggle); thinking output tokens are billed at the output rate." }, { "provider": "Anthropic", "provider_url": "https://www.anthropic.com", "model_id": "claude-sonnet-4-6", "display_name": "Claude Sonnet 4.6", "model_family": "Claude 4", "knowledge_cutoff": "2025-08-31", "aliases": ["anthropic.claude-sonnet-4-6"], "context_window": 1000000, "max_output_tokens": 64000, "input_per_mtok_usd": "3.0", "output_per_mtok_usd": "15.0", "cache_read_per_mtok_usd": "0.3", "cache_write_per_mtok_usd": "3.75", "batch_input_per_mtok_usd": "1.5", "batch_output_per_mtok_usd": "7.5", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "bedrock", "vertex"], "last_verified": "2026-05-17", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.anthropic.com/pricing", "notes": "Cache hit pricing is 0.1x base input ($0.30/MTok); 5-minute cache write is 1.25x base ($3.75/MTok); 1-hour cache write is 2x ($6/MTok). Batch API discounts both input and output by 50%. Knowledge cutoff Aug 2025. Supports extended thinking and adaptive thinking; thinking output tokens are billed at the output rate." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "gpt-5", "display_name": "GPT-5", "model_family": "GPT-5", "knowledge_cutoff": "2024-09-30", "context_window": 400000, "max_output_tokens": 128000, "input_per_mtok_usd": "1.25", "output_per_mtok_usd": "10.0", "cache_read_per_mtok_usd": "0.125", "batch_input_per_mtok_usd": "0.625", "batch_output_per_mtok_usd": "5.0", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "azure"], "last_verified": "2026-05-17", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/gpt-5", "notes": "Reasoning model with adjustable reasoning_effort; reasoning tokens are billed at the output rate. Context window 400k, max output 128k confirmed against developers.openai.com/api/docs/models/gpt-5 (previous row had max_output_tokens as low-confidence). Cached input at 10% of base ($0.125/MTok). Batch API at flat 50% off input and output. PDF input via the Files API; image input native; audio is NOT supported on this model_id. Knowledge cutoff Sept 2024. Not on the April 2026 deprecation list." }, { "provider": "Google", "provider_url": "https://deepmind.google", "model_id": "gemini-2.5-pro", "display_name": "Gemini 2.5 Pro", "model_family": "Gemini 2.5", "knowledge_cutoff": "2025-01-31", "context_window": 1048576, "max_output_tokens": 65536, "input_per_mtok_usd": "1.25", "output_per_mtok_usd": "10.0", "cache_read_per_mtok_usd": "0.125", "cache_storage_per_mtok_per_hour_usd": "4.50", "batch_input_per_mtok_usd": "0.625", "batch_output_per_mtok_usd": "5.0", "pricing_tiers": [ { "threshold_tokens": 200000, "input_per_mtok_usd": "2.5", "output_per_mtok_usd": "15.0", "cache_read_per_mtok_usd": "0.25" } ], "modalities": { "input": ["text", "image", "audio", "video"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": true, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "vertex"], "confidence": "high", "last_verified": "2026-05-26", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://ai.google.dev/pricing", "notes": "Input pricing shown is for prompts <=200k tokens; prompts >200k tokens are billed per pricing_tiers; cached input also tiers at 200k (the >200k rate is captured in pricing_tiers[0].cache_read_per_mtok_usd). Context window and max_output_tokens confirmed via ai.google.dev/gemini-api/docs/models/gemini-2.5-pro. Thinking is always on and cannot be disabled; thinking tokens are billed at the output rate. Knowledge cutoff January 2025. Batch Mode discount is a flat 50% off input/output. Audio input is billed at the standard input rate of $1.25/MTok (no separate audio premium, unlike 2.5 Flash/Flash-Lite/2.0 Flash); audio_input_per_mtok_usd omitted. Explicit context caching storage is captured in cache_storage_per_mtok_per_hour_usd. Free tier (AI Studio) is published as per-minute RPM/TPM only, not per-day; free_tier omitted. PDF input via the Files API; image, audio, and video native." }, { "provider": "DeepSeek", "provider_url": "https://www.deepseek.com", "model_id": "deepseek-v4-flash", "display_name": "DeepSeek V4 Flash", "model_family": "DeepSeek V4", "context_window": 1048576, "max_output_tokens": 384000, "input_per_mtok_usd": "0.14", "output_per_mtok_usd": "0.28", "cache_read_per_mtok_usd": "0.0028", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": true, "deployment_options": ["native"], "last_verified": "2026-05-18", "last_changed_at": "2026-05-05", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://api-docs.deepseek.com/quick_start/pricing", "notes": "MoE architecture: 284B total parameters, 13B activated. Replaces deepseek-chat (V3-era alias). Cached input price ($0.0028/MTok) is 1/50th of base input, effective after DeepSeek reduced cache hit rates on 2026-04-26. Supports both non-thinking and thinking (default) modes; thinking output is billed at the same output rate, so reasoning_tokens_billed: true. deepseek-chat and deepseek-reasoner legacy aliases are scheduled for discontinuation on 2026-07-24; until then they route to V4 Flash non-thinking and thinking modes respectively." }, { "provider": "DeepSeek", "provider_url": "https://www.deepseek.com", "model_id": "deepseek-v4-pro", "display_name": "DeepSeek V4 Pro", "model_family": "DeepSeek V4", "context_window": 1048576, "max_output_tokens": 384000, "input_per_mtok_usd": "1.74", "output_per_mtok_usd": "3.48", "cache_read_per_mtok_usd": "0.0145", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": true, "deployment_options": ["native"], "last_verified": "2026-05-26", "last_changed_at": "2026-05-26", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://api-docs.deepseek.com/quick_start/pricing", "notes": "Pro-tier sibling to deepseek-v4-flash, positioned for higher-quality responses at lower concurrency (500 vs Flash's 2500). Structured prices ($1.74 / $3.48 per MTok; cache hit $0.0145) are the list rates published by DeepSeek; a 75% launch promotion is in effect until 2026-05-31 15:59 UTC, during which the effective billed rates are $0.435 input / $0.87 output / $0.003625 cache hit. The DeepSeek pricing page also mentions post-expiration rates of 1/4 the original — re-verify after the promo lapses to confirm which interpretation lands. Cache miss is billed at the base input rate. Supports both non-thinking and thinking modes with tool calls and JSON output; thinking output is billed at the output rate, so reasoning_tokens_billed: true. DeepSeek did not publish an explicit launch date; last_changed_at set to verification date." }, { "provider": "DeepSeek", "provider_url": "https://www.deepseek.com", "model_id": "deepseek-chat", "display_name": "DeepSeek V3", "model_family": "DeepSeek V3", "context_window": 1048576, "max_output_tokens": 384000, "input_per_mtok_usd": "0.14", "output_per_mtok_usd": "0.28", "cache_read_per_mtok_usd": "0.0028", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native"], "deprecated_at": "2026-04-24", "replaced_by_model_id": "deepseek-v4-flash", "last_verified": "2026-05-18", "last_changed_at": "2026-04-24", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://api-docs.deepseek.com/quick_start/pricing", "notes": "Canonical API model_id for the DeepSeek V3 lineage (V3 launched 2024-12-26 as deepseek-chat; upgraded through V3-0324, V3.1, V3.1-Terminus, V3.2 by 2025-12-01). Deprecated 2026-04-24 when V4 launched; still callable until scheduled discontinuation 2026-07-24, currently routing to deepseek-v4-flash non-thinking mode (prices captured here reflect that routing). DeepSeek's pricing page no longer publishes V3-era historical rates; standalone deepseek-v3 model_id was never exposed by the API." }, { "provider": "DeepSeek", "provider_url": "https://www.deepseek.com", "model_id": "deepseek-reasoner", "display_name": "DeepSeek R1", "model_family": "DeepSeek R1", "context_window": 1048576, "max_output_tokens": 384000, "input_per_mtok_usd": "0.14", "output_per_mtok_usd": "0.28", "cache_read_per_mtok_usd": "0.0028", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": true, "deployment_options": ["native"], "deprecated_at": "2026-04-24", "replaced_by_model_id": "deepseek-v4-flash", "last_verified": "2026-05-18", "last_changed_at": "2026-04-24", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://api-docs.deepseek.com/quick_start/pricing", "notes": "Canonical API model_id for the DeepSeek R1 reasoning lineage (R1 launched 2025-01-20 as deepseek-reasoner; R1-0528 update 2025-05-28). Deprecated 2026-04-24 when V4 launched; still callable until scheduled discontinuation 2026-07-24, currently routing to deepseek-v4-flash thinking mode (prices captured here reflect that routing). Reasoning output is billed at the standard output rate (reasoning_tokens_billed: true). DeepSeek's pricing page no longer publishes R1-era historical rates; standalone deepseek-r1 model_id was never exposed by the API." }, { "provider": "Anthropic", "provider_url": "https://www.anthropic.com", "model_id": "claude-haiku-4-5-20251001", "display_name": "Claude Haiku 4.5", "model_family": "Claude 4", "knowledge_cutoff": "2025-02-28", "aliases": [ "claude-haiku-4-5", "anthropic.claude-haiku-4-5-20251001-v1:0", "claude-haiku-4-5@20251001" ], "context_window": 200000, "max_output_tokens": 64000, "input_per_mtok_usd": "1.0", "output_per_mtok_usd": "5.0", "cache_read_per_mtok_usd": "0.1", "cache_write_per_mtok_usd": "1.25", "batch_input_per_mtok_usd": "0.5", "batch_output_per_mtok_usd": "2.5", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "bedrock", "vertex"], "last_verified": "2026-05-17", "last_changed_at": "2025-10-01", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://platform.claude.com/docs/en/about-claude/pricing", "notes": "Anthropic's fastest model with near-frontier intelligence; positioned for high-volume agentic workloads. Cache hit is 0.1x base input ($0.10/MTok); 5-minute cache write is 1.25x ($1.25/MTok); 1-hour cache write is 2x ($2/MTok). Batch API discounts both input and output by 50%. Supports extended thinking; thinking output tokens are billed at the output rate. Reliable knowledge cutoff Feb 2025; training data cutoff Jul 2025." }, { "provider": "Anthropic", "provider_url": "https://www.anthropic.com", "model_id": "claude-sonnet-4-5-20250929", "display_name": "Claude Sonnet 4.5", "model_family": "Claude 4", "knowledge_cutoff": "2025-01-31", "aliases": [ "claude-sonnet-4-5", "anthropic.claude-sonnet-4-5-20250929-v1:0", "claude-sonnet-4-5@20250929" ], "context_window": 200000, "max_output_tokens": 64000, "input_per_mtok_usd": "3.0", "output_per_mtok_usd": "15.0", "cache_read_per_mtok_usd": "0.3", "cache_write_per_mtok_usd": "3.75", "batch_input_per_mtok_usd": "1.5", "batch_output_per_mtok_usd": "7.5", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "bedrock", "vertex"], "last_verified": "2026-05-17", "last_changed_at": "2025-09-29", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://platform.claude.com/docs/en/about-claude/pricing", "notes": "Legacy listing in Anthropic's models overview but still active. Pricing identical to Sonnet 4.6, but 200k context window (vs 1M on Sonnet 4.6). Cache hit is 0.1x base input ($0.30/MTok); 5-minute cache write is 1.25x ($3.75/MTok); 1-hour cache write is 2x ($6/MTok). Batch API discounts both input and output by 50%. Supports extended thinking; thinking output tokens are billed at the output rate. Reliable knowledge cutoff Jan 2025." }, { "provider": "Anthropic", "provider_url": "https://www.anthropic.com", "model_id": "claude-opus-4-5-20251101", "display_name": "Claude Opus 4.5", "model_family": "Claude 4", "knowledge_cutoff": "2025-05-31", "aliases": [ "claude-opus-4-5", "anthropic.claude-opus-4-5-20251101-v1:0", "claude-opus-4-5@20251101" ], "context_window": 200000, "max_output_tokens": 64000, "input_per_mtok_usd": "5.0", "output_per_mtok_usd": "25.0", "cache_read_per_mtok_usd": "0.5", "cache_write_per_mtok_usd": "6.25", "batch_input_per_mtok_usd": "2.5", "batch_output_per_mtok_usd": "12.5", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "bedrock", "vertex"], "last_verified": "2026-05-17", "last_changed_at": "2025-11-01", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://platform.claude.com/docs/en/about-claude/pricing", "notes": "Legacy listing in Anthropic's models overview but still active. Pricing identical to Opus 4.6/4.7, but 200k context window (vs 1M on 4.6/4.7) and 64k max output (vs 128k). Cache hit is 0.1x base input ($0.50/MTok); 5-minute cache write is 1.25x ($6.25/MTok); 1-hour cache write is 2x ($10/MTok). Batch API discounts both input and output by 50%. Supports extended thinking; thinking output tokens are billed at the output rate. Reliable knowledge cutoff May 2025." }, { "provider": "Anthropic", "provider_url": "https://www.anthropic.com", "model_id": "claude-3-7-sonnet-20250219", "display_name": "Claude Sonnet 3.7", "model_family": "Claude 3.7", "knowledge_cutoff": "2024-10-31", "aliases": [ "claude-3-7-sonnet-latest", "anthropic.claude-3-7-sonnet-20250219-v1:0", "claude-3-7-sonnet@20250219" ], "context_window": 200000, "max_output_tokens": 64000, "input_per_mtok_usd": "3.0", "output_per_mtok_usd": "15.0", "cache_read_per_mtok_usd": "0.3", "cache_write_per_mtok_usd": "3.75", "batch_input_per_mtok_usd": "1.5", "batch_output_per_mtok_usd": "7.5", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["bedrock", "vertex"], "deprecated_at": "2025-10-28", "replaced_by_model_id": "claude-sonnet-4-6", "confidence": "medium", "last_verified": "2026-05-17", "last_changed_at": "2025-10-28", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://platform.claude.com/docs/en/about-claude/model-deprecations", "notes": "RETIRED on the Claude API on 2026-02-19; still available on Amazon Bedrock and Google Vertex AI under partner retirement schedules. Anthropic's first reasoning model with extended thinking; can output up to 64k tokens in thinking mode (128k with the output-128k-2025-02-19 beta header). Prices are no longer listed on Anthropic's current pricing page; values sourced from OpenRouter and pricepertoken.com (confidence: medium). Cache and batch pricing inferred from Anthropic's standard multipliers (1.25x 5-min write, 0.1x cache read, 0.5x batch)." }, { "provider": "Anthropic", "provider_url": "https://www.anthropic.com", "model_id": "claude-3-5-haiku-20241022", "display_name": "Claude Haiku 3.5", "model_family": "Claude 3.5", "knowledge_cutoff": "2024-07-31", "aliases": [ "claude-3-5-haiku-latest", "anthropic.claude-3-5-haiku-20241022-v1:0", "claude-3-5-haiku@20241022" ], "context_window": 200000, "max_output_tokens": 8192, "input_per_mtok_usd": "0.8", "output_per_mtok_usd": "4.0", "cache_read_per_mtok_usd": "0.08", "cache_write_per_mtok_usd": "1.0", "batch_input_per_mtok_usd": "0.4", "batch_output_per_mtok_usd": "2.0", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": false, "deployment_options": ["bedrock", "vertex"], "deprecated_at": "2025-12-19", "replaced_by_model_id": "claude-haiku-4-5-20251001", "confidence": "medium", "last_verified": "2026-05-17", "last_changed_at": "2025-12-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://platform.claude.com/docs/en/about-claude/pricing", "notes": "RETIRED on the Claude API on 2026-02-19; still listed on Anthropic's pricing page as available on Amazon Bedrock and Google Vertex AI only. No extended-thinking support. max_output_tokens=8192 sourced from Anthropic legacy model card and OpenRouter (confidence: medium — not present in current docs). Cache hit is 0.1x base input ($0.08/MTok); 5-minute cache write is 1.25x ($1/MTok); 1-hour cache write is 2x ($1.60/MTok). Batch API discounts both input and output by 50%." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "gpt-5-mini", "display_name": "GPT-5 mini", "model_family": "GPT-5", "knowledge_cutoff": "2024-05-31", "context_window": 400000, "max_output_tokens": 128000, "input_per_mtok_usd": "0.25", "output_per_mtok_usd": "2.0", "cache_read_per_mtok_usd": "0.025", "batch_input_per_mtok_usd": "0.125", "batch_output_per_mtok_usd": "1.0", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "azure"], "last_verified": "2026-05-17", "last_changed_at": "2026-05-17", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/gpt-5-mini", "notes": "Faster, more cost-efficient GPT-5 variant for low-latency, high-volume workloads. Cached input at 10% of base ($0.025/MTok). Batch API at flat 50% off input and output. Reasoning model with adjustable reasoning_effort; reasoning tokens are billed at the output rate. PDF input via the Files API; image input native. Knowledge cutoff May 2024." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "gpt-5-nano", "display_name": "GPT-5 nano", "model_family": "GPT-5", "knowledge_cutoff": "2024-05-31", "context_window": 400000, "max_output_tokens": 128000, "input_per_mtok_usd": "0.05", "output_per_mtok_usd": "0.4", "cache_read_per_mtok_usd": "0.005", "batch_input_per_mtok_usd": "0.025", "batch_output_per_mtok_usd": "0.2", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": false, "deployment_options": ["native", "azure"], "confidence": "medium", "last_verified": "2026-05-17", "last_changed_at": "2026-05-17", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/gpt-5-nano", "notes": "Smallest GPT-5 variant; OpenAI model card lists 'Reasoning model: No' with 'Average' reasoning capability — reasoning_tokens_billed set to false on that basis (confidence: medium because other GPT-5 family members are reasoning models). Cached input at 10% of base ($0.005/MTok). Batch API at flat 50% off input and output. Knowledge cutoff May 2024." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "gpt-4.1", "display_name": "GPT-4.1", "model_family": "GPT-4.1", "knowledge_cutoff": "2024-06-01", "context_window": 1047576, "max_output_tokens": 32768, "input_per_mtok_usd": "2.0", "output_per_mtok_usd": "8.0", "cache_read_per_mtok_usd": "0.5", "batch_input_per_mtok_usd": "1.0", "batch_output_per_mtok_usd": "4.0", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": false, "deployment_options": ["native", "azure"], "last_verified": "2026-05-17", "last_changed_at": "2026-05-17", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/gpt-4.1", "notes": "Non-reasoning flagship with a ~1M-token context window (1,047,576). Cached input at 25% of base ($0.50/MTok). Batch API at flat 50% off input and output. PDF input via the Files API; image input native. Knowledge cutoff June 2024." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "gpt-4o", "display_name": "GPT-4o", "model_family": "GPT-4o", "knowledge_cutoff": "2023-10-01", "context_window": 128000, "max_output_tokens": 16384, "input_per_mtok_usd": "2.5", "output_per_mtok_usd": "10.0", "cache_read_per_mtok_usd": "1.25", "batch_input_per_mtok_usd": "1.25", "batch_output_per_mtok_usd": "5.0", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": false, "deployment_options": ["native", "azure"], "deprecated_at": "2026-04-22", "replaced_by_model_id": "gpt-4.1", "last_verified": "2026-05-17", "last_changed_at": "2026-04-22", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/gpt-4o", "notes": "Deprecated 2026-04-22 (dated alias gpt-4o-2024-05-13 scheduled for shutdown 2026-10-23 per OpenAI deprecations page); still serving on the API as of 2026-05-17. Audio input/output are NOT supported on this model_id — they live on a sibling gpt-4o-audio-preview model card with separate pricing (audio input $40/MTok, audio output $80/MTok); text-mode prices captured here. Cached input at 50% of base ($1.25/MTok). Batch API at flat 50% off input and output. Knowledge cutoff Oct 2023." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "gpt-4o-mini", "display_name": "GPT-4o mini", "model_family": "GPT-4o", "knowledge_cutoff": "2023-10-01", "context_window": 128000, "max_output_tokens": 16384, "input_per_mtok_usd": "0.15", "output_per_mtok_usd": "0.6", "cache_read_per_mtok_usd": "0.075", "batch_input_per_mtok_usd": "0.075", "batch_output_per_mtok_usd": "0.3", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": false, "deployment_options": ["native", "azure"], "last_verified": "2026-05-17", "last_changed_at": "2026-05-17", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/gpt-4o-mini", "notes": "Not on the April 2026 deprecation list; remains active. Audio input/output are NOT supported on this model_id — they live on a sibling gpt-4o-mini-audio-preview model card; text-mode prices captured here. Cached input at 50% of base ($0.075/MTok). Batch API at flat 50% off input and output. Knowledge cutoff Oct 2023." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "o3", "display_name": "OpenAI o3", "model_family": "o-series", "knowledge_cutoff": "2024-06-01", "context_window": 200000, "max_output_tokens": 100000, "input_per_mtok_usd": "2.0", "output_per_mtok_usd": "8.0", "cache_read_per_mtok_usd": "0.5", "batch_input_per_mtok_usd": "1.0", "batch_output_per_mtok_usd": "4.0", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "azure"], "last_verified": "2026-05-17", "last_changed_at": "2026-05-17", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/o3", "notes": "Reasoning model for complex tasks; reasoning tokens are billed at the output rate. Bare 'o3' alias remains active (dated o3-mini-2025-01-31 is deprecated but a separate model). OpenAI's model card notes 'o3 is succeeded by GPT-5' but does not list o3 itself as deprecated. Cached input at 25% of base ($0.50/MTok). Batch API at flat 50% off input and output. Knowledge cutoff June 2024." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "o4-mini", "display_name": "OpenAI o4-mini", "model_family": "o-series", "knowledge_cutoff": "2024-06-01", "context_window": 200000, "max_output_tokens": 100000, "input_per_mtok_usd": "1.1", "output_per_mtok_usd": "4.4", "cache_read_per_mtok_usd": "0.275", "batch_input_per_mtok_usd": "0.55", "batch_output_per_mtok_usd": "2.2", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "azure"], "deprecated_at": "2026-04-22", "replaced_by_model_id": "gpt-5-mini", "last_verified": "2026-05-17", "last_changed_at": "2026-04-22", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/o4-mini", "notes": "Fast, cost-efficient reasoning model; reasoning tokens are billed at the output rate. Deprecated 2026-04-22 (dated alias o4-mini-2025-04-16 scheduled for shutdown 2026-10-23 per OpenAI deprecations page); still serving as of 2026-05-17. OpenAI's model card notes 'succeeded by GPT-5 mini'. Cached input at 25% of base ($0.275/MTok). Batch API at flat 50% off input and output. Knowledge cutoff June 2024." }, { "provider": "OpenAI", "provider_url": "https://openai.com", "model_id": "o1", "display_name": "OpenAI o1", "model_family": "o-series", "knowledge_cutoff": "2023-10-01", "context_window": 200000, "max_output_tokens": 100000, "input_per_mtok_usd": "15.0", "output_per_mtok_usd": "60.0", "cache_read_per_mtok_usd": "7.5", "batch_input_per_mtok_usd": "7.5", "batch_output_per_mtok_usd": "30.0", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "azure"], "deprecated_at": "2026-04-22", "replaced_by_model_id": "o3", "last_verified": "2026-05-17", "last_changed_at": "2026-04-22", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://developers.openai.com/api/docs/models/o1", "notes": "First-generation reasoning model; reasoning tokens are billed at the output rate. Deprecated 2026-04-22 (dated alias o1-2024-12-17 scheduled for shutdown 2026-10-23 per OpenAI deprecations page); still serving as of 2026-05-17. OpenAI's recommended replacement on the developer community post is gpt-5.5; replaced_by_model_id set to in-file successor 'o3' (gpt-5.5 not yet in dataset). Cached input at 50% of base ($7.50/MTok). Batch API at flat 50% off input and output. Knowledge cutoff Oct 2023." }, { "provider": "Google", "provider_url": "https://deepmind.google", "model_id": "gemini-2.5-flash", "display_name": "Gemini 2.5 Flash", "model_family": "Gemini 2.5", "knowledge_cutoff": "2025-01-31", "context_window": 1048576, "max_output_tokens": 65536, "input_per_mtok_usd": "0.30", "output_per_mtok_usd": "2.50", "audio_input_per_mtok_usd": "1.00", "cache_read_per_mtok_usd": "0.03", "cache_storage_per_mtok_per_hour_usd": "1.00", "batch_input_per_mtok_usd": "0.15", "batch_output_per_mtok_usd": "1.25", "modalities": { "input": ["text", "image", "audio", "video"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": true, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "vertex"], "confidence": "high", "last_verified": "2026-05-26", "last_changed_at": "2026-05-18", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://ai.google.dev/gemini-api/docs/pricing", "notes": "Hybrid reasoning model with dynamic thinking on by default; thinking can be disabled via thinkingBudget=0. When thinking is on, response pricing is the sum of output and thinking tokens (both billed at the output rate). Audio input is priced separately (captured in audio_input_per_mtok_usd); cached audio input is $0.10/MTok vs $0.03/MTok for text/image/video. Batch Mode at flat 50% off; audio batch input is $0.50/MTok. No long-context tier. Context window and max_output_tokens confirmed via ai.google.dev/gemini-api/docs/models/gemini-2.5-flash. Knowledge cutoff January 2025. Explicit context caching storage is captured in cache_storage_per_mtok_per_hour_usd. Free tier (AI Studio) is published as per-minute RPM/TPM only, not per-day; free_tier omitted. Cross-verified prompt/completion price against openrouter.ai/google/gemini-2.5-flash." }, { "provider": "Google", "provider_url": "https://deepmind.google", "model_id": "gemini-2.5-flash-lite", "display_name": "Gemini 2.5 Flash-Lite", "model_family": "Gemini 2.5", "knowledge_cutoff": "2025-01-31", "context_window": 1048576, "max_output_tokens": 65536, "input_per_mtok_usd": "0.10", "output_per_mtok_usd": "0.40", "audio_input_per_mtok_usd": "0.30", "cache_read_per_mtok_usd": "0.01", "cache_storage_per_mtok_per_hour_usd": "1.00", "batch_input_per_mtok_usd": "0.05", "batch_output_per_mtok_usd": "0.20", "modalities": { "input": ["text", "image", "audio", "video"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": true, "supports_audio_out": false, "supports_pdf": true, "reasoning_tokens_billed": true, "deployment_options": ["native", "vertex"], "confidence": "high", "last_verified": "2026-05-26", "last_changed_at": "2026-05-18", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://ai.google.dev/gemini-api/docs/pricing", "notes": "Hybrid reasoning model; thinking is OFF by default (unlike 2.5 Flash/Pro) but can be enabled by setting thinkingBudget. When thinking is enabled, response pricing is the sum of output and thinking tokens at the output rate, so reasoning_tokens_billed is true. Audio input is priced separately (captured in audio_input_per_mtok_usd); cached audio input is $0.03/MTok vs $0.01/MTok for text/image/video. Batch Mode at flat 50% off; audio batch input is $0.15/MTok. No long-context tier. Context window and max_output_tokens confirmed via ai.google.dev/gemini-api/docs/models/gemini-2.5-flash-lite. Knowledge cutoff January 2025. Explicit context caching storage is captured in cache_storage_per_mtok_per_hour_usd. Free tier (AI Studio) is published as per-minute RPM/TPM only, not per-day; free_tier omitted. Cross-verified prompt/completion price against openrouter.ai/google/gemini-2.5-flash-lite." }, { "provider": "Google", "provider_url": "https://deepmind.google", "model_id": "gemini-2.0-flash", "display_name": "Gemini 2.0 Flash", "model_family": "Gemini 2.0", "knowledge_cutoff": "2024-08-31", "context_window": 1048576, "max_output_tokens": 8192, "input_per_mtok_usd": "0.10", "output_per_mtok_usd": "0.40", "audio_input_per_mtok_usd": "0.70", "cache_read_per_mtok_usd": "0.025", "cache_storage_per_mtok_per_hour_usd": "1.00", "batch_input_per_mtok_usd": "0.05", "batch_output_per_mtok_usd": "0.20", "modalities": { "input": ["text", "image", "audio", "video"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": true, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native", "vertex"], "deprecated_at": "2026-06-01", "replaced_by_model_id": "gemini-2.5-flash", "confidence": "medium", "last_verified": "2026-05-26", "last_changed_at": "2026-05-18", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://ai.google.dev/gemini-api/docs/pricing", "notes": "Deprecated; scheduled shutdown 2026-06-01. Google's documented migration target is a Gemini 3 Flash preview model not yet in this dataset; replaced_by_model_id points at gemini-2.5-flash as the closest in-file successor. Standard production 2.0 Flash does not support thinking (thinking exists only on Gemini 2.5+ and 3 series per ai.google.dev/gemini-api/docs/thinking); reasoning_tokens_billed=false. Audio input is priced separately (captured in audio_input_per_mtok_usd); cached audio input is $0.175/MTok vs $0.025/MTok for text/image/video. Batch Mode at flat 50% off. No long-context tier. supports_pdf=false since the 2.0 Flash model card lists supported inputs as audio/images/video/text (PDF not enumerated). Confidence medium because Vertex AI's published pricing for the same model name differs ($0.15 input / $0.60 output) from AI Studio's $0.10/$0.40; AI Studio primary value retained per spec, and OpenRouter (openrouter.ai/google/gemini-2.0-flash-001) cross-confirms $0.10/$0.40. Explicit context caching storage is captured in cache_storage_per_mtok_per_hour_usd. Free tier (AI Studio) is published as per-minute RPM/TPM only, not per-day; free_tier omitted." }, { "provider": "Meta", "provider_url": "https://www.llama.com", "model_id": "llama-4-maverick", "display_name": "Llama 4 Maverick", "model_family": "Llama 4", "knowledge_cutoff": "2024-08-31", "aliases": [ "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", "meta-llama/llama-4-maverick" ], "context_window": 1048576, "max_output_tokens": 8192, "input_per_mtok_usd": "0.27", "output_per_mtok_usd": "0.85", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["together"], "aggregators": ["openrouter"], "confidence": "high", "last_verified": "2026-05-26", "last_changed_at": "2026-05-18", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct", "notes": "Multi-host pricing snapshot 2026-05-18: Together $0.27/$0.85 per MTok (meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8) is the row's structured price as the lowest direct-host rate. OpenRouter aggregator routes at $0.15/$0.60 (informational only; structured input/output stay at the lowest direct-host price per the PR4 convention) and is structured as aggregators[\"openrouter\"]. Groq does not list Maverick on its public pricing page; Fireworks does not offer Maverick on serverless (on-demand deployments only). Context window is Meta's published 1M (1048576 tokens); OpenRouter advertises 1.05M but the HuggingFace model card spec is 1M. max_output_tokens not published on the model card; defaulted to 8192. 17B activated / 400B total MoE with 128 experts. deployment_options[] omits hosts whose canonical slug or full pricing could not be confirmed in a single primary source on this date (Fireworks no-serverless, Groq absent)." }, { "provider": "Meta", "provider_url": "https://www.llama.com", "model_id": "llama-4-scout", "display_name": "Llama 4 Scout", "model_family": "Llama 4", "knowledge_cutoff": "2024-08-31", "aliases": [ "meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/llama-4-scout-17b-16e-instruct", "accounts/fireworks/models/llama4-scout-instruct-basic", "meta-llama/llama-4-scout" ], "context_window": 10485760, "max_output_tokens": 8192, "input_per_mtok_usd": "0.11", "output_per_mtok_usd": "0.34", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["together", "fireworks", "groq"], "aggregators": ["openrouter"], "confidence": "high", "last_verified": "2026-05-26", "last_changed_at": "2026-05-18", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", "notes": "Multi-host pricing snapshot 2026-05-18: Together $0.18/$0.59 per MTok (meta-llama/Llama-4-Scout-17B-16E-Instruct); Fireworks $0.15/$0.60 (accounts/fireworks/models/llama4-scout-instruct-basic); Groq $0.11/$0.34 (meta-llama/llama-4-scout-17b-16e-instruct) is the lowest direct-host rate and is the row's structured price. OpenRouter aggregator routes at $0.08/$0.30 (informational only; structured input/output stay at the lowest direct-host price per the PR4 convention) and is structured as aggregators[\"openrouter\"]. Context window is Meta's published 10M (10485760 tokens); most hosts cap below Meta's spec (e.g. Groq, Fireworks typically expose ~128K-1M at the API). max_output_tokens not published on the model card; defaulted to 8192. 17B activated / 109B total MoE with 16 experts." }, { "provider": "Meta", "provider_url": "https://www.llama.com", "model_id": "llama-3.3-70b", "display_name": "Llama 3.3 70B Instruct", "model_family": "Llama 3.3", "knowledge_cutoff": "2023-12-31", "aliases": [ "meta-llama/Llama-3.3-70B-Instruct", "meta-llama/Llama-3.3-70B-Instruct-Turbo", "llama-3.3-70b-versatile", "meta-llama/llama-3.3-70b-instruct" ], "context_window": 131072, "max_output_tokens": 8192, "input_per_mtok_usd": "0.59", "output_per_mtok_usd": "0.79", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["together", "groq"], "aggregators": ["openrouter"], "confidence": "high", "last_verified": "2026-05-26", "last_changed_at": "2026-05-18", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", "notes": "Multi-host pricing snapshot 2026-05-18: Together $0.88/$0.88 per MTok (meta-llama/Llama-3.3-70B-Instruct-Turbo); Groq $0.59/$0.79 (llama-3.3-70b-versatile) is the lowest direct-host rate and is the row's structured price. Fireworks publishes $0.90 input on accounts/fireworks/models/llama-v3p3-70b-instruct but output price was not captured from a single primary source on this date so Fireworks is omitted from deployment_options. OpenRouter aggregator routes at $0.10/$0.32 (informational only; structured input/output stay at the lowest direct-host price per the PR4 convention) and is structured as aggregators[\"openrouter\"]. Context window 128K per Meta's spec (131072 tokens). Text-only; no vision. max_output_tokens not published on the model card; defaulted to 8192. Knowledge cutoff December 2023 per model card." }, { "provider": "Mistral", "provider_url": "https://mistral.ai", "model_id": "mistral-large-2411", "display_name": "Mistral Large 2 (24.11)", "model_family": "Mistral Large", "aliases": [ "mistral-large-latest", "mistral-large-2407", "mistral.mistral-large-2407-v1:0" ], "context_window": 131072, "max_output_tokens": 8192, "input_per_mtok_usd": "2.00", "output_per_mtok_usd": "6.00", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native", "bedrock", "vertex", "azure"], "last_verified": "2026-05-18", "last_changed_at": "2024-11-19", "deprecated_at": "2026-02-27", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://mistral.ai/news/mistral-large-2407", "notes": "La Plateforme rates ($2.00 / $6.00 per MTok) are the row's structured price. Multi-host availability: Bedrock (mistral.mistral-large-2407-v1:0 in us-west-2), Vertex AI, Azure AI Foundry, IBM watsonx. Bedrock published the 24.07 build only, not 24.11. Deprecated on La Plateforme 2026-02-27 (retirement 2026-05-31 per Mistral's legacy table); `mistral-large-latest` now resolves to Mistral Large 3 (not in this dataset). Text-only; no vision (Pixtral Large is the multimodal sibling). max_output_tokens not published on the model card; defaulted to 8192. Batch API is a 50% discount where available but per-model availability is not confirmed from a single primary source on this date, so batch fields are unset. Knowledge cutoff not published by Mistral." }, { "provider": "Mistral", "provider_url": "https://mistral.ai", "model_id": "mistral-medium-2505", "display_name": "Mistral Medium 3", "model_family": "Mistral Medium", "aliases": ["mistral-medium-latest"], "context_window": 131072, "max_output_tokens": 8192, "input_per_mtok_usd": "0.40", "output_per_mtok_usd": "2.00", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native"], "last_verified": "2026-05-18", "last_changed_at": "2025-05-07", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://mistral.ai/news/mistral-medium-3", "notes": "La Plateforme rates ($0.40 / $2.00 per MTok) are the row's structured price. Mistral's launch post (2025-05-07) lists La Plateforme and Amazon SageMaker at GA with IBM watsonx, NVIDIA NIM, Azure AI Foundry, and Google Cloud Vertex as forthcoming; SageMaker is not in the deployment_options enum and Bedrock has not been confirmed, so deployment_options is restricted to native. Optimized for agentic and coding use cases. max_output_tokens not published on the model card; defaulted to 8192. Batch API discount per-model availability not confirmed on this date, so batch fields are unset. Knowledge cutoff not published by Mistral." }, { "provider": "Mistral", "provider_url": "https://mistral.ai", "model_id": "mistral-small-2501", "display_name": "Mistral Small 3", "model_family": "Mistral Small", "aliases": ["mistralai/Mistral-Small-24B-Instruct-2501"], "context_window": 32768, "max_output_tokens": 8192, "input_per_mtok_usd": "0.10", "output_per_mtok_usd": "0.30", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native"], "last_verified": "2026-05-18", "last_changed_at": "2025-01-30", "deprecated_at": "2025-11-06", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://mistral.ai/news/mistral-small-3", "notes": "La Plateforme rates ($0.10 / $0.30 per MTok) are the row's structured price; per Mistral's launch post, half the price of the previous mistral-small ($0.20 / $0.60). 24B-parameter latency-optimized model under Apache 2.0; text-only. Context window 32K per Mistral's spec (33000 tokens rounded; 32768 used here). Deprecated on La Plateforme 2025-11-06 and retired 2025-11-30 per Mistral's legacy table; successor (mistral-small-2503 / 3.1 with vision and 128K context, and later Small 3.x builds) not yet in this dataset. max_output_tokens not published on the model card; defaulted to 8192. Batch API discount per-model availability not confirmed on this date, so batch fields are unset. Knowledge cutoff not published by Mistral." }, { "provider": "Mistral", "provider_url": "https://mistral.ai", "model_id": "codestral-2508", "display_name": "Codestral 25.08", "model_family": "Codestral", "aliases": ["codestral-latest", "codestral-2"], "context_window": 262144, "max_output_tokens": 8192, "input_per_mtok_usd": "0.30", "output_per_mtok_usd": "0.90", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native", "vertex"], "last_verified": "2026-05-18", "last_changed_at": "2025-07-31", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://mistral.ai/news/codestral-25-08", "notes": "La Plateforme rates ($0.30 / $0.90 per MTok) are the row's structured price. Code-specialized model optimized for fill-in-the-middle (FIM), code completion, code correction, and test generation; supports tool use and structured output per the 25.08 release. 256K context (262144 tokens). Also available on Google Cloud Vertex AI Model Garden as `codestral-2` under the `mistralai` publisher (Mistral Docs: Vertex AI cloud deployments page). max_output_tokens not published on the model card; defaulted to 8192. Batch API discount per-model availability not confirmed on this date, so batch fields are unset. Knowledge cutoff not published by Mistral." }, { "provider": "Mistral", "provider_url": "https://mistral.ai", "model_id": "pixtral-large-2411", "display_name": "Pixtral Large", "model_family": "Pixtral", "aliases": ["pixtral-large-latest", "mistral.pixtral-large-2502-v1:0"], "context_window": 131072, "max_output_tokens": 8192, "input_per_mtok_usd": "2.00", "output_per_mtok_usd": "6.00", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native", "bedrock"], "last_verified": "2026-05-18", "last_changed_at": "2024-11-18", "deprecated_at": "2026-02-27", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://mistral.ai/news/pixtral-large", "notes": "La Plateforme rates ($2.00 / $6.00 per MTok) are the row's structured price; pricing parity with Mistral Large 2 since Pixtral Large is the multimodal 124B-parameter open-weight model built on top of Mistral Large 2. Vision-capable: handles documents, charts, and natural images alongside text. Context window 128K (131072 tokens). Bedrock publishes the 25.02 refresh (`mistral.pixtral-large-2502-v1:0`, also routed via `us.mistral.pixtral-large-2502-v1:0`), not the 24.11 build. Deprecated on La Plateforme 2026-02-27 (retirement 2026-05-31 per Mistral's legacy table); Mistral's own news page now carries a \"this model is deprecated\" banner. Successor multimodal capability is absorbed by Mistral Large 3 (not in this dataset). max_output_tokens not published on the model card; defaulted to 8192. Vertex/Azure availability not confirmed for Pixtral Large on this date. Batch API discount per-model availability not confirmed on this date, so batch fields are unset. Knowledge cutoff not published by Mistral." }, { "provider": "Cohere", "provider_url": "https://cohere.com", "model_id": "command-a-03-2025", "display_name": "Command A", "model_family": "Command A", "aliases": ["cohere.command-a-03-2025"], "context_window": 256000, "max_output_tokens": 8000, "input_per_mtok_usd": "2.50", "output_per_mtok_usd": "10.00", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native", "azure"], "confidence": "medium", "last_verified": "2026-05-18", "last_changed_at": "2025-03-01", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.cohere.com/docs/command-a", "notes": "Cohere's flagship 111B-parameter model: 256K context, text-only, optimized for tool use, RAG, agents, and 23-language multilingual workloads. Price ($2.50 / $10.00 per MTok) per artificialanalysis.ai citing Cohere's API; Command A is not listed on cohere.com/pricing as of 2026-05-18 (the public table still shows the older Command R/R+ tier), so confidence is medium. Cohere docs note AWS Bedrock availability as \"Coming Soon\" (no Bedrock SKU yet, so `bedrock` is omitted from deployment_options); Azure AI Foundry availability is published but uses per-deployment IDs, so no Azure alias is encoded. Oracle OCI exposes it as `cohere.command-a-03-2025` (kept as alias). Cache and batch pricing not published by Cohere. Knowledge cutoff not published on the Cohere model card." }, { "provider": "Cohere", "provider_url": "https://cohere.com", "model_id": "command-r-plus-08-2024", "display_name": "Command R+", "model_family": "Command R", "knowledge_cutoff": "2024-03-31", "aliases": ["command-r-plus", "cohere.command-r-plus-v1:0"], "context_window": 128000, "max_output_tokens": 4000, "input_per_mtok_usd": "2.50", "output_per_mtok_usd": "10.00", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native", "bedrock", "azure"], "last_verified": "2026-05-18", "last_changed_at": "2024-08-30", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://cohere.com/pricing", "notes": "Cohere Platform rates ($2.50 / $10.00 per MTok) are the row's structured price, listed on cohere.com/pricing as \"Command R+ 08-2024\". 128K context, text-only, optimized for complex RAG and multi-step tool use. Cohere's deprecations page sunsetted only the predecessor `command-r-plus-04-2024` on 2025-09-15 and names this 08-2024 build as the recommended replacement, so it is active on Cohere Platform. Bedrock SKU `cohere.command-r-plus-v1:0` launched Aug 2024 with a Mar 2024 knowledge cutoff (per Bedrock model card), matching this row; Bedrock marks the model \"Legacy\" with an EOL of 2026-08-19, which is a Bedrock-side lifecycle marker, not a Cohere deprecation. Azure AI Foundry availability published by Cohere; per-deployment IDs there, so no Azure alias is encoded. Cache and batch pricing not published by Cohere." }, { "provider": "xAI", "provider_url": "https://x.ai", "model_id": "grok-4-0709", "display_name": "Grok 4", "model_family": "Grok 4", "knowledge_cutoff": "2024-11-30", "aliases": ["grok-4"], "context_window": 256000, "max_output_tokens": 256000, "input_per_mtok_usd": "3.00", "output_per_mtok_usd": "15.00", "cache_read_per_mtok_usd": "0.75", "pricing_tiers": [ { "threshold_tokens": 128000, "input_per_mtok_usd": "6.00", "output_per_mtok_usd": "30.00" } ], "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": true, "deployment_options": ["native"], "deprecated_at": "2026-05-15", "replaced_by_model_id": "grok-4.3", "last_verified": "2026-05-19", "last_changed_at": "2026-05-15", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.x.ai/developers/migration/may-15-retirement", "notes": "xAI native rates ($3.00 / $15.00 per MTok, $0.75 cached input) are the row's structured price; prompts above 128K total tokens are billed at the higher pricing_tiers rate ($6.00 / $30.00) per xAI's documented long-context tiering. Grok 4 (snapshot `grok-4-0709`, released 2025-07-09) was xAI's flagship reasoning model: reasoning is always on (thinking tokens billed at the output rate, hence reasoning_tokens_billed: true), parallel tool calling and structured outputs supported, accepts text and image inputs. max_output_tokens of 256000 reflects xAI's documented \"up to 256K tokens of output\" within the shared 256K prompt+response context. Retired from the xAI API on 2026-05-15 12:00 PM PT alongside seven other legacy slugs; requests to `grok-4-0709` and `grok-4` continue to resolve but are now redirected to `grok-4.3` with `low` reasoning effort and billed at grok-4.3 rates. Successor is `grok-4.3`, captured in this dataset and referenced via `replaced_by_model_id`. xAI's API is native-only (not on Bedrock/Vertex/Azure). Batch API not published for this model." }, { "provider": "xAI", "provider_url": "https://x.ai", "model_id": "grok-3", "display_name": "Grok 3", "model_family": "Grok 3", "knowledge_cutoff": "2024-11-30", "context_window": 131072, "max_output_tokens": 131072, "input_per_mtok_usd": "3.00", "output_per_mtok_usd": "15.00", "cache_read_per_mtok_usd": "0.75", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native"], "deprecated_at": "2026-05-15", "replaced_by_model_id": "grok-4.3", "last_verified": "2026-05-19", "last_changed_at": "2026-05-15", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.x.ai/developers/migration/may-15-retirement", "notes": "xAI native rates ($3.00 / $15.00 per MTok, $0.75 cached input) are the row's structured price (xAI's pricing page and mem0/pricepertoken aggregator both report $3/$15; artificialanalysis.ai reports a higher $4/$20 — choosing xAI-aligned figures). Grok 3 (released 2025-02-19) was xAI's flagship non-reasoning chat model; text-only inputs, function calling and structured outputs supported, 131,072-token combined prompt+response context window. Not a reasoning model (direct responses, no extended chain-of-thought; the reasoning sibling was `grok-3-mini`, not in this dataset). max_output_tokens defaulted to the documented context cap; xAI does not publish a separate max-output limit beyond the shared 131,072-token window. Retired from the xAI API on 2026-05-15 12:00 PM PT; requests to `grok-3` continue to resolve but are now redirected to `grok-4.3` with `none` reasoning effort and billed at grok-4.3 rates. Successor is `grok-4.3`, captured in this dataset and referenced via `replaced_by_model_id`. xAI's API is native-only. Batch API not published for this model." }, { "provider": "xAI", "provider_url": "https://x.ai", "model_id": "grok-code-fast-1", "display_name": "Grok Code Fast 1", "model_family": "Grok Code", "context_window": 256000, "max_output_tokens": 256000, "input_per_mtok_usd": "0.20", "output_per_mtok_usd": "1.50", "cache_read_per_mtok_usd": "0.02", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": true, "deployment_options": ["native"], "deprecated_at": "2026-05-15", "replaced_by_model_id": "grok-4.3", "last_verified": "2026-05-19", "last_changed_at": "2026-05-15", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.x.ai/developers/migration/may-15-retirement", "notes": "xAI native rates ($0.20 / $1.50 per MTok, $0.02 cached input) are the row's structured price. Grok Code Fast 1 (released 2025-08-26) was xAI's speedy, economical coding-specialized reasoning model: 314B-parameter MoE architecture, 256K combined prompt+response context, agentic coding focus, visible reasoning traces (`reasoning_content` field in streaming responses), function calling and structured outputs supported, text-only. Reasoning is enabled by default so reasoning tokens are billed at the output rate. max_output_tokens defaulted to the documented 256K context cap; xAI does not publish a separate max-output limit beyond the shared window. Retired from the xAI API on 2026-05-15 12:00 PM PT; requests to `grok-code-fast-1` continue to resolve but are now redirected to `grok-4.3` with `low` reasoning effort and billed at grok-4.3 rates. Successor is `grok-4.3`, captured in this dataset and referenced via `replaced_by_model_id`. xAI's API is native-only. Batch API not published for this model. Knowledge cutoff not published by xAI." }, { "provider": "xAI", "provider_url": "https://x.ai", "model_id": "grok-4.3", "display_name": "Grok 4.3", "model_family": "Grok 4", "context_window": 1000000, "max_output_tokens": 1000000, "input_per_mtok_usd": "1.25", "output_per_mtok_usd": "2.50", "modalities": { "input": ["text", "image"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": true, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": true, "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-15", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.x.ai/docs/models", "notes": "xAI native rates ($1.25 / $2.50 per MTok) are the row's structured price, listed on docs.x.ai/docs/models and docs.x.ai/docs/pricing. Grok 4.3 is xAI's current flagship: positioned as \"the most intelligent and fastest model\" recommended for chat, coding, and general use across the Grok API. 1M-token combined prompt+response context window (a 4x expansion over the 256K window on Grok 4 / Grok Code Fast 1). Successor to `grok-4-0709`, `grok-3`, and `grok-code-fast-1`, all of which xAI retired on 2026-05-15 12:00 PM PT and now redirect to this SKU at varying default `reasoning_effort` levels (`low` for grok-4-0709 / grok-code-fast-1, `none` for grok-3). Thinking mode is exposed via the `reasoning_effort` parameter (`none` / `low` / `medium` / `high`); when reasoning is on, thinking tokens are billed at the output rate, hence reasoning_tokens_billed: true. Accepts text and image inputs, text output; parallel tool calling and structured outputs supported. max_output_tokens reflects the documented 1M-token shared window — xAI does not publish a separate max-output limit. Cached input price, knowledge cutoff, and release date are not published by xAI on the models or pricing pages as of last_verified; omitted rather than guessed. xAI's API is native-only (not on Bedrock / Vertex / Azure). Batch API not published for this model." }, { "provider": "Alibaba", "provider_url": "https://www.alibabacloud.com/product/modelstudio", "model_id": "qwen3-max", "display_name": "Qwen3-Max", "model_family": "Qwen 3 Max", "knowledge_cutoff": "2025-06-30", "context_window": 262144, "max_output_tokens": 32768, "input_per_mtok_usd": "1.20", "output_per_mtok_usd": "6.00", "batch_input_per_mtok_usd": "0.60", "batch_output_per_mtok_usd": "3.00", "pricing_tiers": [ { "threshold_tokens": 32768, "input_per_mtok_usd": "2.40", "output_per_mtok_usd": "12.00" }, { "threshold_tokens": 131072, "input_per_mtok_usd": "3.00", "output_per_mtok_usd": "15.00" } ], "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": true, "deployment_options": ["dashscope"], "last_verified": "2026-05-19", "last_changed_at": "2025-09-23", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.alibabacloud.com/help/en/model-studio/model-pricing", "notes": "Alibaba DashScope (International) tiered pricing by input-token bucket: 0-32K = $1.20 input / $6.00 output per MTok (base row rate); 32K-128K = $2.40 / $12.00; 128K-252K = $3.00 / $15.00 (captured in pricing_tiers). Batch API discounts both input and output by 50% ($0.60 / $3.00 at the base tier). Context window 262,144 tokens (DashScope publishes 252K as the top-tier pricing ceiling; Qwen team and OpenRouter publish the full 262,144 model context). Max output 32,768 tokens. Released 2025-09-23 (`qwen3-max-2025-09-23`); knowledge cutoff 2025-06-30. Hybrid thinking model: thinking mode disabled by default but available via `/think` (and disabled via `/no_think`); when enabled, thinking tokens are billed at the output rate, hence reasoning_tokens_billed: true. Text-only inputs and outputs (the Qwen3-VL family is a separate set of model_ids). Tool calling and structured outputs supported via the DashScope and OpenAI-compatible endpoints. Explicit context cache discounts cached input tokens to 10% of the standard rate, but DashScope does not publish a single cache_read figure across the tiered input rates, so cache_read_per_mtok_usd is omitted rather than guessed. Deployment via DashScope (Model Studio) only at last_verified; not on Bedrock, Vertex, Together, Fireworks, or Groq as a first-party offering." }, { "provider": "Alibaba", "provider_url": "https://www.alibabacloud.com/product/modelstudio", "model_id": "qwen3-coder-plus", "display_name": "Qwen3-Coder-Plus", "model_family": "Qwen 3 Coder", "context_window": 1048576, "max_output_tokens": 65536, "input_per_mtok_usd": "1.00", "output_per_mtok_usd": "5.00", "batch_input_per_mtok_usd": "0.50", "batch_output_per_mtok_usd": "2.50", "pricing_tiers": [ { "threshold_tokens": 32768, "input_per_mtok_usd": "1.80", "output_per_mtok_usd": "9.00" }, { "threshold_tokens": 131072, "input_per_mtok_usd": "3.00", "output_per_mtok_usd": "15.00" }, { "threshold_tokens": 262144, "input_per_mtok_usd": "6.00", "output_per_mtok_usd": "60.00" } ], "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": true, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["dashscope"], "last_verified": "2026-05-19", "last_changed_at": "2025-09-23", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://www.alibabacloud.com/help/en/model-studio/model-pricing", "notes": "Alibaba DashScope (International) tiered pricing by input-token bucket: 0-32K = $1.00 / $5.00 per MTok (base row rate); 32K-128K = $1.80 / $9.00; 128K-256K = $3.00 / $15.00; 256K-1M = $6.00 / $60.00 (captured in pricing_tiers). Batch API discounts both input and output by 50% ($0.50 / $2.50 at the base tier). 1,000,000-token context window with 65,536 max output tokens. Released 2025-09-23 (`qwen3-coder-plus-2025-09-23`). Built on the Qwen3-Coder 480B-A35B MoE base; positioned for agentic coding (robust tool calling and environment interaction). Not a thinking/reasoning SKU (no chain-of-thought billing semantics), so reasoning_tokens_billed is false. Text-only modalities. Explicit context cache discounts cached input to 10% of the standard rate; implicit cache to 20%; DashScope does not publish a single cache_read figure across tiered input rates, so cache_read_per_mtok_usd is omitted rather than guessed. Knowledge cutoff not published. Deployment via DashScope (Model Studio) only at last_verified." }, { "provider": "Perplexity", "provider_url": "https://www.perplexity.ai", "model_id": "sonar", "display_name": "Sonar", "model_family": "Sonar", "context_window": 128000, "max_output_tokens": 8000, "input_per_mtok_usd": "1.00", "output_per_mtok_usd": "1.00", "per_request_usd": "0.005", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": false, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.perplexity.ai/getting-started/pricing", "notes": "Perplexity native rates: $1.00 input / $1.00 output per MTok. Web search is built into the API as a first-class capability rather than a user-defined tool, so total cost per query = token costs + a per-request fee. per_request_usd captures the low-context tier ($5 / 1,000 requests = $0.005); medium and high search-context tiers add $8 / $12 per 1,000 requests respectively (not captured structurally — single-value field). Perplexity does not publish a separate per-search fee for this SKU (per-search metering applies to `sonar-deep-research`), so per_search_usd is omitted. 128,000-token context window; max_output_tokens 8,000 per Perplexity's documented Sonar limits. Sonar (released January 2025) is built on a fine-tuned Llama 3.3 70B base optimized for web-grounded question answering with inline citations. Knowledge cutoff is intentionally omitted: Sonar fetches the live web at query time, so a static cutoff date does not meaningfully describe its answer space. Not a reasoning SKU (no chain-of-thought tokens), hence reasoning_tokens_billed is false. supports_tool_use is set conservatively to false because web search — the model's primary capability — is exposed as a built-in feature of the Sonar endpoint, not a user-defined tool; Perplexity recommends its separate Agent API for production tool-using agents. Structured outputs supported via response_format. Perplexity API is native-only (no Bedrock / Vertex / Azure / Together / Fireworks / Groq first-party deployment). Cache and batch APIs are not published." }, { "provider": "Perplexity", "provider_url": "https://www.perplexity.ai", "model_id": "sonar-pro", "display_name": "Sonar Pro", "model_family": "Sonar", "context_window": 200000, "max_output_tokens": 8000, "input_per_mtok_usd": "3.00", "output_per_mtok_usd": "15.00", "per_request_usd": "0.006", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": false, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": false, "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.perplexity.ai/getting-started/pricing", "notes": "Perplexity native rates: $3.00 input / $15.00 output per MTok. Web search is built into the API, not exposed as a user-defined tool; total cost per query = token costs + a per-request fee. per_request_usd captures the low-context tier ($6 / 1,000 requests = $0.006); medium and high search-context tiers add $10 / $14 per 1,000 requests respectively (not captured structurally — single-value field). 200,000-token context window (largest of the Sonar family); max_output_tokens 8,000 per Perplexity's documented Sonar limits. Positioned as Perplexity's advanced search SKU for complex multi-source queries and follow-ups. Knowledge cutoff intentionally omitted: Sonar Pro fetches the live web at query time. Not a reasoning SKU, hence reasoning_tokens_billed is false. supports_tool_use set conservatively to false because Perplexity exposes web search as the built-in capability and recommends the separate Agent API for production tool-using agents. Structured outputs supported via response_format. Perplexity API is native-only (no Bedrock / Vertex / Azure / Together / Fireworks / Groq first-party deployment). Cache and batch APIs are not published." }, { "provider": "Perplexity", "provider_url": "https://www.perplexity.ai", "model_id": "sonar-reasoning-pro", "display_name": "Sonar Reasoning Pro", "model_family": "Sonar Reasoning", "context_window": 128000, "max_output_tokens": 8000, "input_per_mtok_usd": "2.00", "output_per_mtok_usd": "8.00", "per_request_usd": "0.006", "modalities": { "input": ["text"], "output": ["text"] }, "supports_tool_use": false, "structured_output": true, "supports_vision": false, "supports_audio_in": false, "supports_audio_out": false, "supports_pdf": false, "reasoning_tokens_billed": true, "deployment_options": ["native"], "last_verified": "2026-05-19", "last_changed_at": "2026-05-19", "verification_method": "manual-confirmed", "verified_by": "r13i", "source_url": "https://docs.perplexity.ai/getting-started/pricing", "notes": "Perplexity native rates: $2.00 input / $8.00 output per MTok. Web search is built into the API; total cost per query = token costs + a per-request fee. per_request_usd captures the low-context tier ($6 / 1,000 requests = $0.006); medium and high search-context tiers add $10 / $14 per 1,000 requests respectively (not captured structurally — single-value field). 128,000-token context window; max_output_tokens 8,000 per Perplexity's documented Sonar limits. Reasoning SKU built on DeepSeek R1 with Chain-of-Thought; responses include a leading `` reasoning block followed by the answer, and those reasoning tokens are billed at the output rate, hence reasoning_tokens_billed: true. Knowledge cutoff intentionally omitted: Sonar Reasoning Pro fetches the live web at query time. supports_tool_use set conservatively to false because Perplexity exposes web search as the built-in capability and recommends the separate Agent API for production tool-using agents. Structured outputs supported via response_format. Perplexity API is native-only (no Bedrock / Vertex / Azure / Together / Fireworks / Groq first-party deployment). Cache and batch APIs are not published. Note: the older `sonar-reasoning` SKU is no longer listed in Perplexity's current model lineup at last_verified." } ] }