{ "timestamp_utc": "2025-08-24 20:06:42", "benchmarks": [ { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 87.8, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 18.05 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 87.6, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 27.67 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 5", "company": "Openai", "accuracy": 87.0, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 32.52 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 86.1, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 10.41 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "o3", "company": "Openai", "accuracy": 85.6, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 16.74 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Grok 4", "company": "xAI", "accuracy": 85.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 89.25 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 84.1, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 18.46 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 83.8, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 47.09 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "o1", "company": "Openai", "accuracy": 83.5, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 26.87 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 82.7, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 31.74 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 82.5, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 16.17 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 81.4, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 16.89 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 80.7, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 6.3 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "o4 Mini", "company": "Openai", "accuracy": 80.6, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 10.44 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4.1", "company": "Openai", "accuracy": 80.5, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 8.18 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 80.4, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 52.35 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 80.0, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 7.43 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Grok 3", "company": "xAI", "accuracy": 79.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 12.1 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 79.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 9.71 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 79.4, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 6.8 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 79.0, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 35.41 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 78.9, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 25.05 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "o3 Mini", "company": "Openai", "accuracy": 78.7, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 22.38 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 78.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 6.29 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 77.9, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 24.28 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 77.4, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 4.32 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 77.2, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 3.83 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Grok 2", "company": "xAI", "accuracy": 75.5, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 8.75 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 75.3, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 3.34 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 74.4, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 8.83 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 74.1, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 9.0 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "DeepSeek V3", "company": "Deepseek", "accuracy": 73.8, "cost_input": "$0.90", "cost_output": "$0.90", "latency": 11.36 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 71.1, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 27.28 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 69.9, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 4.18 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Mistral Large (11/2024)", "company": "Mistral", "accuracy": 69.7, "cost_input": "$2.00", "cost_output": "$6.00", "latency": 7.19 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 69.6, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 5.34 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Llama 3.3 Nemotron Super (Thinking)", "company": "Nvidia", "accuracy": 69.1, "cost_input": "N", "cost_output": "A", "latency": 36.35 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 69.1, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 9.14 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 67.7, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 30.21 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Llama 3.3 Nemotron Super (Nonthinking)", "company": "Nvidia", "accuracy": 67.0, "cost_input": "N", "cost_output": "A", "latency": 17.45 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 66.0, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 3.6 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 65.5, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 1.68 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Command A", "company": "Cohere", "accuracy": 65.2, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 9.85 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Mistral Small (02/2024)", "company": "Mistral", "accuracy": 64.4, "cost_input": "$0.20", "cost_output": "$0.60", "latency": 4.66 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 64.1, "cost_input": "$1.00", "cost_output": "$5.00", "latency": 5.79 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 62.7, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 5.09 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 62.3, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 2.4 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 49.3, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 9.48 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Command R+", "company": "Cohere", "accuracy": 43.9, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 4.42 }, { "benchmark": "mmlu_pro-08-12-2025", "benchmark_group": "Academic", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 30.2, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 2.43 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "Grok 4", "company": "xAI", "accuracy": 26.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 4265.88 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "GPT 5", "company": "Openai", "accuracy": 20.0, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 2608.25 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "Gemini 2.5 Pro", "company": "Google", "accuracy": 17.1, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 2964.8 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 15.2, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 6124.9 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 6.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 3253.12 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "o4 Mini", "company": "Openai", "accuracy": 5.3, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 480.19 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 4.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 2216.29 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "Gemini 2.5 Flash (Nonthinking)", "company": "Google", "accuracy": 3.9, "cost_input": "$0.30", "cost_output": "$2.50", "latency": 350.19 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 1.7, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 171.48 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 1.3, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 1551.1 }, { "benchmark": "IOI_2025_08_11", "benchmark_group": "Coding", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 0.0, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 352.57 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "GPT 5", "company": "Openai", "accuracy": 84.6, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 14.75 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 83.6, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 3.51 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "Grok 4", "company": "xAI", "accuracy": 83.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 24.22 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 82.8, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 0.43 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "o3", "company": "Openai", "accuracy": 82.5, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 5.14 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 82.0, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 4.92 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "Grok 3", "company": "xAI", "accuracy": 82.0, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 0.44 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "GPT 4.1", "company": "Openai", "accuracy": 81.9, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 0.42 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 81.9, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 2.53 }, { "benchmark": "legal_bench-08-12-2025", "benchmark_group": "Legal", "model": "Gemini 2.5 Flash Preview (Thinking)", "company": "Google", "accuracy": 81.8, "cost_input": "$0.15", "cost_output": "$3.50", "latency": 2.66 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 5", "company": "Openai", "accuracy": 81.5, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 72.32 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 81.3, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 24.15 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "o3", "company": "Openai", "accuracy": 80.4, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 39.27 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "o4 Mini", "company": "Openai", "accuracy": 79.7, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 19.72 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 78.9, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 26.7 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 77.5, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 2.45 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "o1", "company": "Openai", "accuracy": 77.4, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 26.41 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Grok 4", "company": "xAI", "accuracy": 76.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 102.89 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 75.1, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 69.58 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 74.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 48.85 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 74.0, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 26.95 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 73.3, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 14.02 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 4.1", "company": "Openai", "accuracy": 72.4, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 12.0 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 72.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 10.82 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 71.7, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 7.91 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 71.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 8.32 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 70.9, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 30.78 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 70.5, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 8.23 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 69.8, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 45.22 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 68.8, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 46.01 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 65.5, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 43.73 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 64.0, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 47.53 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 63.0, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 12.56 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 62.2, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 45.87 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 60.0, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 13.4 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 58.8, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 10.51 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Grok 2 Vision", "company": "xAI", "accuracy": 57.3, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 18.36 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 57.2, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 41.31 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 56.6, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 45.02 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 55.1, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 7.25 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Llama 3.2 Vision (90B)", "company": "Meta", "accuracy": 48.1, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 56.34 }, { "benchmark": "mmmu-08-08-2025", "benchmark_group": "Academic", "model": "Llama 3.2 Vision (11B)", "company": "Meta", "accuracy": 38.8, "cost_input": "$0.18", "cost_output": "$0.18", "latency": 38.07 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 80.1, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 36.78 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "o3", "company": "Openai", "accuracy": 79.0, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 25.18 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3", "company": "xAI", "accuracy": 78.8, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 13.11 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "o4 Mini", "company": "Openai", "accuracy": 78.8, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 15.25 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "o1", "company": "Openai", "accuracy": 78.6, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 19.19 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 78.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 43.26 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1", "company": "Openai", "accuracy": 78.4, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 8.03 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 5", "company": "Openai", "accuracy": 78.3, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 79.88 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 78.1, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 5.83 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 77.2, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 29.41 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 76.7, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 22.04 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 76.7, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 162.88 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 76.5, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 11.74 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 76.3, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 51.41 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 75.9, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 32.23 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 75.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 31.99 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 75.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 6.44 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 75.8, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 15.32 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 75.7, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 24.96 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 75.4, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 5.46 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 75.2, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 31.71 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 75.0, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 5.91 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 75.0, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 9.54 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Flash Preview (Thinking)", "company": "Google", "accuracy": 74.4, "cost_input": "$0.15", "cost_output": "$3.50", "latency": 15.37 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 74.4, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 94.33 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 74.1, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 12.96 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "o3 Mini", "company": "Openai", "accuracy": 73.8, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 81.08 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 73.7, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 5.94 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 73.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 12.18 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.0 Flash Thinking Exp", "company": "Google", "accuracy": 73.4, "cost_input": "$0.10", "cost_output": "$0.70", "latency": 12.8 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 73.4, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 11.27 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 72.8, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 66.14 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.0 Flash Exp", "company": "Google", "accuracy": 72.3, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 7.51 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "DeepSeek V3", "company": "Deepseek", "accuracy": 72.0, "cost_input": "$0.90", "cost_output": "$0.90", "latency": 25.76 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 2", "company": "xAI", "accuracy": 71.4, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 10.63 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.0 Pro Exp", "company": "Google", "accuracy": 70.8, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 9.14 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 70.3, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 5.72 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 69.3, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 28.18 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 68.7, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 52.24 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Mistral Large (11/2024)", "company": "Mistral", "accuracy": 67.7, "cost_input": "$2.00", "cost_output": "$6.00", "latency": 12.0 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 4", "company": "xAI", "accuracy": 67.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 164.42 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.3 Nemotron Super (Thinking)", "company": "Nvidia", "accuracy": 67.1, "cost_input": "N", "cost_output": "A", "latency": 32.02 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Command A", "company": "Cohere", "accuracy": 66.6, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 10.13 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.1 Instruct Turbo (405B)", "company": "Meta", "accuracy": 66.3, "cost_input": "$3.50", "cost_output": "$3.50", "latency": 24.91 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 66.1, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 3.04 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 65.3, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 16.3 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.3 Nemotron Super (Nonthinking)", "company": "Nvidia", "accuracy": 65.2, "cost_input": "N", "cost_output": "A", "latency": 14.29 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 64.9, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 9.0 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 64.9, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 9.91 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 63.9, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 3.86 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 63.0, "cost_input": "$1.00", "cost_output": "$5.00", "latency": 5.0 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 62.9, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 7.95 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.5 Large", "company": "Ai21 Labs", "accuracy": 62.7, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 20.32 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.1 Instruct Turbo (70B)", "company": "Meta", "accuracy": 61.1, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 4.39 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 59.0, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 7.09 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Mistral Small (02/2024)", "company": "Mistral", "accuracy": 54.1, "cost_input": "$0.20", "cost_output": "$0.60", "latency": 8.97 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 53.4, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 3.74 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 50.3, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 4.39 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.5 Mini", "company": "Ai21 Labs", "accuracy": 46.9, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 4.82 }, { "benchmark": "tax_eval_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.1 Instruct Turbo (8B)", "company": "Meta", "accuracy": 39.0, "cost_input": "$0.18", "cost_output": "$0.18", "latency": 2.45 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 50.9, "cost_input": "$4.40", "cost_output": "N/A", "latency": 161.43 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "o3", "company": "Openai", "accuracy": 48.3, "cost_input": "$0.74", "cost_output": "N/A", "latency": 180.18 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "GPT 5", "company": "Openai", "accuracy": 46.9, "cost_input": "$0.78", "cost_output": "N/A", "latency": 504.18 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 46.1, "cost_input": "$4.29", "cost_output": "N/A", "latency": 135.34 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 44.5, "cost_input": "$0.85", "cost_output": "N/A", "latency": 136.75 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 44.1, "cost_input": "$1.05", "cost_output": "N/A", "latency": 156.21 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude Opus 4 (Thinking)", "company": "Anthropic", "accuracy": 43.6, "cost_input": "$3.86", "cost_output": "N/A", "latency": 150.92 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 43.5, "cost_input": "$0.89", "cost_output": "N/A", "latency": 105.2 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 42.9, "cost_input": "$0.99", "cost_output": "N/A", "latency": 124.6 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 42.3, "cost_input": "$4.00", "cost_output": "N/A", "latency": 113.27 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Grok 4", "company": "xAI", "accuracy": 40.3, "cost_input": "$1.14", "cost_output": "N/A", "latency": 516.38 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "o4 Mini", "company": "Openai", "accuracy": 36.5, "cost_input": "$0.28", "cost_output": "N/A", "latency": 162.14 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 31.7, "cost_input": "$0.13", "cost_output": "N/A", "latency": 270.68 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Pro Preview", "company": "Google", "accuracy": 29.4, "cost_input": "$0.22", "cost_output": "N/A", "latency": 80.21 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 28.3, "cost_input": "$0.07", "cost_output": "N/A", "latency": 220.89 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1", "company": "Openai", "accuracy": 24.6, "cost_input": "$0.24", "cost_output": "N/A", "latency": 66.47 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3", "company": "xAI", "accuracy": 24.1, "cost_input": "$0.44", "cost_output": "N/A", "latency": 64.48 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 20.8, "cost_input": "$0.07", "cost_output": "N/A", "latency": 50.11 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "o1", "company": "Openai", "accuracy": 20.8, "cost_input": "$1.44", "cost_output": "N/A", "latency": 421.83 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 19.3, "cost_input": "$0.26", "cost_output": "N/A", "latency": 43.41 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 17.1, "cost_input": "$0.07", "cost_output": "N/A", "latency": 79.49 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 14.3, "cost_input": "$0.07", "cost_output": "N/A", "latency": 46.6 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 13.2, "cost_input": "$0.01", "cost_output": "N/A", "latency": 26.16 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "o3 Mini", "company": "Openai", "accuracy": 12.7, "cost_input": "$0.04", "cost_output": "N/A", "latency": 146.86 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 10.8, "cost_input": "$0.04", "cost_output": "N/A", "latency": 96.03 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 10.0, "cost_input": "$0.01", "cost_output": "N/A", "latency": 44.56 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 8.8, "cost_input": "$0.03", "cost_output": "N/A", "latency": 102.14 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Magistral Medium 3.1 (06/2025)", "company": "Mistral", "accuracy": 7.5, "cost_input": "$0.29", "cost_output": "N/A", "latency": 481.04 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Command A", "company": "Cohere", "accuracy": 4.3, "cost_input": "$0.58", "cost_output": "N/A", "latency": 100.95 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 3.6, "cost_input": "$0.00", "cost_output": "N/A", "latency": 10.21 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 3.4, "cost_input": "$0.00", "cost_output": "N/A", "latency": 3.5 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 2.5, "cost_input": "$0.00", "cost_output": "N/A", "latency": 63.48 }, { "benchmark": "finance_agent-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 1.7, "cost_input": "$0.04", "cost_output": "N/A", "latency": 36.01 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 94.4, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 14.28 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 94.2, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 7.9 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 93.8, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 10.1 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "o4 Mini", "company": "Openai", "accuracy": 93.4, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 8.07 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 93.0, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 5.99 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 93.0, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 23.51 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 5", "company": "Openai", "accuracy": 92.8, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 20.35 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 92.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 4.1 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 92.6, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 12.57 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 92.5, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 31.9 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 92.4, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 2.71 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 92.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 4.86 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 92.3, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 10.17 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 92.2, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 9.39 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "DeepSeek V3", "company": "Deepseek", "accuracy": 92.1, "cost_input": "$0.90", "cost_output": "$0.90", "latency": 15.68 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 92.0, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 5.79 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "o3", "company": "Openai", "accuracy": 91.7, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 6.76 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 91.7, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 18.03 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Grok 3", "company": "xAI", "accuracy": 91.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 5.83 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "o3 Mini", "company": "Openai", "accuracy": 91.3, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 15.98 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 91.1, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 3.01 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 90.9, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 11.83 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Grok 4", "company": "xAI", "accuracy": 90.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 116.62 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 90.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 14.09 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 90.9, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 5.01 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 90.7, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 6.73 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 90.4, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 10.07 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 90.4, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 6.56 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 90.4, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 3.98 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Gemini 2.5 Flash Preview (Thinking)", "company": "Google", "accuracy": 90.0, "cost_input": "$0.15", "cost_output": "$3.50", "latency": 5.95 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 89.8, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 2.13 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "o1", "company": "Openai", "accuracy": 89.3, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 11.21 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 89.3, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 22.78 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 89.2, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 2.8 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 89.0, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 1.54 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 89.0, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 5.79 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 88.0, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 3.5 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 87.8, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 2.75 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 4.1", "company": "Openai", "accuracy": 87.7, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 2.19 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Mistral Large (11/2024)", "company": "Mistral", "accuracy": 87.2, "cost_input": "$2.00", "cost_output": "$6.00", "latency": 8.04 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 86.6, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 1.41 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Llama 3.3 Nemotron Super (Thinking)", "company": "Nvidia", "accuracy": 86.4, "cost_input": "N", "cost_output": "A", "latency": 16.27 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 86.2, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 4.03 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Grok 2", "company": "xAI", "accuracy": 86.1, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 6.27 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Command A", "company": "Cohere", "accuracy": 85.7, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 8.36 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 84.6, "cost_input": "$1.00", "cost_output": "$5.00", "latency": 3.74 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 84.2, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 3.87 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Mistral Small (02/2024)", "company": "Mistral", "accuracy": 84.0, "cost_input": "$0.20", "cost_output": "$0.60", "latency": 2.95 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Jamba 1.5 Large", "company": "Ai21 Labs", "accuracy": 77.4, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 10.65 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 71.2, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 9.86 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 69.3, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 1.46 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 41.7, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 3.93 }, { "benchmark": "mgsm-2025-08-12", "benchmark_group": "Math", "model": "Jamba 1.5 Mini", "company": "Ai21 Labs", "accuracy": 29.6, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 2.99 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 86.6, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 33.67 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "o3", "company": "Openai", "accuracy": 83.9, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 63.95 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Grok 4", "company": "xAI", "accuracy": 83.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 229.4 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 83.2, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 81.7 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "o4 Mini", "company": "Openai", "accuracy": 82.2, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 32.84 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 80.4, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 109.45 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Gemini 2.5 Pro Preview", "company": "Google", "accuracy": 79.2, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 164.66 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT 5", "company": "Openai", "accuracy": 77.1, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 159.34 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 76.2, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 213.66 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "o3 Mini", "company": "Openai", "accuracy": 71.5, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 53.8 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 70.6, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 429.48 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 70.4, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 66.65 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 70.2, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 86.07 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude Opus 4 (Thinking)", "company": "Anthropic", "accuracy": 70.2, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 93.54 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 66.5, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 92.17 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 66.3, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 30.71 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 65.5, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 26.93 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 64.6, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 32.51 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 62.6, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 14.78 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 62.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 42.61 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 60.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 96.47 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 59.7, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 14.06 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Llama 3.3 Nemotron Super (Thinking)", "company": "Nvidia", "accuracy": 58.4, "cost_input": "N", "cost_output": "A", "latency": 152.4 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 58.2, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 27.59 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 56.9, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 16.41 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 56.7, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 16.23 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT 4.1", "company": "Openai", "accuracy": 54.7, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 30.56 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Grok 3", "company": "xAI", "accuracy": 52.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 5.52 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "o1", "company": "Openai", "accuracy": 50.3, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 92.08 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 49.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 11.04 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 47.3, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 9.44 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Gemini 2.5 Flash Preview (Thinking)", "company": "Google", "accuracy": 46.9, "cost_input": "$0.15", "cost_output": "$3.50", "latency": 124.95 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 44.8, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 12.48 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 43.6, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 3.36 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 43.4, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 4.35 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 42.7, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 11.8 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 41.9, "cost_input": "$1.00", "cost_output": "$5.00", "latency": 10.58 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 41.7, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 3.95 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Grok 2", "company": "xAI", "accuracy": 38.7, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 2.38 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 38.5, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 16.76 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Mistral Large (11/2024)", "company": "Mistral", "accuracy": 37.1, "cost_input": "$2.00", "cost_output": "$6.00", "latency": 5.35 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 36.9, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 2.51 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 36.3, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 2.95 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Llama 3.3 Nemotron Super (Nonthinking)", "company": "Nvidia", "accuracy": 36.3, "cost_input": "N", "cost_output": "A", "latency": 44.23 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Command A", "company": "Cohere", "accuracy": 35.1, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 11.75 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 31.8, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 4.01 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 26.4, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 7.36 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 22.3, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 4.84 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Command R+", "company": "Cohere", "accuracy": 18.2, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 5.46 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Mistral Small (02/2024)", "company": "Mistral", "accuracy": 15.8, "cost_input": "$0.20", "cost_output": "$0.60", "latency": 5.04 }, { "benchmark": "lcb-08-12-2025", "benchmark_group": "Coding", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 9.9, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 1.14 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 50.9, "cost_input": "$4.40", "cost_output": "N/A", "latency": 161.43 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "o3", "company": "Openai", "accuracy": 48.3, "cost_input": "$0.74", "cost_output": "N/A", "latency": 180.18 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "GPT 5", "company": "Openai", "accuracy": 46.9, "cost_input": "$0.78", "cost_output": "N/A", "latency": 504.18 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 46.1, "cost_input": "$4.29", "cost_output": "N/A", "latency": 135.34 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 44.5, "cost_input": "$0.85", "cost_output": "N/A", "latency": 136.75 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 44.1, "cost_input": "$1.05", "cost_output": "N/A", "latency": 156.21 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude Opus 4 (Thinking)", "company": "Anthropic", "accuracy": 43.6, "cost_input": "$3.86", "cost_output": "N/A", "latency": 150.92 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 43.5, "cost_input": "$0.89", "cost_output": "N/A", "latency": 105.2 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 42.9, "cost_input": "$0.99", "cost_output": "N/A", "latency": 124.6 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 42.3, "cost_input": "$4.00", "cost_output": "N/A", "latency": 113.27 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Grok 4", "company": "xAI", "accuracy": 40.3, "cost_input": "$1.14", "cost_output": "N/A", "latency": 516.38 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "o4 Mini", "company": "Openai", "accuracy": 36.5, "cost_input": "$0.28", "cost_output": "N/A", "latency": 162.14 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 31.7, "cost_input": "$0.13", "cost_output": "N/A", "latency": 270.68 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Gemini 2.5 Pro Preview", "company": "Google", "accuracy": 29.4, "cost_input": "$0.22", "cost_output": "N/A", "latency": 80.21 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 28.3, "cost_input": "$0.07", "cost_output": "N/A", "latency": 220.89 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "GPT 4.1", "company": "Openai", "accuracy": 24.6, "cost_input": "$0.24", "cost_output": "N/A", "latency": 66.47 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Grok 3", "company": "xAI", "accuracy": 24.1, "cost_input": "$0.44", "cost_output": "N/A", "latency": 64.48 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 20.8, "cost_input": "$0.07", "cost_output": "N/A", "latency": 50.11 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "o1", "company": "Openai", "accuracy": 20.8, "cost_input": "$1.44", "cost_output": "N/A", "latency": 421.83 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 19.3, "cost_input": "$0.26", "cost_output": "N/A", "latency": 43.41 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 17.1, "cost_input": "$0.07", "cost_output": "N/A", "latency": 79.49 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 14.3, "cost_input": "$0.07", "cost_output": "N/A", "latency": 46.6 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 13.2, "cost_input": "$0.01", "cost_output": "N/A", "latency": 26.16 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "o3 Mini", "company": "Openai", "accuracy": 12.7, "cost_input": "$0.04", "cost_output": "N/A", "latency": 146.86 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 10.8, "cost_input": "$0.04", "cost_output": "N/A", "latency": 96.03 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 10.0, "cost_input": "$0.01", "cost_output": "N/A", "latency": 44.56 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 8.8, "cost_input": "$0.03", "cost_output": "N/A", "latency": 102.14 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Magistral Medium 3.1 (06/2025)", "company": "Mistral", "accuracy": 7.5, "cost_input": "$0.29", "cost_output": "N/A", "latency": 481.04 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Command A", "company": "Cohere", "accuracy": 4.3, "cost_input": "$0.58", "cost_output": "N/A", "latency": 100.95 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 3.6, "cost_input": "$0.00", "cost_output": "N/A", "latency": 10.21 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 3.4, "cost_input": "$0.00", "cost_output": "N/A", "latency": 3.5 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 2.5, "cost_input": "$0.00", "cost_output": "N/A", "latency": 63.48 }, { "benchmark": "finance_agent", "benchmark_group": "Finance", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 1.7, "cost_input": "$0.04", "cost_output": "N/A", "latency": 36.01 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 5", "company": "Openai", "accuracy": 73.5, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 64.99 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 4", "company": "xAI", "accuracy": 73.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 28.52 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 72.2, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 43.3 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1", "company": "Openai", "accuracy": 71.2, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 10.33 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "o3", "company": "Openai", "accuracy": 71.0, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 19.13 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "o4 Mini", "company": "Openai", "accuracy": 70.1, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 17.5 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3", "company": "xAI", "accuracy": 69.1, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 23.86 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 68.6, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 14.12 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 68.4, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 17.99 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 68.0, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 22.33 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 67.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 227.37 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 66.6, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 35.03 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 66.0, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 10.42 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 65.8, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 6.56 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 63.2, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 46.8 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 62.6, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 8.67 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 62.2, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 12.3 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 60.9, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 36.59 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "DeepSeek V3", "company": "Deepseek", "accuracy": 60.7, "cost_input": "$0.90", "cost_output": "$0.90", "latency": 28.88 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 60.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": null }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 59.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 162.67 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 58.2, "cost_input": "$1.00", "cost_output": "$5.00", "latency": null }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Grok 2", "company": "xAI", "accuracy": 58.2, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 84.8 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 58.1, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 30.56 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 57.6, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 6.59 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 56.6, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 6.35 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "o3 Mini", "company": "Openai", "accuracy": 55.7, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 31.42 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 55.0, "cost_input": "$0.15", "cost_output": "$0.60", "latency": null }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Command A", "company": "Cohere", "accuracy": 54.5, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 14.34 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 53.9, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 9.22 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 53.2, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 12.25 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.0 Pro Exp", "company": "Google", "accuracy": 53.1, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 18.13 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 50.7, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 29.61 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 50.4, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 5.3 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 50.3, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 37.35 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 49.3, "cost_input": "$2.50", "cost_output": "$10.00", "latency": null }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.1 Instruct Turbo (70B)", "company": "Meta", "accuracy": 47.2, "cost_input": "$0.88", "cost_output": "$0.88", "latency": null }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.5 Large", "company": "Ai21 Labs", "accuracy": 46.6, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 10.74 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 46.6, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 28.38 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 46.0, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 4.28 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Llama 3.1 Instruct Turbo (8B)", "company": "Meta", "accuracy": 43.5, "cost_input": "$0.18", "cost_output": "$0.18", "latency": null }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Jamba 1.5 Mini", "company": "Ai21 Labs", "accuracy": 39.9, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 2.34 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 38.5, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 31.35 }, { "benchmark": "corp_fin_v2-08-12-2025", "benchmark_group": "Finance", "model": "Gemini 1.5 Flash (001)", "company": "Google", "accuracy": 32.9, "cost_input": "$0.07", "cost_output": "$0.30", "latency": null }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Grok 4", "company": "xAI", "accuracy": 88.1, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 115.52 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 5", "company": "Openai", "accuracy": 85.6, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 169.72 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "o3", "company": "Openai", "accuracy": 83.6, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 65.78 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 80.3, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 41.1 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 80.3, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 67.13 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 79.0, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 40.62 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 75.5, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 35.77 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 75.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 155.02 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "o3 Mini", "company": "Openai", "accuracy": 75.0, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 99.98 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "o4 Mini", "company": "Openai", "accuracy": 74.5, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 30.0 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 74.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 118.46 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Grok 3", "company": "xAI", "accuracy": 73.7, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 29.91 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "o1", "company": "Openai", "accuracy": 73.0, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 40.18 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 72.7, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 14.23 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 71.7, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 19.13 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 71.5, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 41.43 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 69.9, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 35.93 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 69.7, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 56.18 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 69.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 12.82 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 67.7, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 10.07 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 67.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 9.22 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 67.4, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 9.28 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 66.4, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 306.22 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 65.2, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 5.8 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4.1", "company": "Openai", "accuracy": 64.6, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 23.14 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 61.1, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 23.48 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Llama 3.3 Nemotron Super (Thinking)", "company": "Nvidia", "accuracy": 60.6, "cost_input": "N", "cost_output": "A", "latency": 77.59 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 59.6, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 49.72 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 59.1, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 7.24 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 2.5 Flash Preview (Thinking)", "company": "Google", "accuracy": 58.6, "cost_input": "$0.15", "cost_output": "$3.50", "latency": 56.69 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 58.3, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 7.71 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 56.8, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 145.09 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "DeepSeek V3", "company": "Deepseek", "accuracy": 54.0, "cost_input": "$0.90", "cost_output": "$0.90", "latency": 21.28 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 53.3, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 43.45 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 53.0, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 14.92 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Mistral Small (02/2024)", "company": "Mistral", "accuracy": 50.8, "cost_input": "$0.20", "cost_output": "$0.60", "latency": 7.99 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4o (2024-05-13)", "company": "Openai", "accuracy": 50.3, "cost_input": "$5.00", "cost_output": "$15.00", "latency": 10.08 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 50.0, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 7.34 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Grok 2", "company": "xAI", "accuracy": 50.0, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 20.18 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 46.0, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 3.39 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Mistral Large (11/2024)", "company": "Mistral", "accuracy": 45.2, "cost_input": "$2.00", "cost_output": "$6.00", "latency": 15.8 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 44.4, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 12.09 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 44.2, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 15.32 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 41.4, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 8.82 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Llama 3.3 Nemotron Super (Nonthinking)", "company": "Nvidia", "accuracy": 40.9, "cost_input": "N", "cost_output": "A", "latency": 16.48 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 39.9, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 4.19 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 37.9, "cost_input": "$1.00", "cost_output": "$5.00", "latency": 6.56 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "GPT 3.5", "company": "Openai", "accuracy": 29.3, "cost_input": "$0.50", "cost_output": "$1.50", "latency": 2.63 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Command A", "company": "Cohere", "accuracy": 29.3, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 11.49 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Command R+", "company": "Cohere", "accuracy": 29.0, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 8.19 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 24.2, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 15.47 }, { "benchmark": "gpqa-08-12-2025", "benchmark_group": "Academic", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 20.5, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 4.95 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Grok 4", "company": "xAI", "accuracy": 96.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 27.26 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 5", "company": "Openai", "accuracy": 96.0, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 32.22 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 95.4, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 45.98 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 95.2, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 25.83 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 94.8, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 12.06 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 94.6, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 142.75 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "o3", "company": "Openai", "accuracy": 94.6, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 16.59 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 94.2, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 22.77 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "o4 Mini", "company": "Openai", "accuracy": 94.2, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 12.54 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 94.2, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 37.35 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 93.8, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 63.47 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 93.8, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 22.4 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 93.0, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 30.93 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 92.2, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 156.47 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Gemini 2.5 Flash Preview (Thinking)", "company": "Google", "accuracy": 91.8, "cost_input": "$0.15", "cost_output": "$3.50", "latency": 23.66 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "o3 Mini", "company": "Openai", "accuracy": 91.8, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 14.36 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 91.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 94.24 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 91.6, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 9.5 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Llama 3.3 Nemotron Super (Thinking)", "company": "Nvidia", "accuracy": 91.4, "cost_input": "N", "cost_output": "A", "latency": 42.83 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 90.4, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 14.81 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "o1", "company": "Openai", "accuracy": 90.4, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 23.54 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 90.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 10.03 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Grok 3", "company": "xAI", "accuracy": 89.8, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 9.09 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 88.6, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 12.54 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 88.0, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 3.37 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 88.0, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 5.76 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 4.1", "company": "Openai", "accuracy": 87.2, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 12.53 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 87.0, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 14.13 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 85.2, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 7.47 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Gemini 2.0 Flash Thinking Exp", "company": "Google", "accuracy": 84.6, "cost_input": "$0.10", "cost_output": "$0.70", "latency": 11.8 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 82.8, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 5.02 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "DeepSeek V3", "company": "Deepseek", "accuracy": 80.4, "cost_input": "$0.90", "cost_output": "$0.90", "latency": 7.94 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 80.2, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 3.37 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 79.2, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 10.52 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 78.8, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 2.65 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Grok 2", "company": "xAI", "accuracy": 78.4, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 20.44 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 76.8, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 5.53 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Command A", "company": "Cohere", "accuracy": 76.2, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 8.66 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 75.2, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 12.29 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Mistral Large (11/2024)", "company": "Mistral", "accuracy": 74.4, "cost_input": "$2.00", "cost_output": "$6.00", "latency": 9.93 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 74.0, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 12.8 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 73.4, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 5.41 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 72.6, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 6.3 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 72.4, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 4.63 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Llama 3.1 Instruct Turbo (405B)", "company": "Meta", "accuracy": 71.4, "cost_input": "$3.50", "cost_output": "$3.50", "latency": 45.3 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Llama 3.3 Nemotron Super (Nonthinking)", "company": "Nvidia", "accuracy": 71.2, "cost_input": "N", "cost_output": "A", "latency": 12.55 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Mistral Small (02/2024)", "company": "Mistral", "accuracy": 70.6, "cost_input": "$0.20", "cost_output": "$0.60", "latency": 4.89 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 70.2, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 9.89 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 68.4, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 6.41 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Llama 3.1 Instruct Turbo (70B)", "company": "Meta", "accuracy": 65.0, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 9.2 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 64.2, "cost_input": "$1.00", "cost_output": "$5.00", "latency": 5.13 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 54.8, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 13.01 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Llama 3.1 Instruct Turbo (8B)", "company": "Meta", "accuracy": 44.4, "cost_input": "$0.18", "cost_output": "$0.18", "latency": 5.84 }, { "benchmark": "math500-08-12-2025", "benchmark_group": "Math", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 25.4, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 4.86 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "o1", "company": "Openai", "accuracy": 96.5, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 11.15 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 5", "company": "Openai", "accuracy": 96.3, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 35.5 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 96.2, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 12.93 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "o3", "company": "Openai", "accuracy": 96.1, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 8.51 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "o4 Mini", "company": "Openai", "accuracy": 96.0, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 6.6 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "o3 Mini", "company": "Openai", "accuracy": 94.8, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 8.39 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 93.6, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 23.61 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 93.2, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 17.49 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 93.1, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 10.7 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "o1 Preview", "company": "Openai", "accuracy": 93.0, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 16.44 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 92.9, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 11.93 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 92.7, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 26.99 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 92.5, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 13.97 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Grok 4", "company": "xAI", "accuracy": 92.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 64.25 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Grok 2", "company": "xAI", "accuracy": 92.3, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 4.09 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 91.4, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 12.38 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 4.1", "company": "Openai", "accuracy": 91.2, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 3.08 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Gemini 2.5 Flash Preview (Thinking)", "company": "Google", "accuracy": 91.0, "cost_input": "$0.15", "cost_output": "$3.50", "latency": 8.87 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 90.8, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 41.57 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 90.6, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 26.26 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 90.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 8.71 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "o1 Mini", "company": "Openai", "accuracy": 90.2, "cost_input": "$3.00", "cost_output": "$12.00", "latency": 5.89 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 90.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 15.74 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 90.1, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 7.06 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 88.6, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 4.88 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Llama 3.1 Instruct Turbo (405B)", "company": "Meta", "accuracy": 88.2, "cost_input": "$3.50", "cost_output": "$3.50", "latency": 8.75 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 88.2, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 3.39 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 86.7, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 2.25 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Llama 3.1 Instruct Turbo (70B)", "company": "Meta", "accuracy": 84.8, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 4.8 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 84.6, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 1.86 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 84.0, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 10.98 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Grok 3", "company": "xAI", "accuracy": 83.9, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 7.54 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 83.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 5.92 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 82.9, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 25.35 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 82.0, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 12.88 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 4 Turbo", "company": "Openai", "accuracy": 82.0, "cost_input": "$10.00", "cost_output": "$30.00", "latency": 8.41 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "DeepSeek V3", "company": "Deepseek", "accuracy": 80.9, "cost_input": "$0.90", "cost_output": "$0.90", "latency": 6.82 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Command A", "company": "Cohere", "accuracy": 80.5, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 2.91 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 78.2, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 7.34 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Qwen 2.5 Instruct Turbo (72B)", "company": "Alibaba", "accuracy": 77.4, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 5.57 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 76.5, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 5.93 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Mistral Large (11/2024)", "company": "Mistral", "accuracy": 76.2, "cost_input": "$2.00", "cost_output": "$6.00", "latency": 4.86 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 72.4, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 2.32 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 69.1, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 3.63 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 68.2, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 1.42 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Jamba 1.5 Large", "company": "Ai21 Labs", "accuracy": 68.1, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 6.0 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Llama 3.1 Instruct Turbo (8B)", "company": "Meta", "accuracy": 62.6, "cost_input": "$0.18", "cost_output": "$0.18", "latency": 2.37 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "GPT 3.5", "company": "Openai", "accuracy": 58.5, "cost_input": "$0.50", "cost_output": "$1.50", "latency": 1.59 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Mistral Small (02/2024)", "company": "Mistral", "accuracy": 57.0, "cost_input": "$0.20", "cost_output": "$0.60", "latency": 2.76 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Jamba 1.5 Mini", "company": "Ai21 Labs", "accuracy": 55.2, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 1.14 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Mixtral (8x7B)", "company": "Mistral", "accuracy": 53.2, "cost_input": "$0.60", "cost_output": "$0.60", "latency": 3.49 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 52.5, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 2.19 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Command R+", "company": "Cohere", "accuracy": 51.4, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 6.06 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 50.9, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 4.8 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 50.7, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 7.31 }, { "benchmark": "medqa-08-12-2025", "benchmark_group": "Healthcare", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 43.3, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 7.75 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 80.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 5.67 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 79.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 13.58 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 4.1", "company": "Openai", "accuracy": 79.0, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 5.03 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 78.8, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 8.91 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "o3", "company": "Openai", "accuracy": 78.4, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 21.22 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 78.1, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 4.22 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 77.7, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 4.6 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "o4 Mini", "company": "Openai", "accuracy": 77.1, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 13.0 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 5", "company": "Openai", "accuracy": 76.8, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 79.53 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 75.4, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 15.69 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 75.2, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 7.36 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 75.0, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 8.31 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 74.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 8.42 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 72.7, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 2.72 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 72.6, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 3.79 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 72.1, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 5.98 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 71.7, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 2.43 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 71.0, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 4.2 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 70.9, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 3.56 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Gemini 2.5 Flash Preview (Thinking)", "company": "Google", "accuracy": 69.5, "cost_input": "$0.15", "cost_output": "$3.50", "latency": 8.4 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 69.2, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 6.06 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 67.0, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 3.86 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Nova Lite", "company": "Amazon", "accuracy": 66.1, "cost_input": "$0.06", "cost_output": "$0.24", "latency": null }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 65.2, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 12.0 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 64.9, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 49.32 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Nova Pro", "company": "Amazon", "accuracy": 63.7, "cost_input": "$0.80", "cost_output": "$3.20", "latency": null }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 62.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 46.32 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 60.5, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 7.14 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 59.0, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 16.3 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Grok 4", "company": "xAI", "accuracy": 57.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 65.02 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Llama 3.2 Vision (90B)", "company": "Meta", "accuracy": 55.0, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 5.07 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 54.6, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 2.1 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 53.4, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 31.02 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Llama 3.2 Vision (11B)", "company": "Meta", "accuracy": 38.8, "cost_input": "$0.18", "cost_output": "$0.18", "latency": 2.45 }, { "benchmark": "mortgage_tax-08-08-2025", "benchmark_group": "Finance", "model": "Grok 2 Vision", "company": "xAI", "accuracy": 26.7, "cost_input": "$2.00", "cost_output": "$10.00", "latency": null }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 5", "company": "Openai", "accuracy": 93.4, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 292.2 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 92.6, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 156.63 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 90.8, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 114.15 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Grok 4", "company": "xAI", "accuracy": 90.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 133.23 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "o3 Mini", "company": "Openai", "accuracy": 86.5, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 154.65 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 86.0, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 244.71 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Gemini 2.5 Pro Exp", "company": "Google", "accuracy": 85.8, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 143.91 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "o3", "company": "Openai", "accuracy": 85.3, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 266.18 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 85.0, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 102.25 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Qwen 3 (235B)", "company": "Alibaba", "accuracy": 84.0, "cost_input": "$0.22", "cost_output": "$0.88", "latency": 242.17 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "o4 Mini", "company": "Openai", "accuracy": 83.7, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 55.11 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 83.3, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 240.6 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 78.2, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 214.5 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 76.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 271.79 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 74.0, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 153.91 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "o1", "company": "Openai", "accuracy": 71.5, "cost_input": "$15.00", "cost_output": "$60.00", "latency": 177.03 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 70.6, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 31.4 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 62.7, "cost_input": "$1.00", "cost_output": "$3.00", "latency": 124.67 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Grok 3", "company": "xAI", "accuracy": 58.7, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 63.99 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Llama 3.3 Nemotron Super (Thinking)", "company": "Nvidia", "accuracy": 53.5, "cost_input": "N", "cost_output": "A", "latency": 167.89 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 52.2, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 50.57 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 49.4, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 33.14 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 44.6, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 303.71 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 44.2, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 29.68 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Mistral Medium 3.1 (05/2025)", "company": "Mistral", "accuracy": 42.3, "cost_input": "$0.40", "cost_output": "$2.00", "latency": 65.95 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude Opus 4 (Nonthinking)", "company": "Anthropic", "accuracy": 41.3, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 37.03 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 4.1", "company": "Openai", "accuracy": 39.6, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 161.08 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 38.5, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 23.4 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Gemini 2.0 Flash (001)", "company": "Google", "accuracy": 29.8, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 11.21 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "DeepSeek V3", "company": "Deepseek", "accuracy": 27.5, "cost_input": "$0.90", "cost_output": "$0.90", "latency": 58.8 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 26.5, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 11.91 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 25.2, "cost_input": "$0.27", "cost_output": "$0.85", "latency": 15.5 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 22.3, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 18.93 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Llama 4 Scout", "company": "Meta", "accuracy": 19.0, "cost_input": "$0.18", "cost_output": "$0.59", "latency": 21.69 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Gemini 1.5 Pro (002)", "company": "Google", "accuracy": 18.7, "cost_input": "$1.25", "cost_output": "$5.00", "latency": 10.64 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Gemini 1.5 Flash (002)", "company": "Google", "accuracy": 17.3, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 5.7 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Llama 3.3 Instruct Turbo (70B)", "company": "Meta", "accuracy": 16.0, "cost_input": "$0.88", "cost_output": "$0.88", "latency": 11.24 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Grok 2", "company": "xAI", "accuracy": 15.2, "cost_input": "$2.00", "cost_output": "$10.00", "latency": 57.88 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 14.0, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 68.37 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Command A", "company": "Cohere", "accuracy": 13.3, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 23.35 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 11.9, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 15.78 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "GPT 4o Mini", "company": "Openai", "accuracy": 11.5, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 28.77 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude 3.5 Sonnet Latest", "company": "Anthropic", "accuracy": 10.0, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 9.19 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Llama 3.3 Nemotron Super (Nonthinking)", "company": "Nvidia", "accuracy": 9.4, "cost_input": "N", "cost_output": "A", "latency": 15.85 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Mistral Large (11/2024)", "company": "Mistral", "accuracy": 9.2, "cost_input": "$2.00", "cost_output": "$6.00", "latency": 19.68 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Mistral Small (02/2024)", "company": "Mistral", "accuracy": 5.6, "cost_input": "$0.20", "cost_output": "$0.60", "latency": 13.23 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Mistral Small 3.1 (03/2025)", "company": "Mistral", "accuracy": 3.5, "cost_input": "$0.07", "cost_output": "$0.30", "latency": 11.68 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Claude 3.5 Haiku Latest", "company": "Anthropic", "accuracy": 3.3, "cost_input": "$1.00", "cost_output": "$5.00", "latency": 9.05 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Jamba 1.6 Large", "company": "Ai21 Labs", "accuracy": 0.4, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 18.86 }, { "benchmark": "aime-2025-08-12", "benchmark_group": "Math", "model": "Jamba 1.6 Mini", "company": "Ai21 Labs", "accuracy": 0.4, "cost_input": "$0.20", "cost_output": "$0.40", "latency": 6.62 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 65.0, "cost_input": "$1.24", "cost_output": "N/A", "latency": 426.52 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "Grok 4", "company": "xAI", "accuracy": 58.6, "cost_input": "$1.21", "cost_output": "N/A", "latency": 704.78 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "o3", "company": "Openai", "accuracy": 49.8, "cost_input": "$1.42", "cost_output": "N/A", "latency": 620.33 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "GPT 4.1", "company": "Openai", "accuracy": 47.4, "cost_input": "$0.45", "cost_output": "N/A", "latency": 173.98 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "Gemini 2.5 Pro Preview", "company": "Google", "accuracy": 46.8, "cost_input": "$0.88", "cost_output": "N/A", "latency": 540.96 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "Grok 3", "company": "xAI", "accuracy": 42.0, "cost_input": "$1.19", "cost_output": "N/A", "latency": 123.17 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "Gemini 2.5 Flash Preview (Nonthinking)", "company": "Google", "accuracy": 35.6, "cost_input": "$0.11", "cost_output": "N/A", "latency": 251.91 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 34.8, "cost_input": "$0.13", "cost_output": "N/A", "latency": 233.12 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "Kimi K2 Instruct", "company": "Kimi", "accuracy": 34.2, "cost_input": "$0.79", "cost_output": "N/A", "latency": 498.43 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "o4 Mini", "company": "Openai", "accuracy": 33.4, "cost_input": "$1.54", "cost_output": "N/A", "latency": 976.81 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 27.2, "cost_input": "$1.53", "cost_output": "N/A", "latency": 197.58 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "Llama 4 Maverick", "company": "Meta", "accuracy": 18.4, "cost_input": "$0.12", "cost_output": "N/A", "latency": 62.48 }, { "benchmark": "swebench-2025-07-30", "benchmark_group": "Coding", "model": "Command A", "company": "Cohere", "accuracy": 0.2, "cost_input": "$0.01", "cost_output": "N/A", "latency": 5.26 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT 4.1", "company": "Openai", "accuracy": 78.1, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 35.04 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT 5 Mini", "company": "Openai", "accuracy": 77.5, "cost_input": "$0.25", "cost_output": "$2.00", "latency": 24.4 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Grok 4", "company": "xAI", "accuracy": 76.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 40.13 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Grok 3", "company": "xAI", "accuracy": 75.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 33.66 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT 5", "company": "Openai", "accuracy": 74.9, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 27.95 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT 4.1 Mini", "company": "Openai", "accuracy": 74.6, "cost_input": "$0.40", "cost_output": "$1.60", "latency": 21.16 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Claude Sonnet 4 (Nonthinking)", "company": "Anthropic", "accuracy": 74.0, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 16.25 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Claude Sonnet 4 (Thinking)", "company": "Anthropic", "accuracy": 74.0, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 24.91 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "DeepSeek V3 (03/24/2025)", "company": "Deepseek", "accuracy": 73.6, "cost_input": "$1.20", "cost_output": "$1.20", "latency": 17.58 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Gemini 2.5 Pro", "company": "Google", "accuracy": 72.7, "cost_input": "$1.25", "cost_output": "$10.00", "latency": 25.63 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Claude Opus 4.1 (Thinking)", "company": "Anthropic", "accuracy": 72.3, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 68.28 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Claude Opus 4.1 (Nonthinking)", "company": "Anthropic", "accuracy": 71.1, "cost_input": "$15.00", "cost_output": "$75.00", "latency": 35.04 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "DeepSeek R1", "company": "Deepseek", "accuracy": 70.1, "cost_input": "$3.00", "cost_output": "$8.00", "latency": 23.64 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Claude 3.7 Sonnet (Thinking)", "company": "Anthropic", "accuracy": 70.1, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 42.95 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT 4o (2024-11-20)", "company": "Openai", "accuracy": 69.8, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 39.81 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "o3", "company": "Openai", "accuracy": 69.5, "cost_input": "$2.00", "cost_output": "$8.00", "latency": 48.92 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Gemini 2.5 Flash (Nonthinking)", "company": "Google", "accuracy": 68.2, "cost_input": "$0.30", "cost_output": "$2.50", "latency": 13.17 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT OSS 120B", "company": "Openai", "accuracy": 66.6, "cost_input": "$0.15", "cost_output": "$0.60", "latency": 16.62 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Claude 3.7 Sonnet (Nonthinking)", "company": "Anthropic", "accuracy": 66.2, "cost_input": "$3.00", "cost_output": "$15.00", "latency": 14.75 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Grok 3 Mini Fast Low Reasoning", "company": "xAI", "accuracy": 65.9, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 14.44 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "o4 Mini", "company": "Openai", "accuracy": 64.0, "cost_input": "$1.10", "cost_output": "$4.40", "latency": 25.84 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "Grok 3 Mini Fast High Reasoning", "company": "xAI", "accuracy": 64.0, "cost_input": "$0.60", "cost_output": "$4.00", "latency": 21.35 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT 5 Nano", "company": "Openai", "accuracy": 63.3, "cost_input": "$0.05", "cost_output": "$0.40", "latency": 21.12 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT 4o (2024-08-06)", "company": "Openai", "accuracy": 62.1, "cost_input": "$2.50", "cost_output": "$10.00", "latency": 34.81 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT OSS 20B", "company": "Openai", "accuracy": 53.4, "cost_input": "$0.05", "cost_output": "$0.20", "latency": 18.72 }, { "benchmark": "case_law_v2-08-18-2025", "benchmark_group": "Legal", "model": "GPT 4.1 Nano", "company": "Openai", "accuracy": 51.4, "cost_input": "$0.10", "cost_output": "$0.40", "latency": 12.39 } ] }