{
    "timestamp_utc": "2025-08-24 20:06:42",
    "benchmarks": [
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 87.8,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 18.05
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 87.6,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 27.67
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 87.0,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 32.52
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 86.1,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 10.41
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "o3",
            "company": "Openai",
            "accuracy": 85.6,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 16.74
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 85.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 89.25
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 84.1,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 18.46
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 83.8,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 47.09
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "o1",
            "company": "Openai",
            "accuracy": 83.5,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 26.87
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 82.7,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 31.74
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 82.5,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 16.17
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 81.4,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 16.89
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 80.7,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 6.3
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 80.6,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 10.44
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 80.5,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 8.18
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 80.4,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 52.35
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 80.0,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 7.43
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 79.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 12.1
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 79.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 9.71
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 79.4,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 6.8
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 79.0,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 35.41
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 78.9,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 25.05
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 78.7,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 22.38
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 78.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 6.29
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 77.9,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 24.28
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 77.4,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 4.32
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 77.2,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 3.83
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 75.5,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 8.75
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 75.3,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 3.34
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 74.4,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 8.83
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 74.1,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 9.0
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "DeepSeek V3",
            "company": "Deepseek",
            "accuracy": 73.8,
            "cost_input": "$0.90",
            "cost_output": "$0.90",
            "latency": 11.36
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 71.1,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 27.28
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 69.9,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 4.18
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Large (11/2024)",
            "company": "Mistral",
            "accuracy": 69.7,
            "cost_input": "$2.00",
            "cost_output": "$6.00",
            "latency": 7.19
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 69.6,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 5.34
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 3.3 Nemotron Super (Thinking)",
            "company": "Nvidia",
            "accuracy": 69.1,
            "cost_input": "N",
            "cost_output": "A",
            "latency": 36.35
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 69.1,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 9.14
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 67.7,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 30.21
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 3.3 Nemotron Super (Nonthinking)",
            "company": "Nvidia",
            "accuracy": 67.0,
            "cost_input": "N",
            "cost_output": "A",
            "latency": 17.45
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 66.0,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 3.6
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 65.5,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 1.68
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 65.2,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 9.85
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Small (02/2024)",
            "company": "Mistral",
            "accuracy": 64.4,
            "cost_input": "$0.20",
            "cost_output": "$0.60",
            "latency": 4.66
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 64.1,
            "cost_input": "$1.00",
            "cost_output": "$5.00",
            "latency": 5.79
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 62.7,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 5.09
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 62.3,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 2.4
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 49.3,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 9.48
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Command R+",
            "company": "Cohere",
            "accuracy": 43.9,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 4.42
        },
        {
            "benchmark": "mmlu_pro-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 30.2,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 2.43
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 26.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 4265.88
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 20.0,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 2608.25
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "Gemini 2.5 Pro",
            "company": "Google",
            "accuracy": 17.1,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 2964.8
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 15.2,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 6124.9
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 6.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 3253.12
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 5.3,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 480.19
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 4.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 2216.29
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "Gemini 2.5 Flash (Nonthinking)",
            "company": "Google",
            "accuracy": 3.9,
            "cost_input": "$0.30",
            "cost_output": "$2.50",
            "latency": 350.19
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 1.7,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 171.48
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 1.3,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 1551.1
        },
        {
            "benchmark": "IOI_2025_08_11",
            "benchmark_group": "Coding",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 0.0,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 352.57
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 84.6,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 14.75
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 83.6,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 3.51
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 83.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 24.22
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 82.8,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 0.43
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "o3",
            "company": "Openai",
            "accuracy": 82.5,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 5.14
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 82.0,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 4.92
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 82.0,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 0.44
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 81.9,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 0.42
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 81.9,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 2.53
        },
        {
            "benchmark": "legal_bench-08-12-2025",
            "benchmark_group": "Legal",
            "model": "Gemini 2.5 Flash Preview (Thinking)",
            "company": "Google",
            "accuracy": 81.8,
            "cost_input": "$0.15",
            "cost_output": "$3.50",
            "latency": 2.66
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 81.5,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 72.32
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 81.3,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 24.15
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "o3",
            "company": "Openai",
            "accuracy": 80.4,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 39.27
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 79.7,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 19.72
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 78.9,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 26.7
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 77.5,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 2.45
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "o1",
            "company": "Openai",
            "accuracy": 77.4,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 26.41
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 76.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 102.89
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 75.1,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 69.58
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 74.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 48.85
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 74.0,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 26.95
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 73.3,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 14.02
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 72.4,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 12.0
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 72.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 10.82
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 71.7,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 7.91
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 71.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 8.32
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 70.9,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 30.78
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 70.5,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 8.23
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 69.8,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 45.22
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 68.8,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 46.01
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 65.5,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 43.73
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 64.0,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 47.53
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 63.0,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 12.56
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 62.2,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 45.87
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 60.0,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 13.4
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 58.8,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 10.51
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Grok 2 Vision",
            "company": "xAI",
            "accuracy": 57.3,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 18.36
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 57.2,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 41.31
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 56.6,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 45.02
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 55.1,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 7.25
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Llama 3.2 Vision (90B)",
            "company": "Meta",
            "accuracy": 48.1,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 56.34
        },
        {
            "benchmark": "mmmu-08-08-2025",
            "benchmark_group": "Academic",
            "model": "Llama 3.2 Vision (11B)",
            "company": "Meta",
            "accuracy": 38.8,
            "cost_input": "$0.18",
            "cost_output": "$0.18",
            "latency": 38.07
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 80.1,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 36.78
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o3",
            "company": "Openai",
            "accuracy": 79.0,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 25.18
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 78.8,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 13.11
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 78.8,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 15.25
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o1",
            "company": "Openai",
            "accuracy": 78.6,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 19.19
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 78.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 43.26
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 78.4,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 8.03
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 78.3,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 79.88
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 78.1,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 5.83
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 77.2,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 29.41
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 76.7,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 22.04
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 76.7,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 162.88
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 76.5,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 11.74
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 76.3,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 51.41
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 75.9,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 32.23
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 75.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 31.99
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 75.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 6.44
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 75.8,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 15.32
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 75.7,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 24.96
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 75.4,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 5.46
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 75.2,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 31.71
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 75.0,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 5.91
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 75.0,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 9.54
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Flash Preview (Thinking)",
            "company": "Google",
            "accuracy": 74.4,
            "cost_input": "$0.15",
            "cost_output": "$3.50",
            "latency": 15.37
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 74.4,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 94.33
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 74.1,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 12.96
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 73.8,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 81.08
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 73.7,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 5.94
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 73.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 12.18
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Flash Thinking Exp",
            "company": "Google",
            "accuracy": 73.4,
            "cost_input": "$0.10",
            "cost_output": "$0.70",
            "latency": 12.8
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 73.4,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 11.27
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 72.8,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 66.14
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Flash Exp",
            "company": "Google",
            "accuracy": 72.3,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 7.51
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "DeepSeek V3",
            "company": "Deepseek",
            "accuracy": 72.0,
            "cost_input": "$0.90",
            "cost_output": "$0.90",
            "latency": 25.76
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 71.4,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 10.63
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Pro Exp",
            "company": "Google",
            "accuracy": 70.8,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 9.14
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 70.3,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 5.72
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 69.3,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 28.18
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 68.7,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 52.24
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Large (11/2024)",
            "company": "Mistral",
            "accuracy": 67.7,
            "cost_input": "$2.00",
            "cost_output": "$6.00",
            "latency": 12.0
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 67.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 164.42
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.3 Nemotron Super (Thinking)",
            "company": "Nvidia",
            "accuracy": 67.1,
            "cost_input": "N/A",
            "cost_output": "N/A",
            "latency": 32.02
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 66.6,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 10.13
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.1 Instruct Turbo (405B)",
            "company": "Meta",
            "accuracy": 66.3,
            "cost_input": "$3.50",
            "cost_output": "$3.50",
            "latency": 24.91
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 66.1,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 3.04
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 65.3,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 16.3
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.3 Nemotron Super (Nonthinking)",
            "company": "Nvidia",
            "accuracy": 65.2,
            "cost_input": "N/A",
            "cost_output": "N/A",
            "latency": 14.29
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 64.9,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 9.0
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 64.9,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 9.91
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 63.9,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 3.86
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 63.0,
            "cost_input": "$1.00",
            "cost_output": "$5.00",
            "latency": 5.0
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 62.9,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 7.95
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.5 Large",
            "company": "Ai21 Labs",
            "accuracy": 62.7,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 20.32
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.1 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 61.1,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 4.39
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 59.0,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 7.09
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Small (02/2024)",
            "company": "Mistral",
            "accuracy": 54.1,
            "cost_input": "$0.20",
            "cost_output": "$0.60",
            "latency": 8.97
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 53.4,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 3.74
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 50.3,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 4.39
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.5 Mini",
            "company": "Ai21 Labs",
            "accuracy": 46.9,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 4.82
        },
        {
            "benchmark": "tax_eval_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.1 Instruct Turbo (8B)",
            "company": "Meta",
            "accuracy": 39.0,
            "cost_input": "$0.18",
            "cost_output": "$0.18",
            "latency": 2.45
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 50.9,
            "cost_input": "$4.40",
            "cost_output": "N/A",
            "latency": 161.43
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o3",
            "company": "Openai",
            "accuracy": 48.3,
            "cost_input": "$0.74",
            "cost_output": "N/A",
            "latency": 180.18
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 46.9,
            "cost_input": "$0.78",
            "cost_output": "N/A",
            "latency": 504.18
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 46.1,
            "cost_input": "$4.29",
            "cost_output": "N/A",
            "latency": 135.34
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 44.5,
            "cost_input": "$0.85",
            "cost_output": "N/A",
            "latency": 136.75
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 44.1,
            "cost_input": "$1.05",
            "cost_output": "N/A",
            "latency": 156.21
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 43.6,
            "cost_input": "$3.86",
            "cost_output": "N/A",
            "latency": 150.92
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 43.5,
            "cost_input": "$0.89",
            "cost_output": "N/A",
            "latency": 105.2
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 42.9,
            "cost_input": "$0.99",
            "cost_output": "N/A",
            "latency": 124.6
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 42.3,
            "cost_input": "$4.00",
            "cost_output": "N/A",
            "latency": 113.27
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 40.3,
            "cost_input": "$1.14",
            "cost_output": "N/A",
            "latency": 516.38
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 36.5,
            "cost_input": "$0.28",
            "cost_output": "N/A",
            "latency": 162.14
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 31.7,
            "cost_input": "$0.13",
            "cost_output": "N/A",
            "latency": 270.68
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Pro Preview",
            "company": "Google",
            "accuracy": 29.4,
            "cost_input": "$0.22",
            "cost_output": "N/A",
            "latency": 80.21
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 28.3,
            "cost_input": "$0.07",
            "cost_output": "N/A",
            "latency": 220.89
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 24.6,
            "cost_input": "$0.24",
            "cost_output": "N/A",
            "latency": 66.47
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 24.1,
            "cost_input": "$0.44",
            "cost_output": "N/A",
            "latency": 64.48
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 20.8,
            "cost_input": "$0.07",
            "cost_output": "N/A",
            "latency": 50.11
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o1",
            "company": "Openai",
            "accuracy": 20.8,
            "cost_input": "$1.44",
            "cost_output": "N/A",
            "latency": 421.83
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 19.3,
            "cost_input": "$0.26",
            "cost_output": "N/A",
            "latency": 43.41
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 17.1,
            "cost_input": "$0.07",
            "cost_output": "N/A",
            "latency": 79.49
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 14.3,
            "cost_input": "$0.07",
            "cost_output": "N/A",
            "latency": 46.6
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 13.2,
            "cost_input": "$0.01",
            "cost_output": "N/A",
            "latency": 26.16
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 12.7,
            "cost_input": "$0.04",
            "cost_output": "N/A",
            "latency": 146.86
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 10.8,
            "cost_input": "$0.04",
            "cost_output": "N/A",
            "latency": 96.03
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 10.0,
            "cost_input": "$0.01",
            "cost_output": "N/A",
            "latency": 44.56
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 8.8,
            "cost_input": "$0.03",
            "cost_output": "N/A",
            "latency": 102.14
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Magistral Medium 3.1 (06/2025)",
            "company": "Mistral",
            "accuracy": 7.5,
            "cost_input": "$0.29",
            "cost_output": "N/A",
            "latency": 481.04
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 4.3,
            "cost_input": "$0.58",
            "cost_output": "N/A",
            "latency": 100.95
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 3.6,
            "cost_input": "$0.00",
            "cost_output": "N/A",
            "latency": 10.21
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 3.4,
            "cost_input": "$0.00",
            "cost_output": "N/A",
            "latency": 3.5
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 2.5,
            "cost_input": "$0.00",
            "cost_output": "N/A",
            "latency": 63.48
        },
        {
            "benchmark": "finance_agent-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 1.7,
            "cost_input": "$0.04",
            "cost_output": "N/A",
            "latency": 36.01
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 94.4,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 14.28
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 94.2,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 7.9
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 93.8,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 10.1
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 93.4,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 8.07
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 93.0,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 5.99
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 93.0,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 23.51
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 92.8,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 20.35
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 92.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 4.1
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 92.6,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 12.57
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 92.5,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 31.9
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 92.4,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 2.71
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 92.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 4.86
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 92.3,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 10.17
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 92.2,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 9.39
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "DeepSeek V3",
            "company": "Deepseek",
            "accuracy": 92.1,
            "cost_input": "$0.90",
            "cost_output": "$0.90",
            "latency": 15.68
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 92.0,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 5.79
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "o3",
            "company": "Openai",
            "accuracy": 91.7,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 6.76
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 91.7,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 18.03
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 91.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 5.83
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 91.3,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 15.98
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 91.1,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 3.01
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 90.9,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 11.83
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 90.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 116.62
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 90.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 14.09
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 90.9,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 5.01
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 90.7,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 6.73
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 90.4,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 10.07
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 90.4,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 6.56
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 90.4,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 3.98
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 2.5 Flash Preview (Thinking)",
            "company": "Google",
            "accuracy": 90.0,
            "cost_input": "$0.15",
            "cost_output": "$3.50",
            "latency": 5.95
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 89.8,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 2.13
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "o1",
            "company": "Openai",
            "accuracy": 89.3,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 11.21
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 89.3,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 22.78
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 89.2,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 2.8
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 89.0,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 1.54
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 89.0,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 5.79
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 88.0,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 3.5
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 87.8,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 2.75
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 87.7,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 2.19
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Mistral Large (11/2024)",
            "company": "Mistral",
            "accuracy": 87.2,
            "cost_input": "$2.00",
            "cost_output": "$6.00",
            "latency": 8.04
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 86.6,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 1.41
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 3.3 Nemotron Super (Thinking)",
            "company": "Nvidia",
            "accuracy": 86.4,
            "cost_input": "N/A",
            "cost_output": "N/A",
            "latency": 16.27
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 86.2,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 4.03
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 86.1,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 6.27
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 85.7,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 8.36
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 84.6,
            "cost_input": "$1.00",
            "cost_output": "$5.00",
            "latency": 3.74
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 84.2,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 3.87
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Mistral Small (02/2024)",
            "company": "Mistral",
            "accuracy": 84.0,
            "cost_input": "$0.20",
            "cost_output": "$0.60",
            "latency": 2.95
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Jamba 1.5 Large",
            "company": "Ai21 Labs",
            "accuracy": 77.4,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 10.65
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 71.2,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 9.86
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 69.3,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 1.46
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 41.7,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 3.93
        },
        {
            "benchmark": "mgsm-2025-08-12",
            "benchmark_group": "Math",
            "model": "Jamba 1.5 Mini",
            "company": "Ai21 Labs",
            "accuracy": 29.6,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 2.99
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 86.6,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 33.67
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "o3",
            "company": "Openai",
            "accuracy": 83.9,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 63.95
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 83.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 229.4
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 83.2,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 81.7
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 82.2,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 32.84
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 80.4,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 109.45
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Gemini 2.5 Pro Preview",
            "company": "Google",
            "accuracy": 79.2,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 164.66
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 77.1,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 159.34
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 76.2,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 213.66
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 71.5,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 53.8
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 70.6,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 429.48
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 70.4,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 66.65
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 70.2,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 86.07
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude Opus 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 70.2,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 93.54
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 66.5,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 92.17
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 66.3,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 30.71
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 65.5,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 26.93
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 64.6,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 32.51
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 62.6,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 14.78
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 62.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 42.61
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 60.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 96.47
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 59.7,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 14.06
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Llama 3.3 Nemotron Super (Thinking)",
            "company": "Nvidia",
            "accuracy": 58.4,
            "cost_input": "N/A",
            "cost_output": "N/A",
            "latency": 152.4
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 58.2,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 27.59
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 56.9,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 16.41
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 56.7,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 16.23
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 54.7,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 30.56
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 52.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 5.52
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "o1",
            "company": "Openai",
            "accuracy": 50.3,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 92.08
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 49.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 11.04
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 47.3,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 9.44
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Gemini 2.5 Flash Preview (Thinking)",
            "company": "Google",
            "accuracy": 46.9,
            "cost_input": "$0.15",
            "cost_output": "$3.50",
            "latency": 124.95
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 44.8,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 12.48
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 43.6,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 3.36
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 43.4,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 4.35
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 42.7,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 11.8
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 41.9,
            "cost_input": "$1.00",
            "cost_output": "$5.00",
            "latency": 10.58
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 41.7,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 3.95
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 38.7,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 2.38
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 38.5,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 16.76
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Mistral Large (11/2024)",
            "company": "Mistral",
            "accuracy": 37.1,
            "cost_input": "$2.00",
            "cost_output": "$6.00",
            "latency": 5.35
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 36.9,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 2.51
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 36.3,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 2.95
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Llama 3.3 Nemotron Super (Nonthinking)",
            "company": "Nvidia",
            "accuracy": 36.3,
            "cost_input": "N/A",
            "cost_output": "N/A",
            "latency": 44.23
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 35.1,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 11.75
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 31.8,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 4.01
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 26.4,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 7.36
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 22.3,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 4.84
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Command R+",
            "company": "Cohere",
            "accuracy": 18.2,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 5.46
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Mistral Small (02/2024)",
            "company": "Mistral",
            "accuracy": 15.8,
            "cost_input": "$0.20",
            "cost_output": "$0.60",
            "latency": 5.04
        },
        {
            "benchmark": "lcb-08-12-2025",
            "benchmark_group": "Coding",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 9.9,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 1.14
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 50.9,
            "cost_input": "$4.40",
            "cost_output": "N/A",
            "latency": 161.43
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "o3",
            "company": "Openai",
            "accuracy": 48.3,
            "cost_input": "$0.74",
            "cost_output": "N/A",
            "latency": 180.18
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 46.9,
            "cost_input": "$0.78",
            "cost_output": "N/A",
            "latency": 504.18
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 46.1,
            "cost_input": "$4.29",
            "cost_output": "N/A",
            "latency": 135.34
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 44.5,
            "cost_input": "$0.85",
            "cost_output": "N/A",
            "latency": 136.75
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 44.1,
            "cost_input": "$1.05",
            "cost_output": "N/A",
            "latency": 156.21
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 43.6,
            "cost_input": "$3.86",
            "cost_output": "N/A",
            "latency": 150.92
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 43.5,
            "cost_input": "$0.89",
            "cost_output": "N/A",
            "latency": 105.2
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 42.9,
            "cost_input": "$0.99",
            "cost_output": "N/A",
            "latency": 124.6
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 42.3,
            "cost_input": "$4.00",
            "cost_output": "N/A",
            "latency": 113.27
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 40.3,
            "cost_input": "$1.14",
            "cost_output": "N/A",
            "latency": 516.38
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 36.5,
            "cost_input": "$0.28",
            "cost_output": "N/A",
            "latency": 162.14
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 31.7,
            "cost_input": "$0.13",
            "cost_output": "N/A",
            "latency": 270.68
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Pro Preview",
            "company": "Google",
            "accuracy": 29.4,
            "cost_input": "$0.22",
            "cost_output": "N/A",
            "latency": 80.21
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 28.3,
            "cost_input": "$0.07",
            "cost_output": "N/A",
            "latency": 220.89
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 24.6,
            "cost_input": "$0.24",
            "cost_output": "N/A",
            "latency": 66.47
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 24.1,
            "cost_input": "$0.44",
            "cost_output": "N/A",
            "latency": 64.48
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 20.8,
            "cost_input": "$0.07",
            "cost_output": "N/A",
            "latency": 50.11
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "o1",
            "company": "Openai",
            "accuracy": 20.8,
            "cost_input": "$1.44",
            "cost_output": "N/A",
            "latency": 421.83
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 19.3,
            "cost_input": "$0.26",
            "cost_output": "N/A",
            "latency": 43.41
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 17.1,
            "cost_input": "$0.07",
            "cost_output": "N/A",
            "latency": 79.49
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 14.3,
            "cost_input": "$0.07",
            "cost_output": "N/A",
            "latency": 46.6
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 13.2,
            "cost_input": "$0.01",
            "cost_output": "N/A",
            "latency": 26.16
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 12.7,
            "cost_input": "$0.04",
            "cost_output": "N/A",
            "latency": 146.86
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 10.8,
            "cost_input": "$0.04",
            "cost_output": "N/A",
            "latency": 96.03
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 10.0,
            "cost_input": "$0.01",
            "cost_output": "N/A",
            "latency": 44.56
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 8.8,
            "cost_input": "$0.03",
            "cost_output": "N/A",
            "latency": 102.14
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Magistral Medium 3.1 (06/2025)",
            "company": "Mistral",
            "accuracy": 7.5,
            "cost_input": "$0.29",
            "cost_output": "N/A",
            "latency": 481.04
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 4.3,
            "cost_input": "$0.58",
            "cost_output": "N/A",
            "latency": 100.95
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 3.6,
            "cost_input": "$0.00",
            "cost_output": "N/A",
            "latency": 10.21
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 3.4,
            "cost_input": "$0.00",
            "cost_output": "N/A",
            "latency": 3.5
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 2.5,
            "cost_input": "$0.00",
            "cost_output": "N/A",
            "latency": 63.48
        },
        {
            "benchmark": "finance_agent",
            "benchmark_group": "Finance",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 1.7,
            "cost_input": "$0.04",
            "cost_output": "N/A",
            "latency": 36.01
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 73.5,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 64.99
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 73.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 28.52
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 72.2,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 43.3
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 71.2,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 10.33
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o3",
            "company": "Openai",
            "accuracy": 71.0,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 19.13
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 70.1,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 17.5
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 69.1,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 23.86
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 68.6,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 14.12
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 68.4,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 17.99
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 68.0,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 22.33
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 67.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 227.37
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 66.6,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 35.03
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 66.0,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 10.42
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 65.8,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 6.56
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 63.2,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 46.8
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 62.6,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 8.67
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 62.2,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 12.3
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 60.9,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 36.59
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "DeepSeek V3",
            "company": "Deepseek",
            "accuracy": 60.7,
            "cost_input": "$0.90",
            "cost_output": "$0.90",
            "latency": 28.88
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 60.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": null
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 59.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 162.67
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 58.2,
            "cost_input": "$1.00",
            "cost_output": "$5.00",
            "latency": null
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 58.2,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 84.8
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 58.1,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 30.56
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 57.6,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 6.59
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 56.6,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 6.35
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 55.7,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 31.42
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 55.0,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": null
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 54.5,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 14.34
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 53.9,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 9.22
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 53.2,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 12.25
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Pro Exp",
            "company": "Google",
            "accuracy": 53.1,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 18.13
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 50.7,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 29.61
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 50.4,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 5.3
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 50.3,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 37.35
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 49.3,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": null
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.1 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 47.2,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": null
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.5 Large",
            "company": "Ai21 Labs",
            "accuracy": 46.6,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 10.74
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 46.6,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 28.38
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 46.0,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 4.28
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.1 Instruct Turbo (8B)",
            "company": "Meta",
            "accuracy": 43.5,
            "cost_input": "$0.18",
            "cost_output": "$0.18",
            "latency": null
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Jamba 1.5 Mini",
            "company": "Ai21 Labs",
            "accuracy": 39.9,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 2.34
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 38.5,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 31.35
        },
        {
            "benchmark": "corp_fin_v2-08-12-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 1.5 Flash (001)",
            "company": "Google",
            "accuracy": 32.9,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": null
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 88.1,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 115.52
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 85.6,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 169.72
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "o3",
            "company": "Openai",
            "accuracy": 83.6,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 65.78
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 80.3,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 41.1
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 80.3,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 67.13
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 79.0,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 40.62
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 75.5,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 35.77
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 75.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 155.02
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 75.0,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 99.98
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 74.5,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 30.0
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 74.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 118.46
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 73.7,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 29.91
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "o1",
            "company": "Openai",
            "accuracy": 73.0,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 40.18
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 72.7,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 14.23
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 71.7,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 19.13
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 71.5,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 41.43
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 69.9,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 35.93
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 69.7,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 56.18
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 69.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 12.82
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 67.7,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 10.07
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 67.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 9.22
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 67.4,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 9.28
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 66.4,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 306.22
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 65.2,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 5.8
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 64.6,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 23.14
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 61.1,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 23.48
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 3.3 Nemotron Super (Thinking)",
            "company": "Nvidia",
            "accuracy": 60.6,
            "cost_input": "N/A",
            "cost_output": "N/A",
            "latency": 77.59
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 59.6,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 49.72
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 59.1,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 7.24
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 2.5 Flash Preview (Thinking)",
            "company": "Google",
            "accuracy": 58.6,
            "cost_input": "$0.15",
            "cost_output": "$3.50",
            "latency": 56.69
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 58.3,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 7.71
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 56.8,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 145.09
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "DeepSeek V3",
            "company": "Deepseek",
            "accuracy": 54.0,
            "cost_input": "$0.90",
            "cost_output": "$0.90",
            "latency": 21.28
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 53.3,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 43.45
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 53.0,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 14.92
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Small (02/2024)",
            "company": "Mistral",
            "accuracy": 50.8,
            "cost_input": "$0.20",
            "cost_output": "$0.60",
            "latency": 7.99
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o (2024-05-13)",
            "company": "Openai",
            "accuracy": 50.3,
            "cost_input": "$5.00",
            "cost_output": "$15.00",
            "latency": 10.08
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 50.0,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 7.34
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 50.0,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 20.18
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 46.0,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 3.39
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Large (11/2024)",
            "company": "Mistral",
            "accuracy": 45.2,
            "cost_input": "$2.00",
            "cost_output": "$6.00",
            "latency": 15.8
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 44.4,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 12.09
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 44.2,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 15.32
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 41.4,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 8.82
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Llama 3.3 Nemotron Super (Nonthinking)",
            "company": "Nvidia",
            "accuracy": 40.9,
            "cost_input": "N",
            "cost_output": "A",
            "latency": 16.48
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 39.9,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 4.19
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 37.9,
            "cost_input": "$1.00",
            "cost_output": "$5.00",
            "latency": 6.56
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "GPT 3.5",
            "company": "Openai",
            "accuracy": 29.3,
            "cost_input": "$0.50",
            "cost_output": "$1.50",
            "latency": 2.63
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 29.3,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 11.49
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Command R+",
            "company": "Cohere",
            "accuracy": 29.0,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 8.19
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 24.2,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 15.47
        },
        {
            "benchmark": "gpqa-08-12-2025",
            "benchmark_group": "Academic",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 20.5,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 4.95
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 96.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 27.26
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 96.0,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 32.22
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 95.4,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 45.98
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 95.2,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 25.83
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 94.8,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 12.06
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 94.6,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 142.75
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "o3",
            "company": "Openai",
            "accuracy": 94.6,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 16.59
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 94.2,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 22.77
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 94.2,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 12.54
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 94.2,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 37.35
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 93.8,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 63.47
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 93.8,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 22.4
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 93.0,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 30.93
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 92.2,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 156.47
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Gemini 2.5 Flash Preview (Thinking)",
            "company": "Google",
            "accuracy": 91.8,
            "cost_input": "$0.15",
            "cost_output": "$3.50",
            "latency": 23.66
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 91.8,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 14.36
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 91.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 94.24
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 91.6,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 9.5
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Llama 3.3 Nemotron Super (Thinking)",
            "company": "Nvidia",
            "accuracy": 91.4,
            "cost_input": "N",
            "cost_output": "A",
            "latency": 42.83
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 90.4,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 14.81
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "o1",
            "company": "Openai",
            "accuracy": 90.4,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 23.54
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 90.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 10.03
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 89.8,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 9.09
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 88.6,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 12.54
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 88.0,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 3.37
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 88.0,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 5.76
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 87.2,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 12.53
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 87.0,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 14.13
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 85.2,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 7.47
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Gemini 2.0 Flash Thinking Exp",
            "company": "Google",
            "accuracy": 84.6,
            "cost_input": "$0.10",
            "cost_output": "$0.70",
            "latency": 11.8
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 82.8,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 5.02
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "DeepSeek V3",
            "company": "Deepseek",
            "accuracy": 80.4,
            "cost_input": "$0.90",
            "cost_output": "$0.90",
            "latency": 7.94
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 80.2,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 3.37
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 79.2,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 10.52
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 78.8,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 2.65
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 78.4,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 20.44
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 76.8,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 5.53
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 76.2,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 8.66
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 75.2,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 12.29
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Mistral Large (11/2024)",
            "company": "Mistral",
            "accuracy": 74.4,
            "cost_input": "$2.00",
            "cost_output": "$6.00",
            "latency": 9.93
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 74.0,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 12.8
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 73.4,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 5.41
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 72.6,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 6.3
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 72.4,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 4.63
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Llama 3.1 Instruct Turbo (405B)",
            "company": "Meta",
            "accuracy": 71.4,
            "cost_input": "$3.50",
            "cost_output": "$3.50",
            "latency": 45.3
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Llama 3.3 Nemotron Super (Nonthinking)",
            "company": "Nvidia",
            "accuracy": 71.2,
            "cost_input": "N",
            "cost_output": "A",
            "latency": 12.55
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Mistral Small (02/2024)",
            "company": "Mistral",
            "accuracy": 70.6,
            "cost_input": "$0.20",
            "cost_output": "$0.60",
            "latency": 4.89
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 70.2,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 9.89
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 68.4,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 6.41
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Llama 3.1 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 65.0,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 9.2
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 64.2,
            "cost_input": "$1.00",
            "cost_output": "$5.00",
            "latency": 5.13
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 54.8,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 13.01
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Llama 3.1 Instruct Turbo (8B)",
            "company": "Meta",
            "accuracy": 44.4,
            "cost_input": "$0.18",
            "cost_output": "$0.18",
            "latency": 5.84
        },
        {
            "benchmark": "math500-08-12-2025",
            "benchmark_group": "Math",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 25.4,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 4.86
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "o1",
            "company": "Openai",
            "accuracy": 96.5,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 11.15
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 96.3,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 35.5
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 96.2,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 12.93
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "o3",
            "company": "Openai",
            "accuracy": 96.1,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 8.51
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 96.0,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 6.6
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 94.8,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 8.39
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 93.6,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 23.61
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 93.2,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 17.49
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 93.1,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 10.7
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "o1 Preview",
            "company": "Openai",
            "accuracy": 93.0,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 16.44
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 92.9,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 11.93
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 92.7,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 26.99
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 92.5,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 13.97
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 92.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 64.25
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 92.3,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 4.09
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 91.4,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 12.38
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 91.2,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 3.08
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Gemini 2.5 Flash Preview (Thinking)",
            "company": "Google",
            "accuracy": 91.0,
            "cost_input": "$0.15",
            "cost_output": "$3.50",
            "latency": 8.87
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 90.8,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 41.57
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 90.6,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 26.26
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 90.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 8.71
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "o1 Mini",
            "company": "Openai",
            "accuracy": 90.2,
            "cost_input": "$3.00",
            "cost_output": "$12.00",
            "latency": 5.89
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 90.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 15.74
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 90.1,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 7.06
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 88.6,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 4.88
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Llama 3.1 Instruct Turbo (405B)",
            "company": "Meta",
            "accuracy": 88.2,
            "cost_input": "$3.50",
            "cost_output": "$3.50",
            "latency": 8.75
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 88.2,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 3.39
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 86.7,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 2.25
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Llama 3.1 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 84.8,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 4.8
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 84.6,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 1.86
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 84.0,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 10.98
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 83.9,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 7.54
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 83.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 5.92
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 82.9,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 25.35
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 82.0,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 12.88
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 4 Turbo",
            "company": "Openai",
            "accuracy": 82.0,
            "cost_input": "$10.00",
            "cost_output": "$30.00",
            "latency": 8.41
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "DeepSeek V3",
            "company": "Deepseek",
            "accuracy": 80.9,
            "cost_input": "$0.90",
            "cost_output": "$0.90",
            "latency": 6.82
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 80.5,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 2.91
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 78.2,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 7.34
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Qwen 2.5 Instruct Turbo (72B)",
            "company": "Alibaba",
            "accuracy": 77.4,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 5.57
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 76.5,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 5.93
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Mistral Large (11/2024)",
            "company": "Mistral",
            "accuracy": 76.2,
            "cost_input": "$2.00",
            "cost_output": "$6.00",
            "latency": 4.86
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 72.4,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 2.32
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 69.1,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 3.63
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 68.2,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 1.42
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Jamba 1.5 Large",
            "company": "Ai21 Labs",
            "accuracy": 68.1,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 6.0
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Llama 3.1 Instruct Turbo (8B)",
            "company": "Meta",
            "accuracy": 62.6,
            "cost_input": "$0.18",
            "cost_output": "$0.18",
            "latency": 2.37
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "GPT 3.5",
            "company": "Openai",
            "accuracy": 58.5,
            "cost_input": "$0.50",
            "cost_output": "$1.50",
            "latency": 1.59
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Mistral Small (02/2024)",
            "company": "Mistral",
            "accuracy": 57.0,
            "cost_input": "$0.20",
            "cost_output": "$0.60",
            "latency": 2.76
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Jamba 1.5 Mini",
            "company": "Ai21 Labs",
            "accuracy": 55.2,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 1.14
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Mixtral (8x7B)",
            "company": "Mistral",
            "accuracy": 53.2,
            "cost_input": "$0.60",
            "cost_output": "$0.60",
            "latency": 3.49
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 52.5,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 2.19
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Command R+",
            "company": "Cohere",
            "accuracy": 51.4,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 6.06
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 50.9,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 4.8
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 50.7,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 7.31
        },
        {
            "benchmark": "medqa-08-12-2025",
            "benchmark_group": "Healthcare",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 43.3,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 7.75
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 80.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 5.67
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 79.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 13.58
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 79.0,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 5.03
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 78.8,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 8.91
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "o3",
            "company": "Openai",
            "accuracy": 78.4,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 21.22
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 78.1,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 4.22
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 77.7,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 4.6
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 77.1,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 13.0
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 76.8,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 79.53
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 75.4,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 15.69
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 75.2,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 7.36
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 75.0,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 8.31
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 74.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 8.42
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 72.7,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 2.72
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 72.6,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 3.79
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 72.1,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 5.98
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 71.7,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 2.43
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 71.0,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 4.2
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 70.9,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 3.56
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 2.5 Flash Preview (Thinking)",
            "company": "Google",
            "accuracy": 69.5,
            "cost_input": "$0.15",
            "cost_output": "$3.50",
            "latency": 8.4
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 69.2,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 6.06
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 67.0,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 3.86
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Nova Lite",
            "company": "Amazon",
            "accuracy": 66.1,
            "cost_input": "$0.06",
            "cost_output": "$0.24",
            "latency": null
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 65.2,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 12.0
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 64.9,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 49.32
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Nova Pro",
            "company": "Amazon",
            "accuracy": 63.7,
            "cost_input": "$0.80",
            "cost_output": "$3.20",
            "latency": null
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 62.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 46.32
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 60.5,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 7.14
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 59.0,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 16.3
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 57.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 65.02
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.2 Vision (90B)",
            "company": "Meta",
            "accuracy": 55.0,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 5.07
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 54.6,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 2.1
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 53.4,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 31.02
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Llama 3.2 Vision (11B)",
            "company": "Meta",
            "accuracy": 38.8,
            "cost_input": "$0.18",
            "cost_output": "$0.18",
            "latency": 2.45
        },
        {
            "benchmark": "mortgage_tax-08-08-2025",
            "benchmark_group": "Finance",
            "model": "Grok 2 Vision",
            "company": "xAI",
            "accuracy": 26.7,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": null
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 93.4,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 292.2
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 92.6,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 156.63
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 90.8,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 114.15
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 90.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 133.23
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "o3 Mini",
            "company": "Openai",
            "accuracy": 86.5,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 154.65
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 86.0,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 244.71
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 2.5 Pro Exp",
            "company": "Google",
            "accuracy": 85.8,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 143.91
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "o3",
            "company": "Openai",
            "accuracy": 85.3,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 266.18
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 85.0,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 102.25
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Qwen 3 (235B)",
            "company": "Alibaba",
            "accuracy": 84.0,
            "cost_input": "$0.22",
            "cost_output": "$0.88",
            "latency": 242.17
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 83.7,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 55.11
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 83.3,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 240.6
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 78.2,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 214.5
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 76.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 271.79
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 74.0,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 153.91
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "o1",
            "company": "Openai",
            "accuracy": 71.5,
            "cost_input": "$15.00",
            "cost_output": "$60.00",
            "latency": 177.03
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 70.6,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 31.4
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 62.7,
            "cost_input": "$1.00",
            "cost_output": "$3.00",
            "latency": 124.67
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 58.7,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 63.99
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 3.3 Nemotron Super (Thinking)",
            "company": "Nvidia",
            "accuracy": 53.5,
            "cost_input": "N",
            "cost_output": "A",
            "latency": 167.89
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 52.2,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 50.57
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 49.4,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 33.14
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 44.6,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 303.71
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 44.2,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 29.68
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Mistral Medium 3.1 (05/2025)",
            "company": "Mistral",
            "accuracy": 42.3,
            "cost_input": "$0.40",
            "cost_output": "$2.00",
            "latency": 65.95
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Opus 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 41.3,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 37.03
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 39.6,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 161.08
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 38.5,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 23.4
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 2.0 Flash (001)",
            "company": "Google",
            "accuracy": 29.8,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 11.21
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "DeepSeek V3",
            "company": "Deepseek",
            "accuracy": 27.5,
            "cost_input": "$0.90",
            "cost_output": "$0.90",
            "latency": 58.8
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 26.5,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 11.91
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 25.2,
            "cost_input": "$0.27",
            "cost_output": "$0.85",
            "latency": 15.5
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 22.3,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 18.93
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 4 Scout",
            "company": "Meta",
            "accuracy": 19.0,
            "cost_input": "$0.18",
            "cost_output": "$0.59",
            "latency": 21.69
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 1.5 Pro (002)",
            "company": "Google",
            "accuracy": 18.7,
            "cost_input": "$1.25",
            "cost_output": "$5.00",
            "latency": 10.64
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Gemini 1.5 Flash (002)",
            "company": "Google",
            "accuracy": 17.3,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 5.7
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 3.3 Instruct Turbo (70B)",
            "company": "Meta",
            "accuracy": 16.0,
            "cost_input": "$0.88",
            "cost_output": "$0.88",
            "latency": 11.24
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Grok 2",
            "company": "xAI",
            "accuracy": 15.2,
            "cost_input": "$2.00",
            "cost_output": "$10.00",
            "latency": 57.88
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 14.0,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 68.37
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 13.3,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 23.35
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 11.9,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 15.78
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "GPT 4o Mini",
            "company": "Openai",
            "accuracy": 11.5,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 28.77
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude 3.5 Sonnet Latest",
            "company": "Anthropic",
            "accuracy": 10.0,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 9.19
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Llama 3.3 Nemotron Super (Nonthinking)",
            "company": "Nvidia",
            "accuracy": 9.4,
            "cost_input": "N/A",
            "cost_output": "N/A",
            "latency": 15.85
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Mistral Large (11/2024)",
            "company": "Mistral",
            "accuracy": 9.2,
            "cost_input": "$2.00",
            "cost_output": "$6.00",
            "latency": 19.68
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Mistral Small (02/2024)",
            "company": "Mistral",
            "accuracy": 5.6,
            "cost_input": "$0.20",
            "cost_output": "$0.60",
            "latency": 13.23
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Mistral Small 3.1 (03/2025)",
            "company": "Mistral",
            "accuracy": 3.5,
            "cost_input": "$0.07",
            "cost_output": "$0.30",
            "latency": 11.68
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Claude 3.5 Haiku Latest",
            "company": "Anthropic",
            "accuracy": 3.3,
            "cost_input": "$1.00",
            "cost_output": "$5.00",
            "latency": 9.05
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Jamba 1.6 Large",
            "company": "Ai21 Labs",
            "accuracy": 0.4,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 18.86
        },
        {
            "benchmark": "aime-2025-08-12",
            "benchmark_group": "Math",
            "model": "Jamba 1.6 Mini",
            "company": "Ai21 Labs",
            "accuracy": 0.4,
            "cost_input": "$0.20",
            "cost_output": "$0.40",
            "latency": 6.62
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 65.0,
            "cost_input": "$1.24",
            "cost_output": "N/A",
            "latency": 426.52
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 58.6,
            "cost_input": "$1.21",
            "cost_output": "N/A",
            "latency": 704.78
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "o3",
            "company": "Openai",
            "accuracy": 49.8,
            "cost_input": "$1.42",
            "cost_output": "N/A",
            "latency": 620.33
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 47.4,
            "cost_input": "$0.45",
            "cost_output": "N/A",
            "latency": 173.98
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "Gemini 2.5 Pro Preview",
            "company": "Google",
            "accuracy": 46.8,
            "cost_input": "$0.88",
            "cost_output": "N/A",
            "latency": 540.96
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 42.0,
            "cost_input": "$1.19",
            "cost_output": "N/A",
            "latency": 123.17
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "Gemini 2.5 Flash Preview (Nonthinking)",
            "company": "Google",
            "accuracy": 35.6,
            "cost_input": "$0.11",
            "cost_output": "N/A",
            "latency": 251.91
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 34.8,
            "cost_input": "$0.13",
            "cost_output": "N/A",
            "latency": 233.12
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "Kimi K2 Instruct",
            "company": "Kimi",
            "accuracy": 34.2,
            "cost_input": "$0.79",
            "cost_output": "N/A",
            "latency": 498.43
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 33.4,
            "cost_input": "$1.54",
            "cost_output": "N/A",
            "latency": 976.81
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 27.2,
            "cost_input": "$1.53",
            "cost_output": "N/A",
            "latency": 197.58
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "Llama 4 Maverick",
            "company": "Meta",
            "accuracy": 18.4,
            "cost_input": "$0.12",
            "cost_output": "N/A",
            "latency": 62.48
        },
        {
            "benchmark": "swebench-2025-07-30",
            "benchmark_group": "Coding",
            "model": "Command A",
            "company": "Cohere",
            "accuracy": 0.2,
            "cost_input": "$0.01",
            "cost_output": "N/A",
            "latency": 5.26
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT 4.1",
            "company": "Openai",
            "accuracy": 78.1,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 35.04
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT 5 Mini",
            "company": "Openai",
            "accuracy": 77.5,
            "cost_input": "$0.25",
            "cost_output": "$2.00",
            "latency": 24.4
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Grok 4",
            "company": "xAI",
            "accuracy": 76.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 40.13
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Grok 3",
            "company": "xAI",
            "accuracy": 75.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 33.66
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT 5",
            "company": "Openai",
            "accuracy": 74.9,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 27.95
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT 4.1 Mini",
            "company": "Openai",
            "accuracy": 74.6,
            "cost_input": "$0.40",
            "cost_output": "$1.60",
            "latency": 21.16
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Claude Sonnet 4 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 74.0,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 16.25
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Claude Sonnet 4 (Thinking)",
            "company": "Anthropic",
            "accuracy": 74.0,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 24.91
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "DeepSeek V3 (03/24/2025)",
            "company": "Deepseek",
            "accuracy": 73.6,
            "cost_input": "$1.20",
            "cost_output": "$1.20",
            "latency": 17.58
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Gemini 2.5 Pro",
            "company": "Google",
            "accuracy": 72.7,
            "cost_input": "$1.25",
            "cost_output": "$10.00",
            "latency": 25.63
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Claude Opus 4.1 (Thinking)",
            "company": "Anthropic",
            "accuracy": 72.3,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 68.28
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Claude Opus 4.1 (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 71.1,
            "cost_input": "$15.00",
            "cost_output": "$75.00",
            "latency": 35.04
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "DeepSeek R1",
            "company": "Deepseek",
            "accuracy": 70.1,
            "cost_input": "$3.00",
            "cost_output": "$8.00",
            "latency": 23.64
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Claude 3.7 Sonnet (Thinking)",
            "company": "Anthropic",
            "accuracy": 70.1,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 42.95
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT 4o (2024-11-20)",
            "company": "Openai",
            "accuracy": 69.8,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 39.81
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "o3",
            "company": "Openai",
            "accuracy": 69.5,
            "cost_input": "$2.00",
            "cost_output": "$8.00",
            "latency": 48.92
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Gemini 2.5 Flash (Nonthinking)",
            "company": "Google",
            "accuracy": 68.2,
            "cost_input": "$0.30",
            "cost_output": "$2.50",
            "latency": 13.17
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT OSS 120B",
            "company": "Openai",
            "accuracy": 66.6,
            "cost_input": "$0.15",
            "cost_output": "$0.60",
            "latency": 16.62
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Claude 3.7 Sonnet (Nonthinking)",
            "company": "Anthropic",
            "accuracy": 66.2,
            "cost_input": "$3.00",
            "cost_output": "$15.00",
            "latency": 14.75
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Grok 3 Mini Fast Low Reasoning",
            "company": "xAI",
            "accuracy": 65.9,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 14.44
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "o4 Mini",
            "company": "Openai",
            "accuracy": 64.0,
            "cost_input": "$1.10",
            "cost_output": "$4.40",
            "latency": 25.84
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "Grok 3 Mini Fast High Reasoning",
            "company": "xAI",
            "accuracy": 64.0,
            "cost_input": "$0.60",
            "cost_output": "$4.00",
            "latency": 21.35
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT 5 Nano",
            "company": "Openai",
            "accuracy": 63.3,
            "cost_input": "$0.05",
            "cost_output": "$0.40",
            "latency": 21.12
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT 4o (2024-08-06)",
            "company": "Openai",
            "accuracy": 62.1,
            "cost_input": "$2.50",
            "cost_output": "$10.00",
            "latency": 34.81
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT OSS 20B",
            "company": "Openai",
            "accuracy": 53.4,
            "cost_input": "$0.05",
            "cost_output": "$0.20",
            "latency": 18.72
        },
        {
            "benchmark": "case_law_v2-08-18-2025",
            "benchmark_group": "Legal",
            "model": "GPT 4.1 Nano",
            "company": "Openai",
            "accuracy": 51.4,
            "cost_input": "$0.10",
            "cost_output": "$0.40",
            "latency": 12.39
        }
    ]
}