[ { "benchmark": "swe-bench", "score": 76.8, "metric": "accuracy", "cost_per_instance": 0.77, "average_runtime": 207.0, "full_archive": "https://results.eval.all-hands.dev/swebench/litellm_proxy-anthropic-claude-opus-4-6/23560449752/results.tar.gz", "tags": [ "swe-bench" ], "agent_version": "v1.15.0", "submission_time": "2026-03-26T02:05:02+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/77bb7715-dcff-4718-8e4f-611eae769107" }, { "benchmark": "swt-bench", "score": 78.8, "metric": "accuracy", "cost_per_instance": 0.43, "average_runtime": 138.0, "full_archive": "https://results.eval.all-hands.dev/swtbench/litellm_proxy-anthropic-claude-opus-4-6/21754233398/results.tar.gz", "tags": [ "swt-bench" ], "agent_version": "v1.11.0", "submission_time": "2026-02-06T21:30:23+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/cbf7f857-21d0-4526-b830-25d4e88ab0ed" }, { "benchmark": "swe-bench-multimodal", "score": 41.8, "metric": "solveable_accuracy", "cost_per_instance": 2.37, "average_runtime": 602.0, "full_archive": "https://results.eval.all-hands.dev/swebenchmultimodal/litellm_proxy-anthropic-claude-opus-4-6/21767110679/results.tar.gz", "tags": [ "swe-bench-multimodal" ], "agent_version": "v1.11.0", "submission_time": "2026-02-07T01:54:03+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/0fe00101-1f0a-4698-8a43-70fd665f4936", "component_scores": { "solveable_accuracy": 41.8, "unsolveable_accuracy": 3.0, "combined_accuracy": 29.0, "solveable_resolved": 28, "solveable_total": 67, "unsolveable_resolved": 1, "unsolveable_total": 33 } }, { "benchmark": "gaia", "score": 80.0, "metric": "accuracy", "cost_per_instance": 0.44, "average_runtime": 526.0, "full_archive": "https://results.eval.all-hands.dev/gaia/litellm_proxy-anthropic-claude-opus-4-6/21767030214/results.tar.gz", "tags": [ "gaia" ], "agent_version": "v1.11.0", "submission_time": "2026-02-07T02:55:06+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/09286fea-ded4-4f32-a8b1-97e3a5847b83" }, { "benchmark": "commit0", "score": 56.2, "metric": "accuracy", "cost_per_instance": 7.69, "average_runtime": 1030.0, "full_archive": "https://results.eval.all-hands.dev/commit0/litellm_proxy-anthropic-claude-opus-4-6/24745405298/results.tar.gz", "tags": [ "commit0" ], "agent_version": "v1.17.0", "submission_time": "2026-04-22T02:16:04+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/8a792401-ab45-48b6-8398-99a1a7534f69" } ]