[ { "benchmark": "swt-bench", "score": 72.5, "metric": "accuracy", "cost_per_instance": 0.35, "average_runtime": 140.0, "full_archive": "https://results.eval.all-hands.dev/swtbench/litellm_proxy-openai-gpt-5-4/1778132377/results.tar.gz", "tags": [ "swt-bench" ], "agent_version": "v1.16.1", "acp_agent_name": "codex-acp", "acp_agent_version": "v0.11.1", "submission_time": "2026-04-05T11:29:02+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/add01f5c-e387-4085-8cc2-73c895b561dd" }, { "benchmark": "swe-bench", "score": 73.8, "metric": "accuracy", "cost_per_instance": 0.4, "average_runtime": 171.0, "full_archive": "https://results.eval.all-hands.dev/swebench/litellm_proxy-openai-gpt-5-4/1778132370/results.tar.gz", "tags": [ "swe-bench" ], "agent_version": "v1.16.1", "acp_agent_name": "codex-acp", "acp_agent_version": "v0.11.1", "submission_time": "2026-04-04T19:00:29+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/8390b9bc-8bdc-4a55-b6f8-5a9a32d73cb3" }, { "benchmark": "gaia", "score": 84.8, "metric": "accuracy", "cost_per_instance": 0.22, "average_runtime": 101.0, "full_archive": "https://results.eval.all-hands.dev/gaia/litellm_proxy-openai-gpt-5-4/1778132356/results.tar.gz", "tags": [ "gaia" ], "agent_version": "v1.16.1", "acp_agent_name": "codex-acp", "acp_agent_version": "v0.11.1", "submission_time": "2026-04-06T03:48:35+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/45ff00e9-60cd-4375-ab22-1ab486c82250" }, { "benchmark": "commit0", "score": 50.0, "metric": "accuracy", "cost_per_instance": 1.84, "average_runtime": 504.0, "full_archive": "https://results.eval.all-hands.dev/commit0/litellm_proxy-openai-gpt-5-4/1778132349/results.tar.gz", "tags": [ "commit0" ], "agent_version": "v1.18.1", "acp_agent_name": "codex-acp", "acp_agent_version": "v0.11.1", "submission_time": "2026-04-27T16:06:15+00:00", "eval_visualization_page": "https://laminar.sh/shared/evals/dbfadb0f-8788-4b61-bf95-1eea9484a253" }, { "benchmark": "swe-bench-multimodal", "score": 30.9, "metric": "solveable_accuracy", "cost_per_instance": 1.22, "average_runtime": 356.0, "full_archive": "https://results.eval.all-hands.dev/swebenchmultimodal/litellm_proxy-openai-gpt-5-4/1778132363/results.tar.gz", "tags": [ "swe-bench-multimodal" ], "agent_version": "v1.16.1", "acp_agent_name": "codex-acp", "acp_agent_version": "v0.11.1", "submission_time": "2026-04-04T18:14:15+00:00", "component_scores": { "solveable_accuracy": 30.9, "unsolveable_accuracy": 0.0, "combined_accuracy": 20.6 }, "eval_visualization_page": "https://laminar.sh/shared/evals/510f0b23-b51a-4f20-a82d-0c0463ec8a60" } ]