{ "schema_version": 1, "experiment_id": "harnessbench-v2-official-2026-05-04c", "matrix_id": "harnessbench-v2-official-2026-05-04c", "created_at": "2026-05-06T20:11:06.639Z", "started_at": "2026-05-04T10:18:18.111Z", "finished_at": "2026-05-06T12:31:27.489Z", "success": true, "runner": { "git_commit": "2d110db6c6a4b1aa512e27dafb020f63ce0259b0", "git_status_short": "M benchmark/cases/go-gitea__gitea/hidden-tests/mid/core.sh\n M benchmark/cases/go-gitea__gitea/hidden-tests/mid/regression.sh\n M benchmark/cases/jesseduffield__lazygit/hidden-tests/high/core.sh\n M benchmark/cases/jesseduffield__lazygit/hidden-tests/high/regression.sh\n M benchmark/cases/langflow-ai__langflow/mid.yaml\n M benchmark/cases/louislam__uptime-kuma/hidden-tests/low/core.sh\n M benchmark/cases/louislam__uptime-kuma/hidden-tests/low/regression.sh\n M benchmark/cases/louislam__uptime-kuma/hidden-tests/mid/regression.sh\n M benchmark/cases/louislam__uptime-kuma/high.yaml\n M benchmark/cases/vitejs__vite/hidden-tests/high/core.sh\n M benchmark/cases/vitejs__vite/low.yaml\n M benchmark/reports/index.html\n M scripts/regrade-agent-results.mjs\n M scripts/render-results.mjs\n M scripts/review-failed-runs.mjs", "scripts": { "scripts/run-case.mjs": { "sha256": "abf463d8ede276d758fc21f26f51a96c65763c47b2bd77780f13da819bb951f8" }, "scripts/run-matrix.mjs": { "sha256": "c26052056dea2215a214691b44f627ebc5c6ff2cabf995f98aab6a8a563c4a31" }, "scripts/resume-agent-matrix.mjs": { "sha256": "807a2e518fcb2c43f9438c0e3ae55ac95af6738a52e98a62ef3338545dfcefb4" }, "scripts/render-results.mjs": { "sha256": "eaae230bc750cbcd32f9267f483ae9e6a06ebf9b8e5b1bc06bac74e67ddafaf7" }, "scripts/render-experiment-index.mjs": { "sha256": "0d50c9b12011aa90d5b87e2a86e0df54552112ceb6a4fcd5efac63b8cad325dc" }, "scripts/review-failed-runs.mjs": { "sha256": "2b24211c2e3646654cfa47d1aef769e6d8daaaa1f665ba3762dab4b85d03db6e" }, "scripts/regrade-agent-results.mjs": { "sha256": "01fd5c4b4834e58ebdbc628fc2d06c664341f2c2582b64d6cbd3153318c7feb4" } } }, "scoring_revision": { "date": "2026-05-07", "summary": "Regraded preserved workspaces after hidden-test adequacy and instruction-contract fixes. Scoring remains two-layer core_and_regression; all final auxiliary failure reviews classify remaining failures as true_failure." }, "inputs": { "case_count": 27, "cases": [ { "id": "axios-axios-high-http-connect-timeout", "repo": "axios/axios", "difficulty": "high", "size_bucket": "small", "case_path": "benchmark/cases/axios__axios/high.yaml", "base_commit": "2a51828213128691d2e37502b5eb2cf4965a737d", "fixed_commit": "ad68e1a484b50086af427f767bbd7d6e3aab7ac3" }, { "id": "axios-axios-low-settle-error-code", "repo": "axios/axios", "difficulty": "low", "size_bucket": "small", "case_path": "benchmark/cases/axios__axios/low.yaml", "base_commit": "e57349992f230b6b13e80613eb84302560aa5ba8", "fixed_commit": "5107ee69aee527b19eabaf80000ca65752135435" }, { "id": "axios-axios-mid-fetch-global-access", "repo": "axios/axios", "difficulty": "mid", "size_bucket": "small", "case_path": "benchmark/cases/axios__axios/mid.yaml", "base_commit": "ad68e1a484b50086af427f767bbd7d6e3aab7ac3", "fixed_commit": "e57349992f230b6b13e80613eb84302560aa5ba8" }, { "id": "fastapi-fastapi-high-pydantic-json-fast-path", "repo": "fastapi/fastapi", "difficulty": "high", "size_bucket": "large", "case_path": "benchmark/cases/fastapi__fastapi/high.yaml", "base_commit": "1e78a36b7310003f0ff634627d8a7bc53c6ccdee", "fixed_commit": "590a5e535587cc07041ba12d308c748433ccb168" }, { "id": "fastapi-fastapi-low-remove-vibe-decorator", "repo": "fastapi/fastapi", "difficulty": "low", "size_bucket": "large", "case_path": "benchmark/cases/fastapi__fastapi/low.yaml", "base_commit": "70580da818722cce68b7a88928d67bd0f64f42c5", "fixed_commit": "ae4e45c5cc20a1e1503fbcab2369821d188feb09" }, { "id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "repo": "fastapi/fastapi", "difficulty": "mid", "size_bucket": "large", "case_path": "benchmark/cases/fastapi__fastapi/mid.yaml", "base_commit": "cbd64b09a32681d3b0ea097608bc62eb0d1587e0", "fixed_commit": "7815a32f2ed177b8b786a48b3e0712c05b5c644f" }, { "id": "go-gitea-gitea-high-compare-no-common-history", "repo": "go-gitea/gitea", "difficulty": "high", "size_bucket": "large", "case_path": "benchmark/cases/go-gitea__gitea/high.yaml", "base_commit": "fedc9dc993f0f14866fa4dcbe57c6ba8d90a180e", "fixed_commit": "deec2b0929c5a7badd74df8bcb767a8cce51e33f" }, { "id": "go-gitea-gitea-low-schedule-null-payload", "repo": "go-gitea/gitea", "difficulty": "low", "size_bucket": "large", "case_path": "benchmark/cases/go-gitea__gitea/low.yaml", "base_commit": "78899832eb806bb3ee2e536115af2e5ef0094706", "fixed_commit": "c4a1ff7d1665a8eecc488b8edec6aebc946ed062" }, { "id": "go-gitea-gitea-mid-pr-merge-self-reference", "repo": "go-gitea/gitea", "difficulty": "mid", "size_bucket": "large", "case_path": "benchmark/cases/go-gitea__gitea/mid.yaml", "base_commit": "2d1306291b63a95725c281118340ce2dabd71c8c", "fixed_commit": "a16ca3c57cc1fc05a1839bb604a82abcc62052c5" }, { "id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "repo": "jesseduffield/lazygit", "difficulty": "high", "size_bucket": "small", "case_path": "benchmark/cases/jesseduffield__lazygit/high.yaml", "base_commit": "01b617ea184f9fbbdc7a9dad1bad3977c2ecebb7", "fixed_commit": "b4f79851dba1337e72b6b2f0180239c9d59d4f8f" }, { "id": "jesseduffield-lazygit-low-github-owner-casing", "repo": "jesseduffield/lazygit", "difficulty": "low", "size_bucket": "small", "case_path": "benchmark/cases/jesseduffield__lazygit/low.yaml", "base_commit": "8f258a3650cef809b911df24881712bc6b5d96bd", "fixed_commit": "38dd035e289dd71ad16fb0caa34525ad03460d21" }, { "id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "repo": "jesseduffield/lazygit", "difficulty": "mid", "size_bucket": "small", "case_path": "benchmark/cases/jesseduffield__lazygit/mid.yaml", "base_commit": "2ad6e1a90497ba6ad520176beed6c99a89fd64ba", "fixed_commit": "0ecf818a739606d4255e009b66309e6e508ce08a" }, { "id": "langflow-ai-langflow-high-lfx-stream-fallback", "repo": "langflow-ai/langflow", "difficulty": "high", "size_bucket": "large", "case_path": "benchmark/cases/langflow-ai__langflow/high.yaml", "base_commit": "3f2d2fc39920149fd5552b8903a49734f3ea9c59", "fixed_commit": "784169cee7adb6ea20c062f5ac2dc735cdd26527" }, { "id": "langflow-ai-langflow-low-loguru-file-routing", "repo": "langflow-ai/langflow", "difficulty": "low", "size_bucket": "large", "case_path": "benchmark/cases/langflow-ai__langflow/low.yaml", "base_commit": "9b11f5a383c77847f3d200b171f1bd76cd548326", "fixed_commit": "d68b312421fc9bd6b24670ffe6da6e83ecd241b3" }, { "id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "repo": "langflow-ai/langflow", "difficulty": "mid", "size_bucket": "large", "case_path": "benchmark/cases/langflow-ai__langflow/mid.yaml", "base_commit": "1a9ffa05c0b9d38eb866ff81c9470c545ecc6d13", "fixed_commit": "d1b73e9f18549c76e27bda1b6daf1627ec701049" }, { "id": "louislam-uptime-kuma-high-websocket-auth-options", "repo": "louislam/uptime-kuma", "difficulty": "high", "size_bucket": "medium", "case_path": "benchmark/cases/louislam__uptime-kuma/high.yaml", "base_commit": "7d7f12b5b18274e385d3b6416b6dd7f6aa212fe4", "fixed_commit": "2f45b46315bf8dae4e906dec41a341765c59a3f0" }, { "id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "repo": "louislam/uptime-kuma", "difficulty": "low", "size_bucket": "medium", "case_path": "benchmark/cases/louislam__uptime-kuma/low.yaml", "base_commit": "aa40ffdf2348cda83ce0e5b9e9c8ad051de52bdf", "fixed_commit": "9b28ddd92358c0f85af5dbbd70e2c726fd851fd7" }, { "id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "repo": "louislam/uptime-kuma", "difficulty": "mid", "size_bucket": "medium", "case_path": "benchmark/cases/louislam__uptime-kuma/mid.yaml", "base_commit": "7136dd7832971beb5691d09e86b28a0fcb5fa0fb", "fixed_commit": "07d28d818180796123b7876c161c2712d80cd0d1" }, { "id": "sharkdp-bat-high-fallback-syntax", "repo": "sharkdp/bat", "difficulty": "high", "size_bucket": "small", "case_path": "benchmark/cases/sharkdp__bat/high.yaml", "base_commit": "ab80bd9717448d988445841fc9634e7d7c2f8cf6", "fixed_commit": "844bfded506e99c06237472bd83a8af5af433538" }, { "id": "sharkdp-bat-low-zip-binary-detection", "repo": "sharkdp/bat", "difficulty": "low", "size_bucket": "small", "case_path": "benchmark/cases/sharkdp__bat/low.yaml", "base_commit": "111aa2e10e7a7f0dcf43c209e643593ac013d623", "fixed_commit": "a995764d230e49e089febd8e5487c1d61e5d3051" }, { "id": "sharkdp-bat-mid-control-character-wrapping", "repo": "sharkdp/bat", "difficulty": "mid", "size_bucket": "small", "case_path": "benchmark/cases/sharkdp__bat/mid.yaml", "base_commit": "4a38eab3eaa191a6a19239bde42f7d5b79d7cb21", "fixed_commit": "f2daa1eb6e2bb21366d856ca9e160698a69fd8eb" }, { "id": "usememos-memos-high-missing-related-users", "repo": "usememos/memos", "difficulty": "high", "size_bucket": "medium", "case_path": "benchmark/cases/usememos__memos/high.yaml", "base_commit": "87d411bc70abcdfe1fd2ab32395c473b955866a2", "fixed_commit": "25feef3aadd34bfd474d7e3b685815a6509bc4c6" }, { "id": "usememos-memos-low-omit-internal-user-settings", "repo": "usememos/memos", "difficulty": "low", "size_bucket": "medium", "case_path": "benchmark/cases/usememos__memos/low.yaml", "base_commit": "0bc56694b0ca347ab1eb083f62997a22007b763d", "fixed_commit": "1df3fe79559ccf94b6c71e9ffb58e870ed43820d" }, { "id": "usememos-memos-mid-mixed-case-user-resource-names", "repo": "usememos/memos", "difficulty": "mid", "size_bucket": "medium", "case_path": "benchmark/cases/usememos__memos/mid.yaml", "base_commit": "1fd6a2a385175399fb515d50d44bee4cda30b40c", "fixed_commit": "01be01f4b7676af41bdd1758b1e9b096aa922546" }, { "id": "vitejs-vite-high-hmr-patch-esm-sentinel", "repo": "vitejs/vite", "difficulty": "high", "size_bucket": "large", "case_path": "benchmark/cases/vitejs__vite/high.yaml", "base_commit": "3ec9cdaac7936ca32d0956c4cb1eb6e172945996", "fixed_commit": "868f1411a6f474baa4417f2d6524692dd452f760" }, { "id": "vitejs-vite-low-flatten-id-sanitized-chars", "repo": "vitejs/vite", "difficulty": "low", "size_bucket": "large", "case_path": "benchmark/cases/vitejs__vite/low.yaml", "base_commit": "a07a4bd052ac75f916391c999c408ad5f2867e61", "fixed_commit": "3f24533ac4845ed22547279d1721bd82a35345e3" }, { "id": "vitejs-vite-mid-deno-workspace-root", "repo": "vitejs/vite", "difficulty": "mid", "size_bucket": "large", "case_path": "benchmark/cases/vitejs__vite/mid.yaml", "base_commit": "bb5203d01c24cf89b4f497ee968ce6f63876b946", "fixed_commit": "1b793c0e1726467fffd06ffad9bc81c61a840188" } ] }, "harness_versions": { "codex": { "versions": { "codex-cli 0.128.0": { "count": 81, "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "binary_path": "/.local/bin/codex", "raw_version_output": "codex-cli 0.128.0\n" } }, "first_seen_at": "2026-05-04T09:34:28.535Z", "last_seen_at": "2026-05-05T09:22:44.390Z" }, "cursor": { "versions": { "2026.05.01-eea359f": { "count": 216, "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "binary_path": "/.local/bin/agent", "raw_version_output": "2026.05.01-eea359f\n" } }, "first_seen_at": "2026-05-04T09:34:27.948Z", "last_seen_at": "2026-05-05T09:22:43.749Z" }, "claude": { "versions": { "2.1.126 (Claude Code)": { "count": 81, "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "binary_path": "/.local/bin/claude", "raw_version_output": "2.1.126 (Claude Code)\n" } }, "first_seen_at": "2026-05-04T09:34:28.577Z", "last_seen_at": "2026-05-05T09:22:44.430Z" } }, "artifacts": { "results_html": "results.html", "summary_json": "summary.json", "failure_reviews_json": "failure-reviews.json" }, "runs": [ { "run_id": "2026-05-04T10-18-18-110Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-18-18-110Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-18-18-110Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "e486b01610e9644834146e48034aca569e4ce9dcf67052d5c013c39c9dbd5ef0", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-18-18-110Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "eadc263d415b10b60b7451cb668e0e176cf2f799387414446117396229938385", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-18-18-110Z-axios-axios-high-http-connect-timeout-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-18-18-110Z-axios-axios-high-http-connect-timeout-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-18-18-110Z-axios-axios-high-http-connect-timeout-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "2d159d940e37e3e305c621a3f7c23fa96e6a874b9b4a1c897e390bf48d89827a", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-18-18-110Z-axios-axios-high-http-connect-timeout-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b8856320f06954e1835276db7564bec37a9680289bc2343d3707b568d4ce8d7f", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-18-18-111Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-18-18-111Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-18-18-111Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "3363b72936b9aaa873e2d81ffb75414ac00035068d072501a797348c480d24c4", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-18-18-111Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "82f593791b3f373292bd9647d6ab34ce26c5c214e8e90cc12b88604d00ef6905", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-25-03-421Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-25-03-421Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-25-03-421Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "bd05801ea6067c5a4fd01c01ffa1d41d3976e45b7e7774f6ab26af8a61b9d5c0", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-25-03-421Z-axios-axios-high-http-connect-timeout-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6d013d3a062e9c0022bd53d9e7a4eba316364a77e86be30a7325f42071041110", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-29-32-210Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-29-32-210Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-29-32-210Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "22cbacef900a92d57f317ecc9d6bd805793d163501a0c5e253d19139c0d8f166", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-29-32-210Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "88e2b5538db7d6f2e86ed9f2450c66049041278d549b542b55b3db6715110b89", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-35-12-823Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-35-12-823Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-35-12-823Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "9a2eb63e38c145e0adb2c25cde69784ed8c334a45742bf369c45dd4e6b93c94e", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-35-12-823Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "76903a74c7221890193a44ad79b0c46901a38b1411263e3526df5acb556d5c1c", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-39-55-618Z-axios-axios-high-http-connect-timeout-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-39-55-618Z-axios-axios-high-http-connect-timeout-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-39-55-618Z-axios-axios-high-http-connect-timeout-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "b57b1e99e8d6fde65f35b8d90f95654c072922e1bed4604c8306052d61f90a4e", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-39-55-618Z-axios-axios-high-http-connect-timeout-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "caad17284d6bd3ced8c6a07e71d4f139ba9af600d028c12d77d41c9a80209bb9", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-42-48-543Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-42-48-543Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-42-48-543Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "83dc6781d61dd1132ae30268215f5469a75a400c9af8da7259fe15ad05cfc2ab", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-42-48-543Z-axios-axios-high-http-connect-timeout-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3dc0dcf3f9a2cfa84e568d3848420bc669648392d0d1cb7d1fbe1466417c851b", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-46-05-753Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-46-05-753Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-46-05-753Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "236f9140dd4beaba8a7b532db647db461606814fca76148db79046bca20fef22", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-46-05-753Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5f93466356a3dab2f64a65d51754acb0c5673d5768385d64b704aa19414e8370", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-49-02-165Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-49-02-165Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-49-02-165Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "06afd2239c19130c304700a46aad821312d82fb935a48214519e504ae9c07fbf", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-49-02-165Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "92ce843e553605669945054f981fe7a0f2c132c5b4968294a24f22d04b3393aa", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-50-37-150Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-50-37-150Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-50-37-150Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "a0748f2cc9bc8f0fbec4b0c1f4d3d6cd2b468dac483cddd2defa8c93a1a83150", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-50-37-150Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0c583b5500f80fe5ba560433cc2e770993858ea919db1c4a0b35d97b0486204b", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-51-07-419Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-51-07-419Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-51-07-419Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "da0e8ef83e416ceb1b38d764f228704ee422eef9846f0b26a24d05f8863372fc", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-51-07-419Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "66b332e022d9685885bc2aa5ce59e286883cd5d27569e22af1231435a3269a5c", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-51-12-338Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-51-12-338Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-51-12-338Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "9a620b7feacf7191920292a7dd67a0442dd2089d4862baac39721bacc0bb2505", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-51-12-338Z-axios-axios-low-settle-error-code-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e37b65f43ce058778e05c0d7935a8677fceaf87e9d5ec67bae33ecd3a33f3c08", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-54-00-119Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-54-00-119Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-54-00-119Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "b4c0ee1261dadce37d298ed21ca6402af332d999309d24aeae21f906e32172da", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-54-00-119Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d83e7df7c8e3027acf1dfde6b7b8f724721b2e17678fa5aaf9722bc54ddc688a", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-54-48-557Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-54-48-557Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-54-48-557Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "0c526874ce128667bce22b55604e4b123c42aa7a0b5a02ef5047c76c8d085f5a", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-54-48-557Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "67d626fd3a14fe1f44faac87144a13f8a377487255b10098097f5bce87714c3d", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-56-48-674Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-56-48-674Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-56-48-674Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "bc7213696f0b417893f60264aec0acd6603bcf7c56263e0ef6593e2388424a89", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-56-48-674Z-axios-axios-high-http-connect-timeout-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "cbf8938f7564af10cec1eecc5fc56788d982f3a4cb3bc5f4c7ae3e99fb65171f", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-58-05-528Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-58-05-528Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-58-05-528Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "f158a0f224ad10d81f91482b0d50ad76c8de2b2676897bcf42766d2f07e66fde", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-58-05-528Z-axios-axios-low-settle-error-code-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5f6c735eeaa92d0d1ad16805bc37fe4029a2beb4515a885fb532779205b1b9f7", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T10-58-44-094Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T10-58-44-094Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T10-58-44-094Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "b40eb76c541eab7cfcd7dc5fe0ddbfd964b959bcc13882e2aca69fcdfb6b3ef8", "prompt_bundle_path": "/benchmark/runs/2026-05-04T10-58-44-094Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "04d4f0c87e7f704ceab1bb3f15bcaa8f061cb366af2c8d8cab200b4bb50c016d", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-03-01-316Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-03-01-316Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-03-01-316Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "1acc3fbccbf018d2448a4a60d1ddd023338b016de11fddd2f63dcb40badee4d5", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-03-01-316Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6ef4f80710fbc3e8817006e2a6212c50a95b83eee7aa51850001b2eef6ff03fb", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-03-42-368Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-03-42-368Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-03-42-368Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "b49585eab2f13116ecfb5f6e2caac0203551d47cbfdf0bdd9f99a04693beda56", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-03-42-368Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "56ae98fd16fd47eff7593e366751d7ffeb28a9ee310f66e664cc8a05bfc7fe09", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-05-43-972Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-05-43-972Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-05-43-972Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "a90ba5641249577a185283b8ae2c459897425bf0d68d47114ec48a2854e58599", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-05-43-972Z-axios-axios-mid-fetch-global-access-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "45136fd498bd01ac6e431ac7953dbe030357b493f4f9a8d49e3c730270ab4ced", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-08-27-212Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-08-27-212Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-08-27-212Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "d0cbb4050638b8596bfa207cc585727adba5f9607d275b2ffebf7563189a117f", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-08-27-212Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "db85afa7c1aba130bfb0f7bcd5d3d4aeb306e1427fcc836aa2c019f54a15fc09", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-10-57-347Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-10-57-347Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-10-57-347Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "b7c1ea97b2a4b22c6e115499d033c5f5901ac74996c6b162c442c4e3ff63e90b", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-10-57-347Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b242baf1d7a9c46102fb863995532bef52f78aeb1595651916a6b9c50890a508", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-16-30-063Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-16-30-063Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-16-30-063Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "1fc55ca7009c81701944dddde47079570f94ef2a8e72ef75eacccd7ce0d59b5f", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-16-30-063Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "142d9d50c2107d6983da8896f78c4964ca72d53d77f5b73428cac1a581c3ea3d", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-19-22-668Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-19-22-668Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-19-22-668Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "1b22ce324d82f7452e8a5969668732cf467f0dc4369a485738dd0d439eebfaec", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-19-22-668Z-axios-axios-mid-fetch-global-access-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "53365912afb0712cf1bb12cfec5e049b0746608898d96f517f0a16c4b56468df", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-21-42-457Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-high-http-connect-timeout", "case_path": "/benchmark/cases/axios__axios/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-21-42-457Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-21-42-457Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "4a0fbe4b94f4dbd7378a2dfb223f2111c20c43bc21fff0376a551f0323902c54", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-21-42-457Z-axios-axios-high-http-connect-timeout-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a087061def564f89dca3b5e3a0cdca50491ba3830b64c03e5f2e505b64747672", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-21-43-251Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-21-43-251Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-21-43-251Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "0700bda27a692f8dbe4c2aed1bed5c45f2811264205a42eef87ae8cb7ac23859", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-21-43-251Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a5db3401b0b71bd1f479171cae69b696f3adf035c3aa6aeba5b53176cef57d7a", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-26-10-682Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-26-10-682Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-26-10-682Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "e060481fa98c90e42113626c7c3bc9eef7a23a0583b6f46d9dfd54d2c157c51a", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-26-10-682Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7c9ed425d72c486c7933c8647a5a92e6d3b49962d99c1bcd1b7f2e74459eb23c", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-39-40-893Z-axios-axios-low-settle-error-code-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-39-40-893Z-axios-axios-low-settle-error-code-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-39-40-893Z-axios-axios-low-settle-error-code-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "5a7830306fe024159dac2dfc8d86b95539e2440254a63a141379beabc2513053", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-39-40-893Z-axios-axios-low-settle-error-code-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e33a8efcce04a074ef87590ef96e083e7f1aaa2a591ef715842c0c8044b07717", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-40-17-580Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-40-17-580Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-40-17-580Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "043dfbb8723bd2b22567bd10d47dc0aa3e2fe12734a1ee4d514145c09ba9b98d", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-40-17-580Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "fe439be132955f87ae0c0a079f7d08ff266f963caa0f657b6b66d40fd5ba4f64", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-41-27-498Z-axios-axios-low-settle-error-code-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-41-27-498Z-axios-axios-low-settle-error-code-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-41-27-498Z-axios-axios-low-settle-error-code-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "55fb8fb1212ef4f4180ce392d56d51f8e0a8e4caa3236edc640f8a59e1527b26", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-41-27-498Z-axios-axios-low-settle-error-code-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "eb3ca0af262d28ceb66122990942acd97235a3fe22e3e857a7f288b559c4d6b5", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-42-42-250Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-42-42-250Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-42-42-250Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "63a9de377a98b17512fc8dc0782c4520d75debed54b07cb130a8faefb692099a", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-42-42-250Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1998766d31eb6a5c91ce3c70aadf4b24732a6026c86aaa44c3c42f8e9962c353", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-43-59-445Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-43-59-445Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-43-59-445Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "2db265ec6ee21c507245b8ff39f5b5103f6280045468397cd4a1050c12b32e8c", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-43-59-445Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "55b555545954e653ddb7b8bc9b7ae77a966ec1959a143395a6881fea668671fd", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-46-09-752Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-46-09-752Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-46-09-752Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "0ab2c51bd3acd4abfc57a321bdee638fcaded89663e516389e322c9056bacadc", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-46-09-752Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c5cbc780ed9d6f3e1e718d723aea2bb1cdbb527da5c61b0d985485532aa8c622", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-48-48-983Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-48-48-983Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-48-48-983Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "8c2e7846b2b8b8ff9f055fce83356f4317b372d1177f01d5c3fcc3a28d8d67f4", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-48-48-983Z-axios-axios-low-settle-error-code-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "709fda862b25f9129ea2755e869dc5f58f91a691bd250a8430d66d65942da0ea", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-49-21-251Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-49-21-251Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-49-21-251Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "fb13c9b6fd628638e91117ebfeacaad548249b9e6df3a0964be53417d80d527d", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-49-21-251Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "931a0a3e61a3cc40ccd4cd36f3dbeeec574721ad1fb6729ec2e669732cbbbe71", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-51-15-806Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-51-15-806Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-51-15-806Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "02a44e9fd3d349c56ff0679b9f7eca1c273026b0643218e10bf6bedcaee49be7", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-51-15-806Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "421151057745a548034728f0970ba1f5507024fd1892d57de602aff0daaf9761", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-54-07-990Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-54-07-990Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-54-07-990Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "c8450af7309cfbb0b2272980cd379df3590773b9e411355dd97abd1668e3100f", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-54-07-990Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0b8637393d2fcaf53df0272bb53051efb4984203c5128b5a8ed26b56b584dedd", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-58-25-374Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-58-25-374Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-58-25-374Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "ac8de18088d1fa1a5fc6798af889a418c62920fc6c86606906e21c40cb4db40c", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-58-25-374Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c90ca8dca4c371070ae76eddba278ab83fa23cf32edc2a3898deb0c09813e2ee", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-58-29-110Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-low-settle-error-code", "case_path": "/benchmark/cases/axios__axios/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-58-29-110Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-58-29-110Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "54a5dfc3364dfab9e67b66e413197390ba4b17016487084ddb52fd9b5efdd0e4", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-58-29-110Z-axios-axios-low-settle-error-code-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a26dbd1545b951231c017eada5a4d9deae36f8805bf761a1b7f900d594993d30", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T11-59-32-793Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T11-59-32-793Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T11-59-32-793Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "3cc1214f2e19ff349878963cb03370f21c53227df7b63daf91dc15a0801b9d0e", "prompt_bundle_path": "/benchmark/runs/2026-05-04T11-59-32-793Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "cec86e7effc14345c3f3f3b31fda230e857e6a6c88b6f98c45eeb94aa743f016", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-04-54-802Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-04-54-802Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-04-54-802Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "27efb16843e3cb21061093508cc210532b0961811b5740f545d9d08789a48843", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-04-54-802Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f62db898911277e488ec83881d9bd86a33e7cfb6ab1650e97af4b3e710b07494", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-06-03-070Z-axios-axios-mid-fetch-global-access-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-06-03-070Z-axios-axios-mid-fetch-global-access-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-06-03-070Z-axios-axios-mid-fetch-global-access-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "48032ba87614c23d6e290022f2990aac559aedd615b2561dcd2da129060eb320", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-06-03-070Z-axios-axios-mid-fetch-global-access-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3e712192058dab5ed9956468a58b318678998d70280f7760d63ca0e02e516b79", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-08-30-837Z-axios-axios-mid-fetch-global-access-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-08-30-837Z-axios-axios-mid-fetch-global-access-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-08-30-837Z-axios-axios-mid-fetch-global-access-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "99bdc1ec42fcfd08843218233352528a3616d3172dff8ed14e7d65f310b8fed2", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-08-30-837Z-axios-axios-mid-fetch-global-access-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "70d611bb39d8fd41a3b8aac65c549a7b950d1bbfc5f0e081b69d43d12f785d19", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-10-12-390Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-10-12-390Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-10-12-390Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "e259cb4c685b4da3757e33a1ef7f756a7b1cacdcf3f7e77b4cd2ccbff65b0a96", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-10-12-390Z-fastapi-fastapi-low-remove-vibe-decorator-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "2843c023168deed366e63848d1242544682579f8e8b5eb3cbf40305d230d17dc", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-13-49-210Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-13-49-210Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-13-49-210Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "a5cf8ad667c65adb48f5b5cf400abfa02d2c5d33d8ff33d65e07b600d60186d7", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-13-49-210Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "bb660ec21577f02faaca86b3fd36d85f73e01ec55a687da70bf7aa74afc149b1", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-16-21-589Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-16-21-589Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-16-21-589Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "5360f73c66b4275cc05e75acd0ff6f2d5612f5d8cafbe4b23cd0b5e7ec34e6a5", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-16-21-589Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a575858c5368b66139f7c083abb568aa3949267928bc9c0a91b8e935dd624b70", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-17-38-348Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-17-38-348Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-17-38-348Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "22fdbd478f1ca59cbd28e6e2598d512c661f8ca585ab8fe70ab1d544f54f01e3", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-17-38-348Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "85b288dc3933131c12b420b1ab0d3fa7f5c71dc4c3c0dfa09edadd4bcc1c2bcd", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-20-10-425Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-20-10-425Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-20-10-425Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "0fe46e5d6a2037093cb4483ed959ab8fdea42d33979348fe4528e1ed5143b06b", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-20-10-425Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9020befde553c6e5325126131bee7db572034ca89a8c742f3d2d051c4cc304c2", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-23-34-597Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-23-34-597Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-23-34-597Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "0919929782184638410a239002018fa8bc037147d099923bc5fa1dd90d4f03af", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-23-34-597Z-fastapi-fastapi-low-remove-vibe-decorator-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f6fdb40f535729d57a400370038eed509ae86426e9655acba3c9410609436fc6", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-24-16-280Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-24-16-280Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-24-16-280Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "aeffd413d951ce3b42735b8a1b41f8f4cdc5b4586b2711c81b02c5224476c8f7", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-24-16-280Z-axios-axios-mid-fetch-global-access-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "008428c10812fb623e9fc35d0be8482a9d27e723f3ab58f0320367ed5d504061", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-27-33-202Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-27-33-202Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-27-33-202Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "9989d2fc7beaf3173ace2f3b99eec24fb9a7ae15ed991b0d7210026a6781d91b", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-27-33-202Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f7940d3a8ce15f1c6dcf193eb498bb4531d03b2b742c96bb0d0dcd53d8b613a9", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-30-01-233Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-30-01-233Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-30-01-233Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "a9407f36d9710195f254f7db7bc464677e517e1b0e6371ccb996410a030dcd26", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-30-01-233Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b5225a2699affa5880c4fc1120022d8f62369877f7c178980fe10f6719ff9514", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-36-08-450Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-36-08-450Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-36-08-450Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "b5a087993f8fe0fa53fc0ab11b9566672b9969623fb4c72e4cbae10713df774a", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-36-08-450Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a86de1e2a75c98c59da6616a92437e6a05e83d7b52ba9a2e0315d61f42164757", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-37-20-645Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-37-20-645Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-37-20-645Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "841a619626ac4ac71c8cd02f9cbc7b5c292b514e83a38593bd3daebf8124ec44", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-37-20-645Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "57de7174a8d4104cfe3665ff603f68ac808e9d0f64eb83a711513bc4961cd15f", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-39-58-548Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-39-58-548Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-39-58-548Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "98c8d24eec59bb60d7f7944d121c00c9dc70446e91dd11df8ec463be1213eaff", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-39-58-548Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "aafa52ce0640162f4b7bf9085837f9475894369bafa9b0e4b3d11a94b7e6fb3e", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-45-50-936Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "axios-axios-mid-fetch-global-access", "case_path": "/benchmark/cases/axios__axios/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-45-50-936Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-45-50-936Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "bcd6a1ffaca67e003a4a495867effc64ce9c137aeb9feee0bdfd7b633b85d1ae", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-45-50-936Z-axios-axios-mid-fetch-global-access-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "8404a386e83be720ea3172cc7a9cd491026fcff5d8436876ffab66ccc71ddc6e", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-46-22-801Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-46-22-801Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-46-22-801Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "ff7335329d751c0accf5fdc8962744e317b51bdc75d4991b8f911bd15997b621", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-46-22-801Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c809f4d61ef0a5d1964427b166940d557ace2853ecb2388c58ed83258db021c2", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-51-04-183Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-51-04-183Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-51-04-183Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "c73e667082ae2a6809974781766a7dc4eeac497377e69d64baae12bb562971f3", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-51-04-183Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7b4fcb363f8a37c693592d65d8d76cc7784b1c8a619ca71ffa0d396cee9bd139", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-51-04-324Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-51-04-324Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-51-04-324Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "0a2ab9e2f60c33fe7f7cfb9405b9aa0d1fe1b021c9daa71cb801baeeacb89713", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-51-04-324Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6b29fcb0287c1ef8753431db5f78c6290e9a11b5bb81455dbc6e96088cf3780f", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T12-55-18-791Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T12-55-18-791Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T12-55-18-791Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "2126341cb88fe4231b8d749b45a31ae6d9e42bd88384b273cf3e8c3f4569a143", "prompt_bundle_path": "/benchmark/runs/2026-05-04T12-55-18-791Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c099f35f80993401fa7ed7461d3caa6bfd756a702da265fc5bb7e31eca81e33f", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-06-38-664Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-06-38-664Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-06-38-664Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "8c34a5b38bc631c7fdbed9553b53cacb410f1e9a2d13de0d3866cbd86e2f613a", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-06-38-664Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "48952e18f0af46dee1d2b835dbd3ade4b43aa3d68fa42c3ab6257e0e10f563c4", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-11-30-936Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-11-30-936Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-11-30-936Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "d820a77a4bee183b952933bcde8a7eea7301935274a1fc2861c009d31c107eda", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-11-30-936Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "70c9b72c6f076f256bcb049e2145bdc7518066ef4485533015e45f0ab56ba699", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T13-15-52-887Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-15-52-887Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-15-52-887Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "b2ae9304c6d5890db737c03c8c0718484aab0600a8a819338f3fca70cd069d37", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-15-52-887Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "148f49429944b9aea8ede65d14ff732acd9924cbed955e5a67c1116bdbd876de", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T13-20-47-492Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-20-47-492Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-20-47-492Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "9bdf0f7eaabe7c99403798851d845c4ca80ad6b0746fe34f325435454f16d576", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-20-47-492Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "95709192fd0c6e56507bd2f4f979a419fb1a9b22fb4e4070bfe7a588b6c83856", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-25-53-072Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-25-53-072Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-25-53-072Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "5c710cbe650ce5c473c844fd84109e7638e545fe0d0ad2054b4dd03acd5fb098", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-25-53-072Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "670d3568af74835a68022b8275de348a1195d9c37e9137b3cb57abd9bf5c06d8", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-30-19-823Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-30-19-823Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-30-19-823Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "bb019586f2dff5e4405955afe7276948513ddc34cc7683dec16794aad37cb58c", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-30-19-823Z-go-gitea-gitea-high-compare-no-common-history-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "67addf94c4a06476cc395adcac45682c53c0a5c2bc27c926e565a2e465324bd3", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-31-37-697Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-31-37-697Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-31-37-697Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "72d41a9f15990bff0e4cd5c2a3b822b5c38c8c018be9f80edb5881c3f1f496f0", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-31-37-697Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "4cad0110b026374a81e336fa8159190484a3c9be7f8c5d3ad174a622c0af0f52", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-37-22-735Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-37-22-735Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-37-22-735Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "df11d055fd36b4c4f84431f27692edce4cd4bce090335562258b586a7f422997", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-37-22-735Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0cf9c946acdaae9ce359d0175cb3cf690531fa90224b3a728ebc7116f2e7e54d", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-41-43-382Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-41-43-382Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-41-43-382Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "b03c76f015a00f39969a66640ade0fa50797058a43bd9da4a8b6b8e30da96d19", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-41-43-382Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3bdd0c3d9690f1f6b79adcfa45202eb0d75456a07384a3d8edf37a1ed7b7691c", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-47-38-908Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-47-38-908Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-47-38-908Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "68f93608c939f6a3e774c2b41201072e109f27fce6c450e4db2d430400eacfc3", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-47-38-908Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ff7bd8f123c36dd2470f51781034fa2c142d7416b72f8a82f87b070c2aa77166", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T13-53-15-567Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T13-53-15-567Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T13-53-15-567Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "4760f368448dfe7abf9d830dcadeeb5c4c046e2c715128af4d4ed23d1f072649", "prompt_bundle_path": "/benchmark/runs/2026-05-04T13-53-15-567Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "eeb457cc619fe23f2d3bae9c95f09893fc5200d71c4fa27b3784d8338aa790a0", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T14-06-27-693Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-06-27-693Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-06-27-693Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "8fb7e3ea21b92cdfd98421ec9e2cca69e30e3bcbd50156c0a20d7ddd28f5bb95", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-06-27-693Z-go-gitea-gitea-high-compare-no-common-history-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d6146e3b2651aa7b9395fa9f3d957fa410d0383d6de8acef2cb33712bbf9f5a7", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-06-49-699Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-06-49-699Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-06-49-699Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "8f99568c74f81e4ca8969297d5ac79f29b77b8fcde49bdb826ee6f0430fc26cd", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-06-49-699Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "92370d4f6cbecde62fc9f0c928429377a1071c2537b8d03541c331427ad55566", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-07-19-703Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-high-pydantic-json-fast-path", "case_path": "/benchmark/cases/fastapi__fastapi/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-07-19-703Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-07-19-703Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "048b6dc2021dca125103e9e04a7b7ed59ce460aaf794d4afe623ecc3d2b31cb0", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-07-19-703Z-fastapi-fastapi-high-pydantic-json-fast-path-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5644225b9d8df41162e7df01f0643dcb445dfa01346fcdb970ac6416df5410cf", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-15-28-723Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-15-28-723Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-15-28-723Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "8d39779598871b3e18904a78c22e5cc2b029a44a1ad002736b96940be2f43ab1", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-15-28-723Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "abdea8eb6c30a09fca10af97753134e3c60214770ae77886a30a3c1c9e1083bf", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-19-50-844Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-19-50-844Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-19-50-844Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "2cad8126bd4ecc8b91d9680d0fe9153ce176a102d335abbcc0288dd964ac331e", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-19-50-844Z-go-gitea-gitea-low-schedule-null-payload-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a5a9efbb95313c286ab958b27e73ca42e3f3a683b290a361ee4a4aca477d3aa2", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-20-49-547Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-20-49-547Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-20-49-547Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "39504030d0a2c63e397de78d00b6cad88795f8412cfe2cbffe53d96270ad9247", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-20-49-547Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c9e009bd083591b967959673d81119f543c8a0a3c72a777436574227d2167cc3", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-25-50-888Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-25-50-888Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-25-50-888Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "aac2547284beb736da93446212c01c44b4890f5c0a5ba73a3cf5dfddb362b524", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-25-50-888Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "88204bbaff68d653b08ea6a8a90b43fa8470d3cd6d147214d0aabd7e73672407", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-26-12-384Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-26-12-384Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-26-12-384Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "e1e3429255bc657cc01aabd1d699d185a7aebf10a88879f2210f059783bc6f5b", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-26-12-384Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "20aeaaa1ae0c84fb8d1d6f93cf97129f68473f76646529e856f31a6827e3f639", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-26-53-277Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-26-53-277Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-26-53-277Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "5aa24e55c26608734ad7382c856b17e58dbb3ca9df189bd41d7f9bfe561b26e9", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-26-53-277Z-go-gitea-gitea-low-schedule-null-payload-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "74d3b3ae649b977fc55a4d11ddfc718f985da2d3b8480d892d8511933dec2099", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-29-29-244Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-29-29-244Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-29-29-244Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "15aaffed59148400cece2f83d17c2830eeea40c59ace2ca0ebc5c2b025602104", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-29-29-244Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a8855b47e5b0e6b80fc11fdd1a60186c7b7774e498c917a57e4e1030123b968e", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-30-21-542Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-30-21-542Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-30-21-542Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "7e9d25a41d03d90e7368cac8d1942bb429b75d3f98024ce04ab888316d187122", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-30-21-542Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0e396820f174a52676cbbaa2340c6f877200100b69cad1d5ebcb9d30199250e1", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T14-31-44-400Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-31-44-400Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-31-44-400Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "0e2619022738d6d7118e7049adb1d3850fdfc768e8f4900e73c35bb1ac418581", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-31-44-400Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "cd9ac8d12faa3365d3f0377e59e53ea8ee54331f2e5cfa282be5481c7bef66b6", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T14-33-26-828Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-33-26-828Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-33-26-828Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "d43e16ee9528fb72946ea7f64c12515872335b22030beb044e629780cd71a35d", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-33-26-828Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "2849b3dcd6f4a513dac93460761c86436634328004c5cad5b0e0650dc8b63acf", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-40-00-506Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-40-00-506Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-40-00-506Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "50f52c2e75a51d12717f06757837676ba68b362bf4267602346fda2aaa7a9d96", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-40-00-506Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7eb7f42be8df1c1b08b653384bfffb2d197b70560b1c32cdb21174efa45b0062", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-41-04-993Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-41-04-993Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-41-04-993Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "bac8e69bd19cd5edea606587179466f82d56626bd821ae6f67e60ef374f45f58", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-41-04-993Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "55751c449c08723a1c686c9cf6c711da47ca9bf38d83140e7c7b50d066e14257", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T14-43-15-454Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-43-15-454Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-43-15-454Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "5eae6c365c22b1472c23b2716da4bcf39b10c94b8569e29455608c5d481a70ed", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-43-15-454Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "45574753386db99b42dd5d48f69fa4443fc47ce77972007af9ec4e79f0402912", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-46-26-811Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-46-26-811Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-46-26-811Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "dda6c5f0f8db471f01deaf11448e8dfa3aa40e49c719a8ac32b184840d0b061f", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-46-26-811Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7ab97333bb6d8547be2cb79353969c8b3781067540f2b211a0902dfc8ae97fd2", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-51-46-396Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-51-46-396Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-51-46-396Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "ede7974b8624405949d9c4ea56e9290da64b5a6f149bd5e8995f94345be18b07", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-51-46-396Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0b83f42f659a9adcf5cde807acb52bb6272309a27b539738841893b59f00f239", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-51-59-042Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-51-59-042Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-51-59-042Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "24f3a995a88e8202346f6f6336e5b0a2819754b7e8a52384471f2d9f1c9a8a77", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-51-59-042Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "785dcac6a3ce6a392dfb462ce07f21a1c5f4aa3ac9bc070a7c7eb7e541473b4e", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-56-56-702Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-56-56-702Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-56-56-702Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "2bb1e69a46289ff758ea576bc00f313f16107868fe595ee11034e01df08215e2", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-56-56-702Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "365dfb7c515afc394ea1666b848050368f6d67ad3462ad072f24cb5b492b1f52", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T14-56-58-213Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T14-56-58-213Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T14-56-58-213Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "48a10cd752031e5303dee89072c0af29ee5a2f0a0b5f81eaa75bca1fa4674f3c", "prompt_bundle_path": "/benchmark/runs/2026-05-04T14-56-58-213Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "2b601fc98e210e5480a19adf712a8f5b26304672ce9da43cf02e3cf0deb51152", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-01-01-023Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-01-01-023Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-01-01-023Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "d5a034e94595a9cea1b46a2d4e0d70c2a673afe9395eb8d44c107e884865051c", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-01-01-023Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "18c7eb2b8f0486ee7d23038e7cfe1ad922df2b6fc1381ed42d154ca40550a8a7", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T15-05-24-402Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-05-24-402Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-05-24-402Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "716bad4ae14d6c0ca4f19f6862b3e3ff8a9dfae4488e0ee6a9674dec7c1f5f81", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-05-24-402Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "40e7facd5b0189145f9ef2dc62db0b4251142fa06aae6bd1754a7b0430e86fb0", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T15-06-51-203Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-06-51-203Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-06-51-203Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "89b85e346c63fa81bb465b7aefdd08848139ad5d23c30826e277d105e34b2a3b", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-06-51-203Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ca9966f6172d26795302125c61586b71aef0633b2640e442b2b7f17e0f948f03", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-09-13-511Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-low-remove-vibe-decorator", "case_path": "/benchmark/cases/fastapi__fastapi/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-09-13-511Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-09-13-511Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "2cd461cb064ab3793f0b87688ca8afb453f5def9fec5798080b92276beaf373b", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-09-13-511Z-fastapi-fastapi-low-remove-vibe-decorator-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "35728423aa0b0bb4a65ff6e235e8c2708bf4b7ba79b032f530fed4dfa55995af", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-16-09-048Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-16-09-048Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-16-09-048Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "7fd1ca6f07eb93e27c293f0977742dabc1b0bd5f505b7127a3165cb4f926ed91", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-16-09-048Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9605d83f33f90e2a0640637492a8a5cef240cfae01a4a5c06683b93ca613f25a", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-16-23-538Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-16-23-538Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-16-23-538Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "527da2064d0e8ccca9e8c381fe4173facba34ac11564fad931395325733371d7", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-16-23-538Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "4b72e10ae31d398814d146f0b50a6fbc2408dc6f28161df9face620836df98fc", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-20-28-380Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-20-28-380Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-20-28-380Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "014470d23612b75194349f624b24d1573d8fa8127ff384b8677cc1cf56f2d507", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-20-28-380Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "86ad3c2b2fabdd3a486ea7ff4fdcbac82016408303a0da49b21afb98c87dc3eb", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-23-23-912Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-23-23-912Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-23-23-912Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "75a6cc3eb066c902fba9807fa426d5a9bbc56941a45b9083d47f73ddfef58480", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-23-23-912Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b5d9859afb006094bf2225365a27066221ddf3fb610b25454ae6a2cdd731fc30", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T15-24-07-522Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-24-07-522Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-24-07-522Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "4674b9aed9a9cd2c430c7bc9d56f29a610d2a895d4c2f75b4c8beb56fad886b7", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-24-07-522Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a527741c85eab33aa6a797972ecb734398a295c97921d851c419f47fd284776a", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T15-26-30-048Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-26-30-048Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-26-30-048Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "8b2ea516afc2684642a1edca14a774cd8c8dbaf1db7daa2f2c4a7c58c736115a", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-26-30-048Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "cff3f20ad0814f53f7597803851fb9459a7a09da0f3da699e0b7b3d4114cb60c", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-30-15-841Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-30-15-841Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-30-15-841Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "63bab2b7d4c03079702efd732db8bdac239a321a165194a7122aef6a7b02a8f8", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-30-15-841Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "fd08c584739e0689daf8b41549f6677f7172c48f1093f63f7b248a73d0b44b70", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-39-17-079Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-39-17-079Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-39-17-079Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "293c32ad807c69133c7dc9cc1ac2e520d96fb38dee11a226d59edc382ea71626", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-39-17-079Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "caff37f9fd318e8a554e732599448873365d96881f4668fd68f20147c2d33bb8", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-39-35-722Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-39-35-722Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-39-35-722Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "2a482007c755bf15c610812887e552b2ce9aaa6fbe76277e153dd5fffee4cffc", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-39-35-722Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "edb161d047ae34291f43a5fe638ca0f8f1dd238764356be28f79fd7fa8c86db2", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-43-33-466Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-43-33-466Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-43-33-466Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "f67d8afad81314e6919660fa9d95fdc5f6c2d5ed095d4d22e66f9138112168ab", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-43-33-466Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e8ecefcb7bf6d1d4a648f3a9b4d47e8574fd17c6bfb5f6f795ea13e048d37b4a", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-44-50-861Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-44-50-861Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-44-50-861Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "7619c957ef9c075215a9915aa89a282a6aa242e2bd8b2b775e076a7d5b445164", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-44-50-861Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "49e21e441aafc9a57e202e0afd6a2dc8370361a25fe114b1a93bcd6d527db723", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-46-53-144Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-46-53-144Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-46-53-144Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "110fb25d924cc72f6c61198bc4a4bf7cc14b0e5b783c97a32f0304e5667f3938", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-46-53-144Z-jesseduffield-lazygit-low-github-owner-casing-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "31163afa09d1a7f393447bfd832bccf5d66f5c8318a41c14d284399e4d94962d", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-51-08-930Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-51-08-930Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-51-08-930Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "5f8b9aed3ce432186928535304d06150c0a11b7193d9dc813d988d79f44c49c6", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-51-08-930Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3e865458363a7bfec81b59dbae1807457cb1d7dfeadffd6851f5e3114a4eae9b", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-51-36-407Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-51-36-407Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-51-36-407Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "9ea3382eab1db7ed20b5fce10145f59355df4dda2044b9554a2efa75591361bf", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-51-36-407Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ec677ba70b7485a2f95e713778ce73067fdec6630ce350563f662150182939ae", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-51-37-213Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-51-37-213Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-51-37-213Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "80a528ae691b346399ec92839cc1271633c707c95e3aca6b9a20a7c3c47a91d0", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-51-37-213Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "dcc4fa44ffe3e06b409cb9ccd98fd935ed49ef999b8078ce039df64c759d458b", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-54-29-654Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-54-29-654Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-54-29-654Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "f4376ffcc05f6612be1c4b63000e21a534bd8448fb667d7f7828dc98b746b760", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-54-29-654Z-jesseduffield-lazygit-low-github-owner-casing-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "64accb343dd7828940238f2c8debfa193bad4c3cd7ec8e22ebe360f2c47bbf9b", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-54-58-304Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-54-58-304Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-54-58-304Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "56cea117d4b85061d4d241bf9395fbe3cd6fccf2b0111d3e91f8a905e2a1b4a6", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-54-58-304Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3c32441a16998015097c9c02e545e0af4f60666ef3f60ff9c168345ad238e888", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-58-10-239Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-58-10-239Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-58-10-239Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "2652b00f2e3a749681ca6543491ddad2f31018b3f5a52de6c4c9750539e2fc02", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-58-10-239Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d53126ecb36ff6e54d1b3c4d91bac7c90d79ea3ef2e912928daf8ce9d47da0a0", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T15-59-41-373Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "fastapi-fastapi-mid-jsonable-encoder-color-types", "case_path": "/benchmark/cases/fastapi__fastapi/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T15-59-41-373Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T15-59-41-373Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "d12a803fbc6fff92318b5164b821f728774ee3c481a9aab094d4754a92467521", "prompt_bundle_path": "/benchmark/runs/2026-05-04T15-59-41-373Z-fastapi-fastapi-mid-jsonable-encoder-color-types-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0e518e0a861cc182daa5d4f2a0a8150967fdf7cc033a0ec5a791aed6fb992570", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T16-00-20-664Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-00-20-664Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-00-20-664Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "e45bcf68b5ca78c1eaefb685997e508617a491e1395cd91934a44ed4ce01da02", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-00-20-664Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "4291cb573c0bb1d6972c8978c78ad55a673ff0943641b3972ed9d3248ff2f115", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T16-06-39-472Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-06-39-472Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-06-39-472Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "b4dab85d63527078eb6687bbc6e899e7e54a5a0788256192a606747a79800d2d", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-06-39-472Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ed3373a9f7cd739dbee5e00cc3055d066230f714e06545f0dee6a0c0391a9e60", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T16-11-05-512Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-11-05-512Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-11-05-512Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "fb7bc2ebc8516d8a5015b54aca62c84faaecd20ddf00b89bf594f04819e334e1", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-11-05-512Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "bc376c39250f86eaa44e074647609b64698184738579503f27b95a4c0ecb0c03", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T16-13-09-864Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-13-09-864Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-13-09-864Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "1ea9e0430b77820554869116c2fc024ef7bbbbe297a264b2899a7c71aad3aa21", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-13-09-864Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6dd55fa78df624a1e1935ee43c1e4dce3ad45044b45763f3877ed13145845e2c", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T16-20-20-819Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-20-20-819Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-20-20-819Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "f3d51ab96e04840ca19478c7eb5d89373096141a3dce1495ecaa3f188e938ee6", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-20-20-819Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c8ccbfb134bca51df2f3ffea8c7823130de26acf3281f9f5d082d7d8c3bf4e05", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T16-21-22-192Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-21-22-192Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-21-22-192Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "9b03a7fed286337aed5cd9acd8217eea3614bb29928db4789585b18636ff9260", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-21-22-192Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "917bea938551b3907fead92f28d56358972f34ace0355dc470e38e2259268798", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T16-30-27-942Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-30-27-942Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-30-27-942Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "539943e6a843bb8ac9c433d5c5ab5bc0607ea8d05363fe86e70248c5bb77ab47", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-30-27-942Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "99f9bfd7aeb9f30dbbb08404a39b006aee53f0fb4f56b01fb8249e77dedd43d4", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-04T09:34:28.535Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T16-30-38-097Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-30-38-097Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-30-38-097Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "794922b2d23d9b48ed9a55b9b9190423c07b7e1e3f7b480ccdb2b865ba054200", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-30-38-097Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0c10a8f1818b036b91f5083152599bc45c1278df6ebc478f1f49ada42a6b2e9e", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T16-38-22-179Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-38-22-179Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-38-22-179Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "0140c61aae8dce51f35c54ea92ea4009b7ad551e7d5cce6c1eb5e5351830cbad", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-38-22-179Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "bf34605d167c779b319da819a36039484b549500f86a3cb3ee0b278ebc10a798", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T16-45-14-912Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-45-14-912Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-45-14-912Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "62eeeed71e768e32191b9a6ed29702fc890316d2ff05d65cf5723a18d20bd973", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-45-14-912Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f70fd0699c89a8b3b97f0269ec9233a2f9921c5e5a2983a15021679f9abb3152", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T16-48-34-392Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-48-34-392Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-48-34-392Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "8e70bbe3cc24dda937af07e54d10e9b0854275852db827e1f40ea55f0d1d5f62", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-48-34-392Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "8597a3d56a7c5242611abf0fb75cb53e47977802bdece98d7ae1ac5a64a2a162", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-04T09:34:28.577Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-04T16-52-14-191Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T16-52-14-191Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T16-52-14-191Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "c812630bb078ce34bf91ed2a2c05fed1339aae15c84d7e5c022612ffdd9d5af7", "prompt_bundle_path": "/benchmark/runs/2026-05-04T16-52-14-191Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3cba8349a03b52d8b761117f3a13614ef4fef80218b7eba93b0a5c54facf4d15", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-04T17-02-57-168Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-04T17-02-57-168Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-04T17-02-57-168Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "605c3d4c13bbce929b40cdf0e5d71058240b4ade123b85f25426a988aaca90eb", "prompt_bundle_path": "/benchmark/runs/2026-05-04T17-02-57-168Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "af8977cffe692c875aa9437f2e6fb0c03d5798211d7891cb4ea48abfd06824e9", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-04T09:34:27.948Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T09-22-47-152Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T09-22-47-152Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T09-22-47-152Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "c2b1c9abb755b7254b82451ac80830ce458a01a96da8dada9da6fe40f9714365", "prompt_bundle_path": "/benchmark/runs/2026-05-05T09-22-47-152Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "253d718d48753247a411bb9c1ccb15a50046fdc9cb1d6fb0f5e289dd27a79750", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T09-22-47-152Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T09-22-47-152Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T09-22-47-152Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "000f3bb6800b7c039003dbac5c1821815be3928b5cc3536fb8c448c29f3b51dd", "prompt_bundle_path": "/benchmark/runs/2026-05-05T09-22-47-152Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7a4514081297dba5de9cfff5e645be5418e0a255f0e2eb9f33fc51e51a174884", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T09-22-47-152Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T09-22-47-152Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T09-22-47-152Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "8f69384ed32a565aca833c4d802eb98f7933d65f063dc7c821f0f31bc9b7fa56", "prompt_bundle_path": "/benchmark/runs/2026-05-05T09-22-47-152Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d0532dea0e06eca4e788849327b84c993498dcb844a5655738bc73807f2a4b50", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T09-41-40-652Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-high-compare-no-common-history", "case_path": "/benchmark/cases/go-gitea__gitea/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T09-41-40-652Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T09-41-40-652Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "fd475f452cc4fb80b6644b5b27425235593be8ed974f2333e86bd41b4a29fca0", "prompt_bundle_path": "/benchmark/runs/2026-05-05T09-41-40-652Z-go-gitea-gitea-high-compare-no-common-history-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5169ef3407c45ed2cfa907c030b383bd3b577ce65720810afb319dd2eb8eec16", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T09-45-53-745Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T09-45-53-745Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T09-45-53-745Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "8c2f3e6f1123e7fa389864dfeebca372da85feba53a794e816c31da12caf40d6", "prompt_bundle_path": "/benchmark/runs/2026-05-05T09-45-53-745Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "bd47db57b8eb64bcda8d769a575c767de3b18e707f5299454c39fe24d4a0346a", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T09-46-37-703Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T09-46-37-703Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T09-46-37-703Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "6d892efca33c8937f7884a94cb14e1d3986286fb235b73999a21eaa0e1181308", "prompt_bundle_path": "/benchmark/runs/2026-05-05T09-46-37-703Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0e7c8b4bfb9613071037a3b6c03fb743ed3407f5ec559d6db3ec5e2ba4b5cd4e", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-08-25-022Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-08-25-022Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-08-25-022Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "dc3c14525602086ceea8d9a87d02f5eab9d6a2c73020123938abfd23cbbc4d2f", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-08-25-022Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d399d817b526461a6b00f2fae5943827c97ca8b51a5d492c5d9e82bd1a7118ac", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-11-11-097Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-11-11-097Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-11-11-097Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "e8c3cf255df1eb031c4817ac26ad1e5ac3ec20d2abfba77957272074f015ea28", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-11-11-097Z-langflow-ai-langflow-low-loguru-file-routing-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f1c52875ff5a7d083a2029a71c899daed1778d9a1b243314ea0b050c3e90b2a9", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-20-26-130Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-20-26-130Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-20-26-130Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "b9ff3775ea7582dd6116c991653d4a843ce42525bf1af1e0afa9dcf02a4f0d21", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-20-26-130Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "502a43a8a57b50fe0f59c36d0f7b2c1ed9a8b1c3267a7985fc268194d4cf0a30", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-23-59-829Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-23-59-829Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-23-59-829Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "439739e67999be6c5858f3debfd7237f2a26857ab33e22d3f1e99a43512628f0", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-23-59-829Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f0662fd83938dfe17e39a6a62d3c17e34f1d24afc4ac7ee716705ae1a9ec2b19", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-24-38-826Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-24-38-826Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-24-38-826Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "95b9863e38344e20b21fc483913e64e6053ffd5c069745885e857fc0a7ac0ad4", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-24-38-826Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1f4314615bef32747cdf5339a96506ddb2d68e659c37e05c988cae689210f26b", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-26-27-994Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-26-27-994Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-26-27-994Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "1f5ccf62d9327643964a88481b8de783456b07481f61dbe4e13f52aeba2197e2", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-26-27-994Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "acc16a6817d29f66f3b89b7b97dc1cfef2e76521902c6e4880177613f219dbcb", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-32-24-968Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-32-24-968Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-32-24-968Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "f530a88249557deb4ebf02a335b104b8a88ce6eb76006e38a93cc3ff356a5e27", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-32-24-968Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6adf3924bd822690bfb2a10dfc32ad8def2fc2322ee0867d91b26fbf72877962", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-42-06-619Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-42-06-619Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-42-06-619Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "84fddb5be7a6304d0813ee5c46cac5bb103078efc651541d184aba3cae9b6191", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-42-06-619Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e7df367f986c7c99d70a37d30b33c807191886bb4ee721581218f58b68368df2", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-46-39-868Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-46-39-868Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-46-39-868Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "5873906a4cbf33439ab99592de900f0be01e71d8e8d79c5cf277ad8cd4e7b042", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-46-39-868Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "252afa65005d05ea6af457d342a6568faa0bfc3c2d4bcd32fc22607fff9ac8d3", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-50-15-481Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-50-15-481Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-50-15-481Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "591349b8375ee6ce05892f03a0fb0b7cb809d7a92d66e488cb8598285b7635e8", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-50-15-481Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "be68e686e1848fd0a5432848389e3939bd99c7f8a75672bde03c10fbcdf24216", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-54-29-064Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-54-29-064Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-54-29-064Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "03b8b59d5629e0d7f7489dec33f9ca029c60ca1a87c7b1e60ebd7afa09365107", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-54-29-064Z-langflow-ai-langflow-low-loguru-file-routing-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c429f91418c16e37e16f6c29ca20dd8c26653e7cb1d1ccfc5ece3b02e08417f8", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T10-54-50-340Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T10-54-50-340Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T10-54-50-340Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "a31cf0c5a965f0ecd98412c1724acadb3f6f95f924e05704dca1f306f4121f9d", "prompt_bundle_path": "/benchmark/runs/2026-05-05T10-54-50-340Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3be1a3bdb5d1976be7273ad74760b4f8fe74fb508d847f5036492c0a24127cdc", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T11-03-29-404Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-low-schedule-null-payload", "case_path": "/benchmark/cases/go-gitea__gitea/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-03-29-404Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-03-29-404Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "1fba9b9e8dc05e772cfed64288fbd6ccbffe85a39cb844827a3db655a956c39e", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-03-29-404Z-go-gitea-gitea-low-schedule-null-payload-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "815dec395e87cbe1d24905a03962c806c5071e5461cd21ac8484e3afacc94423", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T11-06-43-273Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-06-43-273Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-06-43-273Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "6195fa6f6271b6f5ca26ed0629760fc770cf6def12069a380de5f2e49ef3363d", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-06-43-273Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5f18ccd8250daeebd102de44bfeb26b0d7ac9bf72454013c3b40e60395c4733a", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-10-39-066Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-10-39-066Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-10-39-066Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "3a177bb25ef3b6cd87ebe1e0d69934d96f9be7787342b5ca670d2c2ee3a191fc", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-10-39-066Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "94cc10150c98298b62b7f9ac84677cfe35755c82b8fdff9caf1cb4ea6b35f930", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-14-15-105Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-14-15-105Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-14-15-105Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "e9055c8cdfa432448664facefa54c779cccf6ee356271a8bee9f63e32426fc4a", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-14-15-105Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "75a18da594a71cfa704c65d24fc9a6d0f0b90969eecda598f3bf4021041c4409", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-17-10-462Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-17-10-462Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-17-10-462Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "61e7ac212e209b8f2e8c79ea7a8db94a58d5c9bb6595894dbcbfcd037ea21573", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-17-10-462Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "081628fdcc59c4c83b538993ed73774d3ac625d3fd3134dbc96caca012bcdb18", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-19-57-580Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-19-57-580Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-19-57-580Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "2dbb2817a9e70ba94b2fd9d53b4854c65b00a40aad404b0cb8591ac791b86def", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-19-57-580Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ab5455e46584a0c0569f131bd2fdc17b456026adbd7e518cbd67b8f1e5d736e8", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-26-51-345Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-26-51-345Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-26-51-345Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "876dd12babce08f1d826599efb247b05833968c18849eaf82ad5135bdaca6078", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-26-51-345Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "cc486eb6488ef5e727403210a3c1febfbb8282082eeb3762271905022e7289f4", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T11-30-21-048Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-30-21-048Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-30-21-048Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "107ae9155adc8bc3bb24d05c31df4bdda8918a006d748484d35bca23e1f1ef6f", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-30-21-048Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "dbff34b1b66f669672ec9eccdfae5e61cef92c0c88546ca506c64d62eaaae3c1", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-31-49-698Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-31-49-698Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-31-49-698Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "7dc5b2637f10f79add8819d2482d46aa35f68c60566ec8946df9e625e73e39f2", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-31-49-698Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "01d26a6d11107e26e2ec841d936a37c35a72d8c9c369c2ee892ccf6ac464367b", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-41-59-059Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-41-59-059Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-41-59-059Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "947cd9e8c309eca9ab48f1b1925f4d3f0c7e97d469260cca9d8a67ad9ffe1468", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-41-59-059Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f9c61c0b73c01e57889aa0871d394c68d602f74d9350b195626e1ac527c979d1", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-44-57-211Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-44-57-211Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-44-57-211Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "446065df3eec75071080574cdd0f08f62f6ae9a6ae624bbfd3eb76e1849a8444", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-44-57-211Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a5c05e7f95f0e65381bd7796f6ee5bbe08f3c2240b1971b350f7ce465792fb4f", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-47-49-234Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-47-49-234Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-47-49-234Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "fb7ec2a4db4302abc147f14a597e95c5ca447e1da044e08b823a12fec7c12b63", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-47-49-234Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "91fe797da09deed2c043218676add0b7edb696b1c5c70a304483060ef3f89c0b", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T11-56-21-380Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-56-21-380Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-56-21-380Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "aacc494b9af348d8c67c716176e0a49e512e93ae7b04712c235ac54025b0a7ea", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-56-21-380Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c191504fefa52d950f2cc1c13819f322df4b84808c2739028604bcfc0aa3d1f1", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T11-57-02-039Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T11-57-02-039Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T11-57-02-039Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "5036fbc277b4b3da7167bc42e6724e6db2e4d1bfc0ef47cee884cb4ae4d5c50c", "prompt_bundle_path": "/benchmark/runs/2026-05-05T11-57-02-039Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "30eb16fff5ac1c6129bd2d9e27715e3b1ca7940edc3e493aa4cecc42d96a3eb6", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-01-36-081Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-01-36-081Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-01-36-081Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "ebabd33ffb379855822cbbfdc0b88dbd007dabab13ae1023687f84e916a74369", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-01-36-081Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "51c46ff6e6a8260cc0d22745e8520750ce1d22e236ef2da9add1209530ba1587", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-03-36-130Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-03-36-130Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-03-36-130Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "46e3f2e8af3d618d2a4ce9368720636e62a366dce85d33490298857e3c079bc8", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-03-36-130Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "4a6c52770a86bf7c4e6b32414b3a7158d6d48f3bf41e4a33229c7349df220521", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T12-06-02-735Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-06-02-735Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-06-02-735Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "b746288ab00df69f1169110daebf62e73d139c5cd58ef53cb4dbe30dd36a0c41", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-06-02-735Z-louislam-uptime-kuma-high-websocket-auth-options-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "96ad4f9b99d26a533228904ca287d51e31569390f9c98d5c59501ddf45674ad8", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-09-54-292Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-09-54-292Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-09-54-292Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "725c5d8b9c08229911e4a19247759867c9c047de74e06b705268d1d9ac8736df", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-09-54-292Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1f058029a994c7e90a743ebc2ba33668509e99b40fa412036ba489b1aaa8ec2c", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-10-32-936Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "go-gitea-gitea-mid-pr-merge-self-reference", "case_path": "/benchmark/cases/go-gitea__gitea/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-10-32-936Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-10-32-936Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "12c47b796d3f66232fad76f90a6c41e4f539498405ee5fae432d189e0ee347bc", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-10-32-936Z-go-gitea-gitea-mid-pr-merge-self-reference-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "715a63db9350038b1690dce6ac37ac2fdb7fc0519f068ec1e1e2342329e63dad", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T12-14-26-504Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-14-26-504Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-14-26-504Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "e61ce682fa0e31c22adffeb488e86312b160b7bd181d2bf3314c8502172c1a76", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-14-26-504Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7e6a73d2614ca396f78000a402f6ef8c521244264c82c470b8298a77bd89a4ab", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-15-29-518Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-15-29-518Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-15-29-518Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "18217ed4cb281acb5ac2ea6a5161cde3d9a30e1c40538999b401b5be18cd2335", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-15-29-518Z-louislam-uptime-kuma-high-websocket-auth-options-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9900056aedaaf8f1b17cd9138b3473f8c05f00ac5c97d8bfa4ac1fdbd49ade0c", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-43-58-228Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-43-58-228Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-43-58-228Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "b1a0447b657360476531230ab7d23a75f628e351f019319d824bc5a460306dac", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-43-58-228Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ad8f24aacf1309add90960bfc603682b529c86b72e0e0ad15d2a92d14aff163e", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-44-43-035Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-44-43-035Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-44-43-035Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "2c733b6fe65b58f12b5faf4d42bd7b1ef9a01470c5623bc501b0bd8b4249872c", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-44-43-035Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "21cc85d9ad25b0d44f86189ffcd1817a7cf4d236507c19f38d5b15e5c07810b9", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T12-48-18-861Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-48-18-861Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-48-18-861Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "f2e079dd5b5aea7b1f621d7c858b11af6ef71a6e1ff2179b9ff372b02b3afe8e", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-48-18-861Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a2f098c39ccbc4d082cb568eb5a725c9e572a5c58982ca6e236566199c333f34", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-49-19-741Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-49-19-741Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-49-19-741Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "2aa5b8e07500291089c47720fd69b6a8c2a92db6ec0a1f94040bbb4282539e09", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-49-19-741Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e747c9014643db7458bf080d545ea3bba5a2ef89be19b951b24a170ea93c6659", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T12-53-48-830Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-53-48-830Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-53-48-830Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "7e474402d07038a46425a6ea3abb9d2f2c63f3200f178ad5ddc365b8aa186433", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-53-48-830Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "8b9d37197b5f7f56327e780d8d20e80f98ec1aa7d208dc063a7fff2829bbc2f8", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T12-55-07-132Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-55-07-132Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-55-07-132Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "1682cb7899e5ed68cf35503d949351378d40c22e14c229a2aac965ef7a917aa0", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-55-07-132Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0d0621f45b68278c9d4190db8387b208cc8d402a67059b328c2e7f34eabe07ae", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T12-58-43-181Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T12-58-43-181Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T12-58-43-181Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "ce25a66db45bbc81e619346f18360bee7ee7ef2c830d51919ef1cc0090c48588", "prompt_bundle_path": "/benchmark/runs/2026-05-05T12-58-43-181Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "770c662f93a69c756fe258d18e08f1bafa37cde7d15ef2db2eae100d61d839fd", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T13-05-12-680Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-05-12-680Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-05-12-680Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "800825d141731d3d124cd7b1b37a859bd20cc9ebb41b7ad20d48e88fb90169d2", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-05-12-680Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "2348a43767febc580ba7334a36265985cf05698ac4f4c158a09006fc3e00b529", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-05-53-131Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-05-53-131Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-05-53-131Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "a396da3b7b582465dc0fe0c0674efa916a5fc6e75650666a8498755578017d4c", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-05-53-131Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f23b437a24260ac5630daa2f5852bf830fcf8063dca211af7e1caadace3f0e91", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-06-18-273Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-06-18-273Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-06-18-273Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "e7be582ffd76882d8ce246f45bdc7a96fd85384e9ea1e1e45826145db52ecbb1", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-06-18-273Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "42d81999fd1483de4619a807428a698131753171d3703f9f757ea8e9e988f35f", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T13-08-54-906Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-08-54-906Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-08-54-906Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "63b93ef99c65c1d2783fcf3e9669cf5f142836e98fb6e676fa6615ec89e4691f", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-08-54-906Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6ae661bfa2f31b072980b872d27800f2e0a9accd2b8f065b1c2205192769cde0", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-12-53-496Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-12-53-496Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-12-53-496Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "4be7a8e8d4345aa0e408d3c05324209ebe58df99571b0db721aaff959ac22334", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-12-53-496Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "df344919f43acbdf48289d0685391d8dcede45db121ef4a0a003bef9bf1dc724", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-18-33-970Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-18-33-970Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-18-33-970Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "eb3c992659aabfaa734ce657de0f0d819eb0bb1faeeca7642285ed0a4878f23e", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-18-33-970Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f27d0ce16340edb5615be5970083af828f3e09c6f4dba52b373cb87d27538022", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-19-33-906Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-19-33-906Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-19-33-906Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "294d60f8bf95d6d5d4bfa0cef061d87af240ab6ba908522690e490d42ba8f6ef", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-19-33-906Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3c844c5b8eced43dc6e72d6c84c4fa50b9e7f901b903dc704a873e229d76e02d", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-28-27-811Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-28-27-811Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-28-27-811Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "b36e291d3e5d72d6a007a94ef22c1353e5584ba683c210078e29a2347f7ae0e9", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-28-27-811Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1910c73d740e907afea676a706bf549499308d23138a1a7c4e766a0c487b9629", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-30-49-989Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-30-49-989Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-30-49-989Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "c5b38374d84ba218378cd6c4e82010cb6c35c8a587e68aa09f9478d2673501b6", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-30-49-989Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "68871b0b9bd1809dff99f1230297a29c99f96f3e549bac62160b89dd0f161220", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-37-08-286Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-37-08-286Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-37-08-286Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "a908e8f2005a61075970f6822c08669b7fbfe1d4293962a3960aae9b75395555", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-37-08-286Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e42e438d6dfaed60328c212ba4a6ceab019218a96a73fce6279902dff52a5d06", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T13-42-41-836Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-high-branch-divergence-fast-path", "case_path": "/benchmark/cases/jesseduffield__lazygit/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T13-42-41-836Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T13-42-41-836Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "4f448fc0f62a3cbb97affc04403341d35e9e799b28a25a46e4c54e0188fd07d4", "prompt_bundle_path": "/benchmark/runs/2026-05-05T13-42-41-836Z-jesseduffield-lazygit-high-branch-divergence-fast-path-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e8bbc7fe7527e26938fee0e889a90216c738647027399b7af22e09b61289c4bf", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T14-01-02-636Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-01-02-636Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-01-02-636Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "7a86fafdadd17f4519ba5016b27c426087e0e933c4da7b9d0b89066aa48a1d60", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-01-02-636Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "991d36fb98f5ac5ef4abce492d8fc019cbc40083359e0240c0492d8e908f2ab6", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T14-13-00-897Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-13-00-897Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-13-00-897Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "7cae10d97f193494f4ba6d96b072dd5cac43696831d93e69f469b3e3c0dec2ed", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-13-00-897Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c12893d512c2cb5a2a9d5da818f45366880771d64b3a9ed92d28a12cfc7bb197", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T14-13-10-650Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-13-10-650Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-13-10-650Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "5b135678939e5ad621577e7afa9d500a5949bef6699d8dd10215bf21f5c88e16", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-13-10-650Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b1b06eb38a0718f5f1ab358ed6f93cd065946e3493b4f73363c2b83aeb108e99", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-15-51-021Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-15-51-021Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-15-51-021Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "4c3f3938745ee94202ce17cef256ed8d8b1cdeac2e4f0b868d561ad4d6dc5c81", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-15-51-021Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ce5d2bf32a9ddd7487ab08bc810bc7bb1efaf66e0d4b300ac7bf643f4c3f2ac0", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-18-54-003Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-18-54-003Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-18-54-003Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "3a36ba0a59aa9ce510d4eb780fdad0cf22b5ee9cee124234463a583cc408ee24", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-18-54-003Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "84792cfcfa61e40363a3ee9fa221fa04dcc9933f688b69b86592f9b4800bb9b4", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-19-48-105Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-19-48-105Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-19-48-105Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "866c2fee992cb35aa5092550c049adb92577a7a5de4b27e9e47ddcce46217c66", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-19-48-105Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e6d5fc1996b9823d86ddefc82604e08ee3982996040f15c0d01081af02420ea6", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-25-29-615Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-25-29-615Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-25-29-615Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "1bd702fce4ff084fb1eed3fa6ab4f9b3fc413ef43a601678a2b5013857dafdf0", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-25-29-615Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f1b3ae2ff7d69ff5dbb9da51e3401240d748c5362ef2102f79d094fec372783d", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-29-22-706Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-29-22-706Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-29-22-706Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "bad82d42397140b7003e29975893eed93c87d62501a6be52639c9bf35b74d178", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-29-22-706Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "94db4d0db26502334ba5d50645349cfb22b6bc7a97e85fbaccc06d9df7069735", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-33-02-697Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-33-02-697Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-33-02-697Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "b83d172834987a5f6a44b365ddedf4a80e20e8ebd6b0ede90cea4e07653401f2", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-33-02-697Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "78e01bfdf49ef42044f387999a80d5725c1fa675226371475576ccbe7e0fbecb", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-33-41-071Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-33-41-071Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-33-41-071Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "a95846ca1a56a5ce4c41674e1ddebac58277372780a8a71b1dcdc329e46ed676", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-33-41-071Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d7c12df86cf1dc8e4405712780449144c595f92f7e5a095eecf6bcc903b7cc2b", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-35-44-616Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-35-44-616Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-35-44-616Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "96f3438b0790a5e6c354d0d0cb415f6c9564dc1cdc1cb1dc18cd0405714be03f", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-35-44-616Z-sharkdp-bat-high-fallback-syntax-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "daf3a5fa2a9a7970066803258906ecd3139e12026d55d1958ba74306e758afc1", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-38-19-267Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-38-19-267Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-38-19-267Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "ac04f10bfed8ba1cf5a97279148f47612a5ab4ce1276f9c8616d6197dcb1b5e5", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-38-19-267Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "95f29717459b515bea0b8b72ce017ae32720f28d432fda1473488a5ec1623082", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-43-00-831Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-43-00-831Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-43-00-831Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "e4391657c03568f450dc98d51698865febd67b2fa086489ef882afbc3ac6754a", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-43-00-831Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "02417c1bd542ef7821922651f68941b1a8526d68491fe4f262af3c118034c82c", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-43-41-773Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-low-github-owner-casing", "case_path": "/benchmark/cases/jesseduffield__lazygit/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-43-41-773Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-43-41-773Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "6e2ff8f69f7e0ef6c6affe1efc77e1bc77d9b4df1f37fe94b1fbb7519f64194a", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-43-41-773Z-jesseduffield-lazygit-low-github-owner-casing-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e22dd6c9d493d0f611754ed9f7b7a8b85bf4ed5755494b807e49dacd10ea1e3a", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-45-04-732Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-45-04-732Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-45-04-732Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "bb9da3b509b269216c413cd6bd402519c67472707b2336afd31c687349e52b7e", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-45-04-732Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7b0ff3404df7411679a556f61550498c25f6a216ae08e8908f49ccbc05a813c0", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-51-45-838Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-51-45-838Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-51-45-838Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "f4fae2057fb026825e4cf7e4614e60ca7e1f5961a9ed902444fc659808b8f5d9", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-51-45-838Z-sharkdp-bat-high-fallback-syntax-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "8b314bf73bfac0b74a4a1d02743a8c383430469f82f08007e479e8abf18aa474", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T14-55-06-612Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-55-06-612Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-55-06-612Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "00441abb0b94873061c9c14f2b35394774b24f69c331f83d1b8af71ab9599d87", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-55-06-612Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "80db59aaf09318da10ee82965420c9fc90857fd39e04552c2d50bdb689b02656", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T14-56-01-600Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T14-56-01-600Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T14-56-01-600Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "a87847ad10271b5e7bccd7bc49d0a2866327b6e48b15257e9e000d4c6e71c4fe", "prompt_bundle_path": "/benchmark/runs/2026-05-05T14-56-01-600Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7f5fb875b04bcbb317dd63a0b0e5c624531620b0152ccfc61d1ff738df4a1489", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-00-45-916Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-00-45-916Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-00-45-916Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "0a2b26ad38445f547c99e60b134199457fd2308b68d073ecc142f3b5b9c64c87", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-00-45-916Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "214d423f16a73442b2dcf00f1e629a85142c817fc31e89c427ce60950a2f0240", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-02-35-943Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-02-35-943Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-02-35-943Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "a0340a5c0702907d2af464f58d4db4c5281fa0a384a6db89cfff6a376c15bfd8", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-02-35-943Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "35c5b2076c1cf7dfe05befffea3b6bac50e9b0ba37896b4bf195eb776d8e98e3", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T15-04-17-696Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-04-17-696Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-04-17-696Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "c28e58b75f2352d6fb432a230c6940b8c28a36a5ee779663460375dda29964ec", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-04-17-696Z-sharkdp-bat-low-zip-binary-detection-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0a55fa1050fb3c51ca1792d3db0572719e183b792935229682f1653860ae0588", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-05-47-369Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-05-47-369Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-05-47-369Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "64b3beea13d846a0127441e5763ed2a47fe58d4a7a8a080fb0e49d443324d868", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-05-47-369Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ef9840f76821f95a8e89da33a88b2a722e8104a513b0bb388d9521681c4c4e6c", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-09-39-765Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-09-39-765Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-09-39-765Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "a04f16862d403f1b883179b2b71b2c60a2a5653dd8284fb50cfff87457fabc17", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-09-39-765Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5499c9c0cd6ff917789298c5e67e9deb46cbb286ca0e9b9d134b6a0e1665bb87", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-11-59-164Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-11-59-164Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-11-59-164Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "0f8c4dc9445111fe2e4e5f09c6c3dfb84a6b3fd7c94cefdb1e7f5728e240fb01", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-11-59-164Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "41fe9d74f25e23e6703239051dc6adfb9557ceac7a85e73397be0582eb07bf51", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-13-14-829Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-13-14-829Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-13-14-829Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "99ea2b1123e4966287ac6219913bcef6d321072f31ed5f255bfd76edb0635235", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-13-14-829Z-sharkdp-bat-low-zip-binary-detection-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a0c3d2bbe2d42a8a511c4d5f92407875ed0ba0e8925534fdfcd9f3fa7ef8df2e", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T15-16-20-576Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-16-20-576Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-16-20-576Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "41673bb53a3df9cd0fcc997eed5c63f3809a4b8ff89b67aa17e273eba627166f", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-16-20-576Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "fcb702852cdf66ea3d6b36635fa9eaaa998bb88fbe6cd3f0431fbae5af58ac82", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-24-39-777Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-24-39-777Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-24-39-777Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "70881b3b81d89eddad366fff7188a8a592e4b59b21751d7171e44026af351390", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-24-39-777Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5c661d5d7cfc519a7394e67dff5bd48b52c65059d329e8f41e1d7b2391bf35b2", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T15-24-53-223Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-24-53-223Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-24-53-223Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "96db5d51ec3a7ad790d0d2e29685cc338258894d66e150676d57c07db4420920", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-24-53-223Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a0510f2786a5ac5576cbd54c9170c402b3a2a1394199bdf90c4b2c7649aea2cf", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-30-57-381Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-30-57-381Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-30-57-381Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "dfe2a01abef60826dfa76e7c3917630fd6729f72e274195b68753d92f87d9030", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-30-57-381Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b8858f4e0c31ae0aa373bf0b24d25b7bafe67603a28827186f14b32e309ff0a3", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T15-33-30-914Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-33-30-914Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-33-30-914Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "6af9ffb25cd023d552ece6c62722cc3c9a05232991df0d059dd8c7c24e3f0a0e", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-33-30-914Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "79d8724cdba1c6ace13a90169457649b000e125f06c9d49df5a8c3c5bd288145", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T15-37-20-318Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-37-20-318Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-37-20-318Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "b47a750ca96aa84a2b3b5f409b7c81505235d36a576221a88e0f87b16cdfef40", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-37-20-318Z-sharkdp-bat-mid-control-character-wrapping-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b53841f07907bf5126abaa1b88c463492236289eff3be14bee035ed55155bead", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T15-45-35-220Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-45-35-220Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-45-35-220Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "1deac61573d4d65b6b5175491e6db593e8138418748cd224c6874382bbf20b04", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-45-35-220Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "199caaecc2200b795b5beb8218f63ad9e5c65ea606270cce6645a5bc3430d3aa", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T15-47-31-640Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-47-31-640Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-47-31-640Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "f794244f26dea69564e183ef6fb6f0acff7f9121eedcb1f1d70304e84513d366", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-47-31-640Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ad901724ecd1d073fa2a618f2522555665f08bd8385ee97437757278d047fe46", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T15-56-10-889Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T15-56-10-889Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T15-56-10-889Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "656b1399eecee9dc76775a4a3d3f2b18fce23cc54307525742977a23802e873d", "prompt_bundle_path": "/benchmark/runs/2026-05-05T15-56-10-889Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "bf26643729190137c8fb7ebc319af9603d77447c20c1cfa861f560683b928083", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T16-04-31-008Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "jesseduffield-lazygit-mid-preserve-commit-message-whitespace", "case_path": "/benchmark/cases/jesseduffield__lazygit/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-04-31-008Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-04-31-008Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "61875c5f525981716d5417ae66b10c41205d2f3a8166bad14502664beec187af", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-04-31-008Z-jesseduffield-lazygit-mid-preserve-commit-message-whitespace-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "376547028de82e40e02332093f4514cb78f65c83651b7bea4b702878c3742ccd", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-09-43-987Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-09-43-987Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-09-43-987Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "0d0b0872f61b74d7f0a50dd9e379286feef08e592bd85e748ed6c9ec4d1f0f23", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-09-43-987Z-sharkdp-bat-mid-control-character-wrapping-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a391d366ea6b74b61dd7ec66e4c544bdbd9fadd6d708c797d8a2119e53495e25", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T16-13-11-768Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-13-11-768Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-13-11-768Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "b4e3f2a176655270698ef51591193c44d6ccc5a73856c61230a1e69602f342fb", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-13-11-768Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7b631bdf34a83c3e405547707a47b8e899eb0e5cded689b3d54a6d1f871ed0d5", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-20-19-905Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-20-19-905Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-20-19-905Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "7de034deb37b88c219102559f7c0c08a2ce989b5cea27be51403baec0d1539b3", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-20-19-905Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3e36ecfc15e5b100a8f689c00f0232a87c9ee0768f29423f30ed55f5e9b7622b", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-24-43-943Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-24-43-943Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-24-43-943Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "c6ba94b6c00240117b7f43d07b3035ee6440d14d3ebf6eafac4c39ff3ee81e89", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-24-43-943Z-usememos-memos-high-missing-related-users-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "aa79966ffd0f892003c8dddbf6f4fccab13b693bf4a3181310546992a6fb16b5", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-25-40-691Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-25-40-691Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-25-40-691Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "170300eea293ec71742289b3965af07b0c3498bc3aeebcf79547f260788fe73e", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-25-40-691Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "49ab14889489b5b4d2e495026ec55042a713736b9deaf3b7f84b4f82bdc324dd", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-26-43-473Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-26-43-473Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-26-43-473Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "3d8a07a5337f9ddf98891ae3db527981fdee022213507789d733604c390e21cd", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-26-43-473Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "8658bb6e49586697e1822041aae176f5fb2c9061ae846577d466893a088b8745", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-33-40-279Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-33-40-279Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-33-40-279Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "440264868372e61a59b4a51f1dee9987a3d58fabfc88c75a458777b215f4a40c", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-33-40-279Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "367d5b6f486c00faaba49ddc20906ccfefb7cd08988347e7b8b0bee4fdfab2a8", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-38-34-220Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-38-34-220Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-38-34-220Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "b2afe2c13291ff93fbc407d7b91b0166addf7ba15410dca0eeed4641c540c292", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-38-34-220Z-usememos-memos-high-missing-related-users-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f717c880af0bb8cca26dacdce6033f32426264661a1c5eaa0ac2d1b589153735", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-43-06-099Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-43-06-099Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-43-06-099Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "e5c4e39785e84a2d3634004dfdc21107be69ce99e4d6f3347b9ebcaac44fafc6", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-43-06-099Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1fb5d37f0080ec3289b23984a14d2e4bf547677e33e4c7a513b08dff92dc462e", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T16-54-23-472Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T16-54-23-472Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T16-54-23-472Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "07076bda30fcbf2c0e2d1ceb43c52d54e89375085242d2150c790a69f6a2f729", "prompt_bundle_path": "/benchmark/runs/2026-05-05T16-54-23-472Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f7c6e75a27f307af6b3bc8fdd05e39535812015e0fdc35a9904f2d9a48c915da", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-11-46-123Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-11-46-123Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-11-46-123Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "4c88b10d4ad8016bded38f5a9ed4422889949a6d93aba2b8a34cf2cae0cf9d61", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-11-46-123Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b2db5f0552af72da4507c25fbabf946c706422d0b284c8bb5f693a1ac03247e6", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-23-00-406Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-23-00-406Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-23-00-406Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "2f32a0ce1d31d6b2e3bc192fd54c3eb7ab77c3f7f1ac9d53fa0ae2b5282a6796", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-23-00-406Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b3e6121f109fb197cb3cd73154d43c61f30a046712996f750ddcbdcde7e1f9fb", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-25-08-839Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-25-08-839Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-25-08-839Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "711016aa579484941287ee3fcb880e80f2d1eb9383f270eee57cc2c98f6e8dec", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-25-08-839Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "bb11f9447e846804b2a3872dc1b5245f205c9648ceedda590399ba42cc2c6277", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-30-20-337Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-30-20-337Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-30-20-337Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "d5e364dedea54b7a049ad91dbe252892c23edd93df678fbee02def0b01e39e77", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-30-20-337Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b5a2681da156a6a36afa6fc2380dadcc2e7d23925f2e838a22d1477028ad38d5", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-33-02-447Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-33-02-447Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-33-02-447Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "ec117debfd9a4a86c37083c207ca241d03d4ccbab600e9eb57f24ec214915083", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-33-02-447Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f66f29d3e166b529c92d1345fef9639b5bd03ae99f92e09dcba3a33e00772b7f", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-34-18-335Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-34-18-335Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-34-18-335Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "4b8064777577b47608d678a8ea08d25b848ce6710f53c7ed30f87e18afc3cfae", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-34-18-335Z-usememos-memos-low-omit-internal-user-settings-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "44e6f3536e7bce82176eaf7283c585b86ec0aa9d600e7e1c2ce22c21c9d40c60", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-35-09-940Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-35-09-940Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-35-09-940Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "3f897c7fc40e3269b8e891ed3e7b8c992f1c1ee948a7a2f1de7552427c02b199", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-35-09-940Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9474fb0b1b7c1168c27178c3e8426b45b115407298a9297d546ffb0df8bae5f4", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-41-42-551Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-41-42-551Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-41-42-551Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "3b9827a336a878ec426a80bb276df262a651567392cf4eaf31f380e275af7263", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-41-42-551Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "af3d7817730e780f939c51819992bfc6f4bb0084e224ce4c2050d50ecdc2bf3d", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-50-11-689Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-50-11-689Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-50-11-689Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "00556afbaf1a8cc31e0da74d714946b5ec57b38d4bcdc549e76c20f1e3727307", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-50-11-689Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5a7c6872fd764f2d7d98112e6e6ccf69af1eb9ea94d986dae9519e33a1cea127", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-59-19-708Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-59-19-708Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-59-19-708Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "0cbc7c130ba517fbc067a4d81011a740acc091c3e38099146c829593edc578c4", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-59-19-708Z-usememos-memos-low-omit-internal-user-settings-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9c6deedf9ba6baf1b8d513bdc5273f0c79eacfb21c84123cd5b97b9a8539a0d8", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T17-59-20-147Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T17-59-20-147Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T17-59-20-147Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "667edd1ebaa81fe853ef454f167efea5430abfce659916e41dc258726d455086", "prompt_bundle_path": "/benchmark/runs/2026-05-05T17-59-20-147Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ad7e218d6a23baae28cfc48c4544094335515897104ac43bc01af864f2f39688", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T18-05-37-374Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-high-lfx-stream-fallback", "case_path": "/benchmark/cases/langflow-ai__langflow/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-05-37-374Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-05-37-374Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "eb7640d57b8471a09273a69c156b9a30bd9cf994b1ee214497ca5945080b5879", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-05-37-374Z-langflow-ai-langflow-high-lfx-stream-fallback-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c88c66dbe8cd4a1de5c183d4baabec9ca61cc2bf463a4dd981b65338a0876a30", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T18-06-57-044Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-06-57-044Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-06-57-044Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "dbdf253e6be9df30d1d57a29af412aa8686a8cbb8fc35adb4961749ac950ad43", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-06-57-044Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c805213d3a2c2ea04ea5cfb8bb16518320753a24d9d063786e223b43f23cf06f", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T18-10-47-746Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-10-47-746Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-10-47-746Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "959015373918ac675e08e8ddec4528c23d8dabaaa87b78ed6276720575d9ea42", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-10-47-746Z-usememos-memos-mid-mixed-case-user-resource-names-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "94e2c2daab274762d9ef9f0c8b512738d33c4641cda86df1b6ff1665f8fd1c5d", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T18-14-50-700Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-14-50-700Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-14-50-700Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "97a2df1bf63b32482a06d148e58924cdf33b6a86ad32e196e9c1b15773c57704", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-14-50-700Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6217682faf7e29caf07dbe771e340b487c32dea5e51707631ff679f998b4d2eb", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T18-19-24-765Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-19-24-765Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-19-24-765Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "54bf644a435d570fbf88e5e93ac4ab6094d330683101f94bfc4e420746363d24", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-19-24-765Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "617d1fab0e953defc6931dfe3c7d6fe3116ebb79ff4ca1426ee2cbe574e51745", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T18-29-27-975Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-29-27-975Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-29-27-975Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "fc315b264803dfe5b0b41d9a13c08a61b2bb66cda2f43477d5f1e697168af875", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-29-27-975Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5c29c2db52423bf55c7fe787bd30881feaf6dfcb97180ad9d27961c12bf13bad", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T18-40-03-541Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-40-03-541Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-40-03-541Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "e9294b84d30f05a809ab8ab60d622c7cae4eb07c0688d22f87392f250b78e5e8", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-40-03-541Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "33d58d5c0eaf19114d2e40ae32bb6afe42bd1daeecd2a55d7a981d1a70c414dc", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T18-41-35-514Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-41-35-514Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-41-35-514Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "573fb2d82cfeec66d907530474d1990786c982388f6756a6a046bb586c93dbc6", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-41-35-514Z-usememos-memos-mid-mixed-case-user-resource-names-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "831c7349b0f28591773cb2372dab05123fb0bb3c1703f7c518a933403edd6401", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T18-50-09-643Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T18-50-09-643Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T18-50-09-643Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "ecf1fca8e2bb78c888963fefd36b7d31643d38995fb788b75d66ddf2881997d6", "prompt_bundle_path": "/benchmark/runs/2026-05-05T18-50-09-643Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "4f5710c0dfc93bfe9d2255634cb0151df432c5db21cb7f4af727bb5f3fba78a8", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T19-02-52-447Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-02-52-447Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-02-52-447Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "2e6f5072e4ac3e4b40cb7a5a5f6ce576c60c7b50a0e99786234c3c3795eb9c3e", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-02-52-447Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a7d54df9446adb7a1105d112268d916306e262c674c7cc5fd93acee71345d8b9", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T19-07-35-578Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-07-35-578Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-07-35-578Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "44709ea51f49bfadcdf6afc2822673f05a27f3744b48b63f7dcf049aabe16d20", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-07-35-578Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1c85aed617c1f4b9d1e11b5f3c72cb731e4ca992fd48aa2abb6b8e68a665c02a", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T19-15-14-245Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-15-14-245Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-15-14-245Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "f7850d003646d7d0408b96b2038d1d2b21aa2f1310ce90548527a25f3539b560", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-15-14-245Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "72f4c5d557490f019f6d5625666ded1ca4d8c8adc011cdf38bc7c2ff4cdd4c01", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T19-16-47-761Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-16-47-761Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-16-47-761Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "7a8715667bbc8ab307f971cb6c6efaf28a9df1d377558ec9385e81d942b374d8", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-16-47-761Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0828a130fdbd8a82375e075a4d1b492f1b6e17d627b07ce042a477b2819b838f", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T19-17-02-131Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-17-02-131Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-17-02-131Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "fb5c9aedb19f6e6b7e6115f5bece71694cf7035dce062f151d8c471160b6e61b", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-17-02-131Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "70c7af956ed0e5f207309ecc3cf017102d3b6c2e727c77ae2413e3a1afbebe4e", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T19-30-48-493Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-30-48-493Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-30-48-493Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "c6c4b7add01b6a9fe582f8694f53a68e80bf3a3f059e435e320d2b06fddbb97b", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-30-48-493Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "85f1bc89dd66d01b4775aafbadb6752bbbc871ce3a26c4680beb573c6e9e1c56", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T19-34-09-616Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-34-09-616Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-34-09-616Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "caef78986a1ed8180d402eb9a438bcfd525e0c5a0307d721d7f060da0aaddc39", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-34-09-616Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b0eae7884e2e187694a9332095fd546fa8d4fdb3fbd7cee64d894646e7948390", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T19-37-09-864Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-37-09-864Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-37-09-864Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "e0337b0265adccd298198ad860e3972a6c4edd3eaeb0d6c0172d4480a9234b78", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-37-09-864Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "80d8da5966191cf6185140cf07aefab58ec649d002cd866146348ce8c7f26bc1", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T19-41-31-181Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-41-31-181Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-41-31-181Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "3c15eae99113bf147702e2ccb9b0dc907e623607603d3c01fefc769e9a7d560f", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-41-31-181Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3b64c31130a069a704bbe5812c5e32f4d3fa813ce9d90a53409582ff3bc9c087", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T19-44-50-429Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-44-50-429Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-44-50-429Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "ee3e6f4f0e52838dcf25911a4db6a6eb7d0848884a8fafad1f68488dab117947", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-44-50-429Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7583daaa7a746c6ef658da2ab7f247cd9da818a438f3bb8c84b77f7e2a3e1e29", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T19-50-40-116Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-50-40-116Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-50-40-116Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "6f4b8ec8e5e031179794842dd7faa0ec16251042791e287e0e927f22281a3e66", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-50-40-116Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6dd431311544a9214feabb93430d0b5d06b9330f7179c1df12dc0d795a766cf9", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T19-52-24-075Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T19-52-24-075Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T19-52-24-075Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "9e035eb0f61084d7d9f82b2872bc9434fc00f3f4a17e3b4f23bdba7eecb903e9", "prompt_bundle_path": "/benchmark/runs/2026-05-05T19-52-24-075Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0494448f662b102716c852aecbda6d5ec1f8353e430d76499112bcbbb271dc41", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T20-00-04-419Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-00-04-419Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-00-04-419Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "616e2225a4fecb3d8efaa83809e9cbdba60a6ea9d5296895f31dbdaf723d7114", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-00-04-419Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "95fe46698b32c9fcc5257d0d597598c63fa3a6089315c535d91959778bcc65f8", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T20-07-40-253Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-07-40-253Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-07-40-253Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "f0aa350f131b8229abf53f58b349dd70a6a7939d5a648ffef88e513cf4f84e7d", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-07-40-253Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "49d92c30fc6f188cb7f6bd58e3cf170089fb7b0a0cbffee9b99ab13e5918f689", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T20-14-34-144Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-14-34-144Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-14-34-144Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "eec1ce320cdb3f50816da848c985fb6501c4be063d427d1c42074278556b784f", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-14-34-144Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "40186e1ec32f4e91eaf3a1405ec6e4081490d8f818190ca135103e17c493b54f", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T20-20-42-536Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-max-baseline-attempt-2", "kind": "agent", "case_id": "langflow-ai-langflow-low-loguru-file-routing", "case_path": "/benchmark/cases/langflow-ai__langflow/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 2, "run_dir": "/benchmark/runs/2026-05-05T20-20-42-536Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-max-baseline-attempt-2", "result_path": "/benchmark/runs/2026-05-05T20-20-42-536Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-max-baseline-attempt-2/result.json", "result_sha256": "722279063b0eddc043e0d1d8a0d49f26187cc84c2328cc248411b04ac26e7fcd", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-20-42-536Z-langflow-ai-langflow-low-loguru-file-routing-agent-cursor-claude-opus-4-7-max-baseline-attempt-2/prompt-bundle.json", "prompt_bundle_sha256": "43b716a039b0b3571a6fc880e951fee3192a824d3f7d614add5abfe6cfcc7ce0", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T20-37-03-258Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-37-03-258Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-37-03-258Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "453bc08e80f592b4f6ed4a014c9a0c48c78c62938043811a54c403ce3f298df7", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-37-03-258Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1987a1f9a82459703acc971fec320892cd86ceae09b03f4bbf9d7c4e246358e1", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T20-42-10-805Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "codex:gpt-5.5:medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-42-10-805Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-42-10-805Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "841929e7e90d2d0e8d89e7db5ccb55043e7d894dad3de84bc0b9fbca1631ae11", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-42-10-805Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b492dc1bde29e06ecabbc4b7a319fc6abadd3789d645a55578e233283430a291", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "medium", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T20-44-23-344Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-44-23-344Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-44-23-344Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "c289e53a9615c42d5c6593c6817502b319a0d4137553c872cd759816590330d2", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-44-23-344Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a2617b094dee5165e99ae8298e96b84e885275f009551b0efa3a8613b13a1a50", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T20-47-57-688Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "codex:gpt-5.5:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-47-57-688Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-47-57-688Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "daf631ff258fc5f637ddc6c026d03ae0bcd600698b47a40f8d6297eab4dba4a6", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-47-57-688Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "fccaf68e9778b47f36b1ecefa3a9bdd86fef31b2a747fb27c2e3c19ec02ffd26", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "high", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T20-54-10-429Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "codex:gpt-5.5:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-54-10-429Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-54-10-429Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/result.json", "result_sha256": "4b0ec25e7ed54c108bae3f52d6763f53398da13478f08d0922412ec0718d5a3c", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-54-10-429Z-vitejs-vite-mid-deno-workspace-root-agent-codex-gpt-5.5-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1496a526878f8ef746f8ee6962087009e3f24b751ded4df373e98d54f49443dc", "invalid_run": false, "harness": "codex", "model": "gpt-5.5", "effort": "xhigh", "harness_version": { "name": "codex", "version_string": "codex-cli 0.128.0", "binary_path": "/.local/bin/codex", "binary_sha256": "baefc109b871e73a7bab298ee19b8bf73c8b647c4f8649a9794fc5db01db17b9", "captured_at": "2026-05-05T09:22:44.390Z", "raw_version_output": "codex-cli 0.128.0\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T20-58-30-348Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "claude:claude-opus-4-7:high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-58-30-348Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-58-30-348Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "02fea70caca5cc21ff600a7252f66e85a0272b52c56e25254f603a310b34d3a4", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-58-30-348Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "59132e54af09585534486aa34334afb1970142fc8c5e27450c6801dfa3e5fa3b", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "high", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T20-58-42-765Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T20-58-42-765Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T20-58-42-765Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "ee54ce5c515411e11587f3a8a77c93641e95e1d5c026e3585657669ffb61ccce", "prompt_bundle_path": "/benchmark/runs/2026-05-05T20-58-42-765Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7618c94ee5ac4eca3192da45d68fceafc0ffd59a3c9d9f1969b4198115532a30", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T21-02-24-973Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "claude:claude-opus-4-7:xhigh:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T21-02-24-973Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T21-02-24-973Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/result.json", "result_sha256": "67ab9fb33d9f8060a9476f0e5a31d35ff15883b9263107d306d4850797c834a3", "prompt_bundle_path": "/benchmark/runs/2026-05-05T21-02-24-973Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-xhigh-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "cffe47729dc7a2d756935762b0643726cc0b10db398ac3e25376b6a0c6572e8f", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "xhigh", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T21-04-25-284Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "claude:claude-opus-4-7:max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T21-04-25-284Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T21-04-25-284Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "032cff7d1d8b333291bd9881e66cec99d57e4b97661564417bb39da9757d214b", "prompt_bundle_path": "/benchmark/runs/2026-05-05T21-04-25-284Z-vitejs-vite-mid-deno-workspace-root-agent-claude-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "77260daff138772643a9ef6c938bd9726dc13eb3370442c815e620b87ce7509d", "invalid_run": false, "harness": "claude", "model": "claude-opus-4-7", "effort": "max", "harness_version": { "name": "claude", "version_string": "2.1.126 (Claude Code)", "binary_path": "/.local/bin/claude", "binary_sha256": "fce96968d275161ff65a4c19fc6434efc6973d9f6d35dc3992a2ba0553cac18e", "captured_at": "2026-05-05T09:22:44.430Z", "raw_version_output": "2.1.126 (Claude Code)\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T21-10-49-492Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T21-10-49-492Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T21-10-49-492Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "1e4f75afb0e4d3fd67133e6bf9a07a71d84136c2e49f014c256e925e8d77ec54", "prompt_bundle_path": "/benchmark/runs/2026-05-05T21-10-49-492Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e77a61dac74eb42f9fcfaf249556911d2240c0742b6c1feacafbe7fd9600cbf5", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T21-20-18-356Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T21-20-18-356Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T21-20-18-356Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "60a0244a7e5c3debded36afdad2455d74c50274d3201cdf667211a6c269d7835", "prompt_bundle_path": "/benchmark/runs/2026-05-05T21-20-18-356Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f64a021d0430fde2eaeec435df7a4c350552853da0c65cd5d5a1291d30596855", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T21-31-49-006Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T21-31-49-006Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T21-31-49-006Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "131eb775122782de2cb57f1b4c4f5e644cc64c63e7f8adf556231667cc58e0ec", "prompt_bundle_path": "/benchmark/runs/2026-05-05T21-31-49-006Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "eac9d7bad6ddc23851f8d681216f7bfe2088f8aa18302d40ee0734c69785833d", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T21-40-39-114Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T21-40-39-114Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T21-40-39-114Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "e04def897fcd174c1ae527af2a83b563373a8937ca3fcad7e355c36743e7859b", "prompt_bundle_path": "/benchmark/runs/2026-05-05T21-40-39-114Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "4746fb8f25d9706b3ee2a784f7036db8aa408f4533df4174b97665fded2771fa", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T21-54-10-917Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T21-54-10-917Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T21-54-10-917Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "9bffeb8159e5f5b9a38370495a423c4de3f97f3b22032fa242db6089dcb8c192", "prompt_bundle_path": "/benchmark/runs/2026-05-05T21-54-10-917Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "76a297c414b70923b0cde2b702eeeaea8ecf3f7669745e560518b1b7fead13e6", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T22-11-32-914Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "langflow-ai-langflow-mid-mcp-connectable-inputs", "case_path": "/benchmark/cases/langflow-ai__langflow/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T22-11-32-914Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T22-11-32-914Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "345de97e44b65f19d2e6e8ef3abb367a47be12a220a9eb79697b5d0942acbc95", "prompt_bundle_path": "/benchmark/runs/2026-05-05T22-11-32-914Z-langflow-ai-langflow-mid-mcp-connectable-inputs-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9f2ba2c1f6a2c8eeb44c8931660b243420bf78ccc107569701032497a3ddf475", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T22-37-00-981Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T22-37-00-981Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T22-37-00-981Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "5540385df859c164cdae5aa3dff657cdeaf634396b018d001e3cf7af0449bbb3", "prompt_bundle_path": "/benchmark/runs/2026-05-05T22-37-00-981Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1b9e9d9ab441b3543eb6c736fc35b096470188e2384512507185007ada72429a", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T22-40-19-753Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T22-40-19-753Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T22-40-19-753Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "51542ebda4cda49839ef58afb0327ea5d906d6d7a67866f3350fe3f770e2e1e1", "prompt_bundle_path": "/benchmark/runs/2026-05-05T22-40-19-753Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "fdd949b471836d7f2cae77e60023a199b3f5079e7abe59c5852132382167c38d", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T22-46-29-381Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T22-46-29-381Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T22-46-29-381Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "75df5a5ce965d2407dbe40d956f697f89012f206e3300ad85c2162fb8e5a839f", "prompt_bundle_path": "/benchmark/runs/2026-05-05T22-46-29-381Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d37943ebaf6fb81f8e65aa9046446526711b0db851860b6828e0e7f542ae788e", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T22-49-55-838Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T22-49-55-838Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T22-49-55-838Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "bd8b8335c0ee74877e3ec5e713bc10fde4b7df9dfff218bb1dcb5b96024f624b", "prompt_bundle_path": "/benchmark/runs/2026-05-05T22-49-55-838Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6ca39535af6016256b78dd36f5745a5a8c56cd555abf648124a21504db45f003", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T22-55-24-451Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T22-55-24-451Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T22-55-24-451Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "1c522a7bfc932d46af26158e05a0aaea93ad007980632779803ea63be659460c", "prompt_bundle_path": "/benchmark/runs/2026-05-05T22-55-24-451Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "02609977c691e8224b882a5db5e4b6aaf1e861ffdd26e78e55849f8ee464b2b5", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T23-01-47-632Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-01-47-632Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-01-47-632Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "74b93a036111a3f21852b0f2d7a94536ac4dc561f45628515a4260be1e8681b3", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-01-47-632Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "2f773549ada8a7e96a9a3aaf885356684fc970dfcfa1d3bbb2d84823fc0af4ba", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T23-07-37-868Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-07-37-868Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-07-37-868Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "3c7719dd86e08eac49b009c4d19fc50397c0d332482c78a32afe9a8211d582d2", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-07-37-868Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "18b479a698ce86cf6a7e6d7182bbc61be681cd3a7539547272c4726075e629fa", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T23-15-24-097Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-high-websocket-auth-options", "case_path": "/benchmark/cases/louislam__uptime-kuma/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-15-24-097Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-15-24-097Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "e4f4c70e7c102fd180a85243c1a4f8e3fa297015070e30a4d377847dd58ce133", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-15-24-097Z-louislam-uptime-kuma-high-websocket-auth-options-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "84f1deffe2e409e2f82b1d37a2425748a0a36487058104409a6e073627c66a76", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T23-38-15-456Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-38-15-456Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-38-15-456Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "5c4d167c9522bb738390ff20d6d79ef31bc3835cda00377e13f86913102bb055", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-38-15-456Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "52c97aeadc27977a10ec3da19ee7a9d1b402053ad805df13645a087630a50e78", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T23-41-00-308Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-41-00-308Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-41-00-308Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "00956345a2313cf673949c2ff445bd947330d7ccd6a293f80591b1de69befe11", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-41-00-308Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "cbba848f2586fa3d930497720ee8dff5b8d1a261ca2da56c387c73f8aaa7558d", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T23-44-03-864Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-44-03-864Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-44-03-864Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "efe5501f421eeaad1e7207a67ec1438d83a363810fb5e9a66ca8b27f1ba26e94", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-44-03-864Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ec471fa5cdb6c49c399276715f06182e2f79077398dd9817589612f061e1cf25", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-05T23-47-26-479Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-47-26-479Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-47-26-479Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "415673d340bd40fb2fb721a5df835f5c36cf3acc83c8f84db2c2c7d927557234", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-47-26-479Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e95140efe9e623d90b3a330523f0afff844236e44b179a43c871d0d16ccca48f", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T23-52-23-016Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-52-23-016Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-52-23-016Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "1006443fe24a770995ef38f5d4d186cf3a7ed9950d6c24d6591b115ab947354b", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-52-23-016Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d62b2f3f299e320c1215c93e3d878bfaa8177c6cb879a88150fa0642c370a2cb", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T23-56-42-001Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-56-42-001Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-56-42-001Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "4c77b9838c0999a2feaaacda9b1c5b9171e4df188b51fe80ff0bcf703d1753e3", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-56-42-001Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "caefb6205f50beeeeca62612b05490b1da9fcba87c6a0020fd0560f6c2227afa", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-05T23-59-16-088Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-05T23-59-16-088Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-05T23-59-16-088Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "62c970d3b53ccdb3df925c2cdf1a5099f4e374d7e13b62a30e9171e0bd523b83", "prompt_bundle_path": "/benchmark/runs/2026-05-05T23-59-16-088Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "40a71293ebc37cab2d24cc8aaa905e09e693cc9c0cc1ffeec47fff6da6d569b8", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T00-03-18-572Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-low-submillisecond-ping-chart", "case_path": "/benchmark/cases/louislam__uptime-kuma/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-03-18-572Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-03-18-572Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "9d5c468722270f387dc457da0896facaf4c69bc9867b6f81aa87daf76239ea35", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-03-18-572Z-louislam-uptime-kuma-low-submillisecond-ping-chart-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ebba31b9a06eecbed3d38dd9a653f32e1e10c7b907014ef632b8afd52f6c4325", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T00-09-54-600Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-09-54-600Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-09-54-600Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "2fd77a07a4b0da277c1c168041a20b1082710626bfba64879d0c3027c1dfb9a3", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-09-54-600Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "72bc55a1d24bb0e388fe380c76fe4f7915e4fd131536e210de94c3e3b70932c1", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T00-11-55-794Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-11-55-794Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-11-55-794Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "628d4f6666db8bb8e1c6ebf5c87cc7f51ba3630f661a470678d35644704e4688", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-11-55-794Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "69ed1d7ef4de1c234277985998c85f0344db184edcbadc2a3f9d4d115aace427", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T00-15-59-393Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-15-59-393Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-15-59-393Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "e75a908216f46e4cfe763cd47b633e5cf76350600a1baf6af57ff3348d74a93c", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-15-59-393Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f33dd7a19c6872509ca8c9ed35b2ca513710be4c56ecdc808623e7778d0ab8cc", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T00-19-09-994Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-19-09-994Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-19-09-994Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "b802718faea6be43dbc18482e0a9105659041975f3434876ce5db5f1d93c0d25", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-19-09-994Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3419c5dac32172bac13e945df3786c1e54c986343f19785ba20051ff2ee82fac", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T00-23-28-753Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-23-28-753Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-23-28-753Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "2dd7d36535de553ec3c379b8c764583f00b2316467f543082701b725136b1ea2", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-23-28-753Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "15e4b897b69738a4bf95d80da902a3a140c995ed1da536276e25539c15b8c9f0", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T00-27-14-318Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-27-14-318Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-27-14-318Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "ad4dcaf4df12442233e45ed6d57020a6e2c8e5177fe89a30aa9ca9295313ba6a", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-27-14-318Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b1053916b80f2d64b3f5a590760fe26b37d3a3a44c09143a73da0f076b74825b", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T00-35-25-801Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-35-25-801Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-35-25-801Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "b17544fcb928a2c6d1584330e6b83bf32b489959c2dc5865e51241657882f1fa", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-35-25-801Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a41d1adf76a82b57becdf9ec7353e6aea225d3c991c9d26a9be36ecd142ae80c", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T00-45-57-098Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "louislam-uptime-kuma-mid-uptime-cleanup-buckets", "case_path": "/benchmark/cases/louislam__uptime-kuma/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T00-45-57-098Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T00-45-57-098Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "3230a07707fe63caafcf1989ee37b2ec6bc3d257281671cdfa0ad01080b038cf", "prompt_bundle_path": "/benchmark/runs/2026-05-06T00-45-57-098Z-louislam-uptime-kuma-mid-uptime-cleanup-buckets-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9724e2094903eba3d3c108af0b8e1afe8062e7fbec64561f7ed4efd3c9e6186e", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T01-05-40-568Z-sharkdp-bat-high-fallback-syntax-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T01-05-40-568Z-sharkdp-bat-high-fallback-syntax-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T01-05-40-568Z-sharkdp-bat-high-fallback-syntax-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "d7e57adc07ec7071e2cc3d9b6b1f5b6426de4268808e150eb413abf3e2969e81", "prompt_bundle_path": "/benchmark/runs/2026-05-06T01-05-40-568Z-sharkdp-bat-high-fallback-syntax-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "158b68def73317996df56f85b8dee00cb74b368035cfa0a4efb9dbea0a0c4faf", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T01-08-48-740Z-sharkdp-bat-high-fallback-syntax-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T01-08-48-740Z-sharkdp-bat-high-fallback-syntax-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T01-08-48-740Z-sharkdp-bat-high-fallback-syntax-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "6659f835b1419d9aadb9bac9b6bff9643a15711a71d49ff2ce18dae6c6ddd555", "prompt_bundle_path": "/benchmark/runs/2026-05-06T01-08-48-740Z-sharkdp-bat-high-fallback-syntax-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a62144a18dbdd9bbbf241aefa614f6b0ec11c94c125e5b64528d9d0260a3e963", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T01-12-10-147Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T01-12-10-147Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T01-12-10-147Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "bf160139da0bff03adebe112a23a92e6f92db33d347d628bc7c3a1c1ecc245a4", "prompt_bundle_path": "/benchmark/runs/2026-05-06T01-12-10-147Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "69b28aec37e71f523b91b8273ddaebac4b3b85f87fcca6a2819a1e3f8576f05f", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T01-16-03-131Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T01-16-03-131Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T01-16-03-131Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "d93b8ff7cbfb614cea2d82d9a66a38a85b7f64e72949a4d96923c265db51b110", "prompt_bundle_path": "/benchmark/runs/2026-05-06T01-16-03-131Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "34b0870f1ef8a940b14be807218a22104c03d01bf2b953c3f4c0bca7cc0a8e29", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T01-22-45-664Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T01-22-45-664Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T01-22-45-664Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "76f705b84998d218c73f64726efbcf2e14088d348a98d20070c24f71d89d3350", "prompt_bundle_path": "/benchmark/runs/2026-05-06T01-22-45-664Z-sharkdp-bat-high-fallback-syntax-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f04c2896b6bac7968949e4abb2a59f5b4ccb30c1642183083bf96cd57ecbfa99", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T01-28-18-079Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T01-28-18-079Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T01-28-18-079Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "96210896c405b0bb0ae135c29a5e72ba7df64d3693d84ea631daf73c3eed70a3", "prompt_bundle_path": "/benchmark/runs/2026-05-06T01-28-18-079Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "573efef4069c27ceafd4bbe0ceafc974d358efb9d2c695d1813d8ccb22726a55", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T01-35-31-595Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T01-35-31-595Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T01-35-31-595Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "06fa8fe13574498c5f73763caa4dfd26c7bd0755560918e1af0384190531f6ed", "prompt_bundle_path": "/benchmark/runs/2026-05-06T01-35-31-595Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5e805a5295e18cf5db8138a667656b69a5cdfba2bd56395a3d9276b9764c05dd", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T01-46-41-920Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-high-fallback-syntax", "case_path": "/benchmark/cases/sharkdp__bat/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T01-46-41-920Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T01-46-41-920Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "7fbad25ee174783569223ea792b4c2a5fb92cc141066acc7ae726de6e78096aa", "prompt_bundle_path": "/benchmark/runs/2026-05-06T01-46-41-920Z-sharkdp-bat-high-fallback-syntax-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0460e846a58048916e27736120cf4ed77e66f4176d9abbc462eee4b45f995c34", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-02-26-502Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-02-26-502Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-02-26-502Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "e8f787f21d116c625ba8c2c278776721639e41f9a4e1d7ce98ac892a983d46c3", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-02-26-502Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "deb037c25b5868e984f8219261581221a2c7cf21ac8d03c519d60623b6d6beb4", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-05-20-408Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-05-20-408Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-05-20-408Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "e452d828bbbf7d2ee289ff373068dc7b018ec35d6659684f7ed662348128a51f", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-05-20-408Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a356488b8ab1550de958922bf81cc57d99af82901c1f01a4da1581448e0b16d3", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-10-37-099Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-10-37-099Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-10-37-099Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "99a28a5bc0cf6bfaaa289a1fe10e22de6d40658a1453e4af60d45e007084e2ae", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-10-37-099Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "002ad7fbc126dad910dad648e490166698d49c95f6b1e21d29b73e946f07310a", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-15-06-970Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-15-06-970Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-15-06-970Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "4e03ab15b94e26c0ee492afba1856752319741f7d0293e68112e88909d48fb03", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-15-06-970Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "2369a3f40c53d9cc587704c179647593a72c80920b335d97b7378d211ea8469a", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-19-46-380Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-19-46-380Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-19-46-380Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "52cbb4d2b5aa1631ad4db4934afffd8ebfb9bd3183eb46abc25571f19e307ff5", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-19-46-380Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "83db1d6090a0288ebd40894634f17a9a5fa7d4634761853da21d1f96c00b1dcd", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-24-33-984Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-24-33-984Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-24-33-984Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "9ce238c0a1fbec84e339be5976580ab2e07c6320f1a5d5d6adcfd60af485ebd3", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-24-33-984Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5a7ab828c1a9325871e1ab6f2b877e4456c966731c7daf7b3259aacbd3fde974", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-31-37-223Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-31-37-223Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-31-37-223Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "74c88f2a99af174066a5b56a067b8c8a9d45dc5c77319f6a9be13f7be4d738cf", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-31-37-223Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c8dedfc0596214dbaab8c389ba11ef5076ea98e7a81a7d183d639bec926ffff9", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-41-32-737Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-low-zip-binary-detection", "case_path": "/benchmark/cases/sharkdp__bat/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-41-32-737Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-41-32-737Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "704009d00b1cc8b6c669a8783ab6cb9dd88dd9dd8b40b1a4fe85f6b83e0425bc", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-41-32-737Z-sharkdp-bat-low-zip-binary-detection-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "68b1f04a43f4cb2134027b79cfabbb70e56374b997143d0960286ee18b847c7c", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T02-53-44-540Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-53-44-540Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-53-44-540Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "cb2608b3edda108bf96cd36332722add37b310272134d8b2afb79255d23cb3fa", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-53-44-540Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ebd7e7216a45c3b7f8e795841babf5801f9e34d9e44d84e7f1a93a6e18081309", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T02-57-20-236Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T02-57-20-236Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T02-57-20-236Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "0ae06d12e338456ed90a638a01203b60069084b560c50482f34ac1e86c257234", "prompt_bundle_path": "/benchmark/runs/2026-05-06T02-57-20-236Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6688e95a1e33ece4206dcb1becd0f96e954d2b5b557cfc2e02d1c92f96cc0a30", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T03-05-38-898Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T03-05-38-898Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T03-05-38-898Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "8ed58220b2aa52ed054af93f153b4cf483e941d02e00a377fb00eb0fa2834deb", "prompt_bundle_path": "/benchmark/runs/2026-05-06T03-05-38-898Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9fcc4746d3e5dc0b05942aac2f02567c2c6a39156cd8dc34d176b1f95b7b02da", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T03-09-38-326Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T03-09-38-326Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T03-09-38-326Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "62e190e7c2b0af1062c9b6734ed64b98efc7096d7416b6ebc878a1c006a0c0aa", "prompt_bundle_path": "/benchmark/runs/2026-05-06T03-09-38-326Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "24685207b1d21f45ccb60238e13975237e183f02a65c427d000811222c20657b", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T03-16-34-521Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T03-16-34-521Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T03-16-34-521Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "2e68c6c5ecef7057483665ecb69c1a6d01179da03ffb7800dbe6cdfcac7132df", "prompt_bundle_path": "/benchmark/runs/2026-05-06T03-16-34-521Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c59ecfc3e959070e8006b43554092cefc09754240254b18c919c88d60fd73e15", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T03-22-02-814Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T03-22-02-814Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T03-22-02-814Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "a7f54eb5f560d9067dc7d985ec461c0fdf117c7217671ab5cbdf263c97b81c09", "prompt_bundle_path": "/benchmark/runs/2026-05-06T03-22-02-814Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9df0624cf3624012234ab624feed1151a25744529df6affd51e737d88d12b3ff", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T03-46-26-444Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T03-46-26-444Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T03-46-26-444Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "25c82507048f1b84f1271a06ef488506d6596c2b508d615960595d52b2983306", "prompt_bundle_path": "/benchmark/runs/2026-05-06T03-46-26-444Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "c63719f09ac899f45c06fd2cefb6a28358d43d66a13697280963c2752abb096e", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T04-03-59-744Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "sharkdp-bat-mid-control-character-wrapping", "case_path": "/benchmark/cases/sharkdp__bat/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T04-03-59-744Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T04-03-59-744Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "0cba80f8f0befac9495553778dd2485bc4c9215ff9a0c97285fca886aed720c3", "prompt_bundle_path": "/benchmark/runs/2026-05-06T04-03-59-744Z-sharkdp-bat-mid-control-character-wrapping-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "768debffd3781ff97d739242b9f3151144ee0d3383465dc388a9d3674da862c0", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T04-51-49-967Z-usememos-memos-high-missing-related-users-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T04-51-49-967Z-usememos-memos-high-missing-related-users-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T04-51-49-967Z-usememos-memos-high-missing-related-users-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "50fd5a52c878f724b61bdb0b42e9fcb6250757f6757202bbf537e1946891b89f", "prompt_bundle_path": "/benchmark/runs/2026-05-06T04-51-49-967Z-usememos-memos-high-missing-related-users-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "3050e9176d5dccd2efaac66ced2a1442dcb7546bcbd24a0515b0f8d00286ab53", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T04-55-17-021Z-usememos-memos-high-missing-related-users-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T04-55-17-021Z-usememos-memos-high-missing-related-users-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T04-55-17-021Z-usememos-memos-high-missing-related-users-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "7dd4ae57db1830306f5d6681d4c37e963312b99101348fdb04b655b89cf078bc", "prompt_bundle_path": "/benchmark/runs/2026-05-06T04-55-17-021Z-usememos-memos-high-missing-related-users-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d63d6ff1476917bf8dbc7a76dfb9d53ffe2ebf51a919c03f097b3446e63b8ad2", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T04-57-25-004Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T04-57-25-004Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T04-57-25-004Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "e4ae2294c397e446acabbec1e7cb80b37ad50ed710cf9184e0b1c3da29940cc8", "prompt_bundle_path": "/benchmark/runs/2026-05-06T04-57-25-004Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "fda7a76f11b4f274e01b0ccf381359dc0f77ec7bb9c005050aab152dcec71831", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-00-36-295Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-00-36-295Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-00-36-295Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "d2af031bbb3a0350a4021872fc7ceaa25cfea220f31db0071192b958d7f0df4d", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-00-36-295Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a89de1d4b8dc3bb218e64f8c05ace468ae2d5976464a492c8a74ca9a725b174f", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-05-12-324Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-05-12-324Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-05-12-324Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "7c6bb9ec05fda82beac32748c96bc948cc79a4fdf8d50d635f3e01ef31df5cd0", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-05-12-324Z-usememos-memos-high-missing-related-users-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e39d479af4a9196f97dcfad35bf49ae225e2c3e58f30ab15e98d2ff3196b33c9", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-10-11-181Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-10-11-181Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-10-11-181Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "9636cdee249f2c575eb645ec6bb9a220f86c22f4232b241205eb6caf7d12b46c", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-10-11-181Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "894ea82d75f7cd02d6955b5b0e3daa947e9a6dc1d7012791ed6e8a2642a1bb03", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-20-55-259Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-20-55-259Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-20-55-259Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "2cdd3b1ee36eca23ad6bd28574439732513ed5f31e21c787965b419fe4a05ac5", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-20-55-259Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d504249244df64bc08e291e60e743b2354e6d7940c58fc7b4f2f9f5b2edd0e9b", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-33-07-461Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-high-missing-related-users", "case_path": "/benchmark/cases/usememos__memos/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-33-07-461Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-33-07-461Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "6d55b48bca2c7ee87a12178abe76f7501f6299c86ba464c373822d199f288ee5", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-33-07-461Z-usememos-memos-high-missing-related-users-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "93a4bfddd9b196da1d45fbbf66fdafcbb612534e461716d63f64a9bd133e53c4", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-49-36-957Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-49-36-957Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-49-36-957Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "a9d78c02f8efdb6fae60bed1be5bc86822bff89a82cdbcc6972c4d5ea237377c", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-49-36-957Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5c6f4ecc00d15e34805aaaef2fac21cfcd44a5cc373bf0bea365326d091822bc", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-52-25-497Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-52-25-497Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-52-25-497Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "54c87bd486746e685f88b48582359a8cfb5aa72b8c9499c1fcddc0bfa51e5ae7", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-52-25-497Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "6c1df697b3de23f36e531d106b9f7df0ebe0d2dd10f2ac984d2ae0a90cca565c", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-54-41-937Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-54-41-937Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-54-41-937Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "a8ef9371961bd4a0e49c34f0a6b1bbcf7b1bd71077958730d413daccf4a0548f", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-54-41-937Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "444908efa5b2adc077af1085e33c507699b1056b6c1ec40b8baf5187123ff597", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T05-57-29-637Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T05-57-29-637Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T05-57-29-637Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "d849559d3071ce05dd30dbc01af0833a873b8d86dc572ad65451cb6d05451376", "prompt_bundle_path": "/benchmark/runs/2026-05-06T05-57-29-637Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d69d9bdd823049550f288cd75fb81bdbddfb3b1d66d6fe816c76ede0184aa5ba", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-02-06-913Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-02-06-913Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-02-06-913Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "c8a77fd5d17db285eb126c9a665a4b6e47dc84a62f26d48294a8475c61fca204", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-02-06-913Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "8e1114dcffd3526ee6867be89114e23c2ad672e73faa9f5f836b796c66981eb9", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-05-41-299Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-05-41-299Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-05-41-299Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "9bb9119a3466152ae0b9a0c2686ea636038be59d9af8e24d2b192e92acf0ee35", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-05-41-299Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "4e4a67604cff6bd0119fc586ae8341e3bea243d553263eae8a67d2e77e566af1", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-10-14-051Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-10-14-051Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-10-14-051Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "a842e93be5adf08e10d3dcffa79d8ed5c3ae98ed575977901d393ecba5c02676", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-10-14-051Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "cd45da3def987ac28cece85d72ecf1fb15d66e7fc67eeb7e2b2ef04fa1780adf", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-21-33-811Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-low-omit-internal-user-settings", "case_path": "/benchmark/cases/usememos__memos/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-21-33-811Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-21-33-811Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "1ef13a6d9fedfb8c7b66850957a8cdfae05606e67ff7de333be3adcb2997eed9", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-21-33-811Z-usememos-memos-low-omit-internal-user-settings-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7d0659d47e4f6bb961c3b54c032dd1f2236bfe4e79b138eafac34dfab2e776cf", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-34-51-016Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-34-51-016Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-34-51-016Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "9c003c98f16208810ff5cfbb0fbbe32214e06578a501d9e6ad1c0d52231eccd1", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-34-51-016Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e75a6acac10f13feb8f494843754be8c31d776f4021cc3cdc08801b5d34cf7c0", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-38-39-758Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-38-39-758Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-38-39-758Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "a024e9f38bce8b6bdce6dfca0427991acb28f3d445663169fa4af8bb6b24c8b2", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-38-39-758Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0482028fff6fe7dd01da1fd97247e9973f90bea3843fd1d6c713de2ae4c0f67d", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-43-18-350Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-43-18-350Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-43-18-350Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "be54626099765a0a2b37e6ed622b1e623629921e9418b91a66c045fd34ad0861", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-43-18-350Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "93eea49f6b7a75c261fb181a5ce75426796fdef6a229441e927fdea99776e351", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-47-11-624Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-47-11-624Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-47-11-624Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "461cde1565c1b2a792dcd272c90f57a83e0bfc0c6c80a8c0b237958dc2820864", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-47-11-624Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "97b1ec0d6358761f1ec94c37c5b4b4b0dfdf0d402cf11dc5aaca20c64f273160", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-53-00-697Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-53-00-697Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-53-00-697Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "c134717b3f2ba723fcf5508d34f84011a613e90f962a716e53560832be44a225", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-53-00-697Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "313c6909dbcc5298c6cb0201d9341f7330a21252c6d8c6fb904517422cd0201e", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T06-59-36-415Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T06-59-36-415Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T06-59-36-415Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "7833e9f548299cf607ec48381c2ffc5707613134fb505b5568ec23539e93b66e", "prompt_bundle_path": "/benchmark/runs/2026-05-06T06-59-36-415Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "b7cb4e8f0284bfb2142f66d796cc18db2dd31adda6c222e6573d3f5c7ac6df6e", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T07-06-13-380Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T07-06-13-380Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T07-06-13-380Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "3c2e9c7351ab58b2d2ccc82281fe81ee7d6fca34920f1d2cc09b868b259d0b56", "prompt_bundle_path": "/benchmark/runs/2026-05-06T07-06-13-380Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "60716156c288c4a8690bf740d2af16a336a0d6666fb0a053cb15cfd089b6d533", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T08-01-05-032Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "usememos-memos-mid-mixed-case-user-resource-names", "case_path": "/benchmark/cases/usememos__memos/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-01-05-032Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-01-05-032Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "d5a58f9e84ee5b7664e211614ba5e188dccb4f5b19ec711f2f1df3a2cf03891e", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-01-05-032Z-usememos-memos-mid-mixed-case-user-resource-names-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "a55a680c591bcb3d4a1853b9226418142fab989a5c681d371e78ba56e9a37cfd", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T08-11-40-698Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-11-40-698Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-11-40-698Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "def9993ded6858fe1c6aa86a6e1fd57a204d866fe40bf11eacc02e029a4032a6", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-11-40-698Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e1a52517a62c25613e948e6a0a07adfb80c27d4cf2eb334502e8ccf12500d587", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T08-16-57-699Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-16-57-699Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-16-57-699Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "14761d6a3120a09e5c367ad1549a726d528f654a48700780dcc1d2ec5cc4427c", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-16-57-699Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "82a63f0e891fd2eb3d7ced4f92b6146b451301590c10b3bbe6420dcd871e8273", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T08-21-27-747Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-21-27-747Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-21-27-747Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "0c411731a933e66cf02e07ec40eabd5fa5f4de53adfdd771e4d5f905fc7f733c", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-21-27-747Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "e259bad7847bacbc38dc5dd602db2101ac02ebf399ce14effef63bb5fa90aad4", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T08-26-49-502Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-26-49-502Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-26-49-502Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "587cb864dd57e68527bc4937d8e8fb53bdb78307422913574cc73f8c60661fe9", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-26-49-502Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "8748ffc5ec95445e4201277764400d1cd31031080565e7b35a5d255046d9a86a", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T08-32-44-745Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-32-44-745Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-32-44-745Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "56ad63a30152cb7c53a43a833a8962ef95be8b5f5b28f0baa3d062909d6c7896", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-32-44-745Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "8e371d03599d05b35f7bb04b2461488a068604b25342bebe72cedb3b057db03b", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T08-39-33-121Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-39-33-121Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-39-33-121Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "506d6b8d62c609a225f6387bb221ac4507ee5dd620257340d71f08153b2b0adf", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-39-33-121Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "55be624ce239bd5f825b71045aaf86f7b3775c2174c52a745243e461947776a1", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T08-47-26-090Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-47-26-090Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-47-26-090Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "276d9b2bb8a80693e52534078d6e467079a3981e85b814c3009e7e9d21296c9d", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-47-26-090Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f0f0d0b881ef94ca3fa777a9677c9a77f6dccda8e776b8dac5c3dfa885034cf2", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T08-56-42-610Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-high-hmr-patch-esm-sentinel", "case_path": "/benchmark/cases/vitejs__vite/high.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T08-56-42-610Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T08-56-42-610Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "179f5d65cb326da542dfe592f320cb5fbfbe34cc72b138ae4d61e1f5a04df540", "prompt_bundle_path": "/benchmark/runs/2026-05-06T08-56-42-610Z-vitejs-vite-high-hmr-patch-esm-sentinel-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "5d5c64ebea816180cd3e975ac59c88b0f6fbd8b4f0f28b24defa9332e15755d0", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T09-17-20-654Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T09-17-20-654Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T09-17-20-654Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "58100ddb6ec0dd24e8b59c36b817ccccfc1b6a2b793c251f1df219142eda78c1", "prompt_bundle_path": "/benchmark/runs/2026-05-06T09-17-20-654Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "79af8aee23ce835b77932df266688464e33c4a3913183878eb739d1b2ec5b270", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T09-20-59-381Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T09-20-59-381Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T09-20-59-381Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "1b5b834e6b770b232e17c95581475e56462fb731fa3adf97438706d4b533add6", "prompt_bundle_path": "/benchmark/runs/2026-05-06T09-20-59-381Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "ec4a29e25c0c0d5170e1cf889b40ca743c2c6403af583ad0fbbb59298719bb30", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T09-28-48-850Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T09-28-48-850Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T09-28-48-850Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "9e1a0d579d18030e778024c4f3bfc025281a513f3b44d43db982355e58a1ff74", "prompt_bundle_path": "/benchmark/runs/2026-05-06T09-28-48-850Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "bb734c51c602b1176037728a937a407aa3f3634a71d611f80960082447a15e01", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T09-33-29-636Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T09-33-29-636Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T09-33-29-636Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "2d50f32442645db73e5fa48a6a8adf3104627988fdf72b864c61b81d7c7eddb9", "prompt_bundle_path": "/benchmark/runs/2026-05-06T09-33-29-636Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "7a56cc82a6c9bf3b9aac99bf53528a0c50b4dd9a1bd19432711059a81b07c351", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T09-40-14-578Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T09-40-14-578Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T09-40-14-578Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "70f6b3840b852d3b2b0e9bdd36cf9561b9b4e91e317e465b14e29ef87bda3cfb", "prompt_bundle_path": "/benchmark/runs/2026-05-06T09-40-14-578Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d3dce992a2ae4edbcdf110bc9057540dcd4e446a7c26e38cd3c953df9379c0de", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T09-47-13-574Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T09-47-13-574Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T09-47-13-574Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "bf77f55ff704b66793c916b9862d0404f73c90f3d5d8f7ee50e83bb1ed781c99", "prompt_bundle_path": "/benchmark/runs/2026-05-06T09-47-13-574Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "bcace6afff31c74d258c6d6aa4b65ee1952501530134561eca9d90542ca3e973", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T10-09-14-870Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T10-09-14-870Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T10-09-14-870Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "a3d50f2267c11cd3dfce277171201e0048b17c244a9311424f6ad3f24d49e1ea", "prompt_bundle_path": "/benchmark/runs/2026-05-06T10-09-14-870Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0ceb6e5cf0e676bda5fe8c118ef36615532c0ff7bfe2d462add9eb031b245692", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T10-21-45-693Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-low-flatten-id-sanitized-chars", "case_path": "/benchmark/cases/vitejs__vite/low.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T10-21-45-693Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T10-21-45-693Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "80dd97c75089b46f2f81605b54792271ae9038c871d1b3ae9563081a806b427b", "prompt_bundle_path": "/benchmark/runs/2026-05-06T10-21-45-693Z-vitejs-vite-low-flatten-id-sanitized-chars-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "143076ca1b96c1d81b7143ec89b13ec74d94e468adaad61848076b3bbf23459c", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T10-41-25-110Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-composer-2-fast-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "cursor:composer-2-fast:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T10-41-25-110Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-composer-2-fast-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T10-41-25-110Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-composer-2-fast-baseline-attempt-1/result.json", "result_sha256": "7b59d9a8587e4c10e4498e1a7d9047b46967de775a0329df58c94912efa7c5df", "prompt_bundle_path": "/benchmark/runs/2026-05-06T10-41-25-110Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-composer-2-fast-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "9f2b18d6065e1a28641f1fa4b11e6ecc87243ccc9cb72b72e52e18711cf2167e", "invalid_run": false, "harness": "cursor", "model": "composer-2-fast", "effort": "fast", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T10-45-01-040Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-composer-2-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "cursor:composer-2:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T10-45-01-040Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-composer-2-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T10-45-01-040Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-composer-2-baseline-attempt-1/result.json", "result_sha256": "0be3e20c4439250b24ab83077274c2635bf2d99d66e3d787af486bfe324edeff", "prompt_bundle_path": "/benchmark/runs/2026-05-06T10-45-01-040Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-composer-2-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "59ad077acd60d1d0a135594aa6e52045a68c37e360130046853f4689c58c73d0", "invalid_run": false, "harness": "cursor", "model": "composer-2", "effort": "standard", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T10-49-09-959Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "cursor:gpt-5.5-medium:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T10-49-09-959Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-medium-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T10-49-09-959Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-medium-baseline-attempt-1/result.json", "result_sha256": "51013baba3c7a34649043a91b94727cb58930fe9e97fb519dbcf8ff75e007d26", "prompt_bundle_path": "/benchmark/runs/2026-05-06T10-49-09-959Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-medium-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "1ea463efe3718b2c979982b7aaf5c51e2f3430880d4c284660a6e33bd58a4c16", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-medium", "effort": "medium", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T10-55-23-050Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "cursor:gpt-5.5-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T10-55-23-050Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T10-55-23-050Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-high-baseline-attempt-1/result.json", "result_sha256": "77eab7c13ed8e20774ccb24dde41bcb7aec07d50827c5d4fa6362a35afcc2029", "prompt_bundle_path": "/benchmark/runs/2026-05-06T10-55-23-050Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "f332c9b7ba2d87de870113a28993be039a9384b6ee85e0e6e3d61c4b5a668d37", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T11-00-38-719Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "cursor:gpt-5.5-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T11-00-38-719Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T11-00-38-719Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/result.json", "result_sha256": "39f2c854bcb2c9ba9aa0055d80294def17c2db8f95678dae362c33c6fb52890e", "prompt_bundle_path": "/benchmark/runs/2026-05-06T11-00-38-719Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-gpt-5.5-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "0ab9c853dc5e656db799693fccec7b37532c16471adb68d54aa89e0f7a94dc12", "invalid_run": false, "harness": "cursor", "model": "gpt-5.5-extra-high", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T11-07-34-850Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "cursor:claude-opus-4-7-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T11-07-34-850Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T11-07-34-850Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/result.json", "result_sha256": "5454e346fb704868be481c9576392ffc53105202e855bbaf81d821bddb369fed", "prompt_bundle_path": "/benchmark/runs/2026-05-06T11-07-34-850Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "d9db5254f12bda8a1fe43ae56286eefaa80b230fb847813409198b66a4f0e0ae", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-high", "effort": "high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": false }, { "run_id": "2026-05-06T12-10-49-285Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "cursor:claude-opus-4-7-extra-high:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T12-10-49-285Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T12-10-49-285Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/result.json", "result_sha256": "db159eeefb0d08b5d3f20907453516e3fe015e14d8256dcd288ae1a6616f80da", "prompt_bundle_path": "/benchmark/runs/2026-05-06T12-10-49-285Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-extra-high-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "acaec8c05bb71e329fa411a7fad88ff41c6076993a0491eba47998ec4efcea41", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-xhigh", "effort": "extra-high", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true }, { "run_id": "2026-05-06T12-20-14-913Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "kind": "agent", "case_id": "vitejs-vite-mid-deno-workspace-root", "case_path": "/benchmark/cases/vitejs__vite/mid.yaml", "condition_id": "cursor:claude-opus-4-7-max:baseline", "attempt": 1, "run_dir": "/benchmark/runs/2026-05-06T12-20-14-913Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-max-baseline-attempt-1", "result_path": "/benchmark/runs/2026-05-06T12-20-14-913Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/result.json", "result_sha256": "fb03f420810cb8dcb79708be1c526be55c95b236360514b6e2cd46ca7bba18eb", "prompt_bundle_path": "/benchmark/runs/2026-05-06T12-20-14-913Z-vitejs-vite-mid-deno-workspace-root-agent-cursor-claude-opus-4-7-max-baseline-attempt-1/prompt-bundle.json", "prompt_bundle_sha256": "4f402e61d91a2b968378e918af8fee313862e47da0d376f216fb66e8bff87d38", "invalid_run": false, "harness": "cursor", "model": "claude-opus-4-7-max", "effort": "max", "harness_version": { "name": "cursor", "version_string": "2026.05.01-eea359f", "binary_path": "/.local/bin/agent", "binary_sha256": "8756ac4a808cc90b220416ac8743560aa473a94d6fe5911bb602c250c046c4a3", "captured_at": "2026-05-05T09:22:43.749Z", "raw_version_output": "2026.05.01-eea359f\n", "version_exit_code": 0, "version_signal": null, "version_error": null }, "success": true } ] }