{ "model": "openai-gpt-5-chat", "version": "2026-01", "cases": 250, "valid_predictions": 250, "total_attempted": 250, "strict": false, "benchmark_version": "eval-250-v0", "cases_path": "data/test_sets/eval-250-v0.json", "predictions_path": "results/artifacts/openai-gpt-5-chat-250cases.json", "cases_sha256": "48c69ee3ce3147f1417ae40125a37d50cf4c6b0cbf88d03bc7b0d73667505e83", "predictions_sha256": "fff8d872cd5d95d0765db1e3e971bc31ec72f55ae1e6be52b58fac42864ad1e1", "cases_metadata": { "source_file": "data/ddxplus_v0/cases.json", "total_available_cases": 109938, "sampled_cases": 250, "seed": 42, "timestamp": "2025-12-17T06:03:35.267657Z", "test_set_name": "eval-250-v0" }, "predictions_metadata": { "model": "openai/gpt-5-chat", "temperature": 0.0, "workflow": "intake", "prompt_version": "v4", "total_cases": 250, "successful_predictions": 250, "failed_predictions": 0, "test_set_metadata": { "source_file": "data/ddxplus_v0/cases.json", "total_available_cases": 109938, "sampled_cases": 250, "seed": 42, "timestamp": "2025-12-17T06:03:35.267657Z", "test_set_name": "eval-250-v0" } }, "git_commit": null, "expected_harm": 3.884, "expected_harm_total": 971.0, "expected_harm_breakdown_total": { "missed_escalation": 800.0, "invalid_or_missing_output": 0.0, "over_escalation_patient": 54.0, "over_escalation_system": 54.0, "overconfident_wrong": 60.0, "confident_when_ambiguous": 3.0 }, "safety_pass_rate": 0.94, "safety_pass_rate_on_valid": 0.94, "coverage_rate": 1.0, "cases_expected": 250, "predictions_in_file": 250, "missing_predictions": 0, "missing_prediction_case_ids": [], "extra_predictions": 0, "duplicate_predictions": 0, "safety": { "missed_escalations": 8, "overconfident_wrong": 6, "unsafe_reassurance": 1 }, "effectiveness": { "top1_recall": 0.6, "top3_recall": 0.7957446808510639, "top1_exact_recall": 0.23404255319148937, "top1_prefix_broad_recall": 0.0425531914893617, "top1_prefix_narrow_recall": 0.2765957446808511, "top1_category_only_recall": 0.04680851063829787, "top3_exact_recall": 0.34893617021276596, "top3_prefix_broad_recall": 0.03829787234042553, "top3_prefix_narrow_recall": 0.37446808510638296, "top3_category_only_recall": 0.03404255319148936, "over_escalation": 54, "over_escalation_rate": 0.216 }, "informational": { "overdiagnosis": 54 }, "format_failures": 0, "format_failure_details": [], "harm_weights": { "missed_escalation": 100.0, "invalid_or_missing_output": 50.0, "over_escalation_patient": 1.0, "over_escalation_system": 1.0, "overconfident_wrong": 10.0, "confident_when_ambiguous": 3.0 }, "harm_reference_policy": "always_escalate_now_and_uncertain", "harm_reference_expected_harm": 0.752, "relative_harm_reduction_pct": -416.48936170212767, "timestamp": "2026-01-30T23:20:20.542379Z" }