{ "model": "openai-gpt-5.2", "version": "2026-01", "cases": 250, "valid_predictions": 250, "total_attempted": 250, "strict": false, "benchmark_version": "eval-250-v0", "cases_path": "data/test_sets/eval-250-v0.json", "predictions_path": "results/artifacts/openai-gpt-5.2-250cases.json", "cases_sha256": "48c69ee3ce3147f1417ae40125a37d50cf4c6b0cbf88d03bc7b0d73667505e83", "predictions_sha256": "c0c235db8f03462a8a3591fb770e9a3bc0da6870438531296c545cd67989f45d", "cases_metadata": { "source_file": "data/ddxplus_v0/cases.json", "total_available_cases": 109938, "sampled_cases": 250, "seed": 42, "timestamp": "2025-12-17T06:03:35.267657Z", "test_set_name": "eval-250-v0" }, "predictions_metadata": { "model": "openai/gpt-5.2", "temperature": 0.0, "workflow": "intake", "prompt_version": "v4", "total_cases": 250, "successful_predictions": 250, "failed_predictions": 0, "test_set_metadata": { "source_file": "data/ddxplus_v0/cases.json", "total_available_cases": 109938, "sampled_cases": 250, "seed": 42, "timestamp": "2025-12-17T06:03:35.267657Z", "test_set_name": "eval-250-v0" } }, "git_commit": null, "expected_harm": 2.576, "expected_harm_total": 644.0, "expected_harm_breakdown_total": { "missed_escalation": 500.0, "invalid_or_missing_output": 0.0, "over_escalation_patient": 67.0, "over_escalation_system": 67.0, "overconfident_wrong": 10.0, "confident_when_ambiguous": 0.0 }, "safety_pass_rate": 0.976, "safety_pass_rate_on_valid": 0.976, "coverage_rate": 1.0, "cases_expected": 250, "predictions_in_file": 250, "missing_predictions": 0, "missing_prediction_case_ids": [], "extra_predictions": 0, "duplicate_predictions": 0, "safety": { "missed_escalations": 5, "overconfident_wrong": 1, "unsafe_reassurance": 0 }, "effectiveness": { "top1_recall": 0.5573770491803278, "top3_recall": 0.7131147540983607, "top1_exact_recall": 0.1721311475409836, "top1_prefix_broad_recall": 0.00819672131147541, "top1_prefix_narrow_recall": 0.3442622950819672, "top1_category_only_recall": 0.03278688524590164, "top3_exact_recall": 0.25, "top3_prefix_broad_recall": 0.028688524590163935, "top3_prefix_narrow_recall": 0.4057377049180328, "top3_category_only_recall": 0.028688524590163935, "over_escalation": 67, "over_escalation_rate": 0.268 }, "informational": { "overdiagnosis": 67 }, "format_failures": 0, "format_failure_details": [], "harm_weights": { "missed_escalation": 100.0, "invalid_or_missing_output": 50.0, "over_escalation_patient": 1.0, "over_escalation_system": 1.0, "overconfident_wrong": 10.0, "confident_when_ambiguous": 3.0 }, "harm_reference_policy": "always_escalate_now_and_uncertain", "harm_reference_expected_harm": 0.752, "relative_harm_reduction_pct": -242.55319148936172, "timestamp": "2026-01-30T23:20:19.280937Z" }