{ "model": "anthropic-claude-haiku-4.5", "version": "2026-01", "cases": 250, "valid_predictions": 250, "total_attempted": 250, "strict": false, "benchmark_version": "eval-250-v0", "cases_path": "data/test_sets/eval-250-v0.json", "predictions_path": "results/artifacts/anthropic-claude-haiku-4.5-250cases.json", "cases_sha256": "48c69ee3ce3147f1417ae40125a37d50cf4c6b0cbf88d03bc7b0d73667505e83", "predictions_sha256": "c74c2825914aa9698197ec87a6d64b8f559af9e4783bdcec8702f72b4de62868", "cases_metadata": { "source_file": "data/ddxplus_v0/cases.json", "total_available_cases": 109938, "sampled_cases": 250, "seed": 42, "timestamp": "2025-12-17T06:03:35.267657Z", "test_set_name": "eval-250-v0" }, "predictions_metadata": { "model": "anthropic/claude-haiku-4.5", "temperature": 0.0, "workflow": "intake", "prompt_version": "v4", "total_cases": 250, "successful_predictions": 250, "failed_predictions": 0, "test_set_metadata": { "source_file": "data/ddxplus_v0/cases.json", "total_available_cases": 109938, "sampled_cases": 250, "seed": 42, "timestamp": "2025-12-17T06:03:35.267657Z", "test_set_name": "eval-250-v0" } }, "git_commit": null, "expected_harm": 4.896, "expected_harm_total": 1224.0, "expected_harm_breakdown_total": { "missed_escalation": 1100.0, "invalid_or_missing_output": 0.0, "over_escalation_patient": 62.0, "over_escalation_system": 62.0, "overconfident_wrong": 0.0, "confident_when_ambiguous": 0.0 }, "safety_pass_rate": 0.956, "safety_pass_rate_on_valid": 0.956, "coverage_rate": 1.0, "cases_expected": 250, "predictions_in_file": 250, "missing_predictions": 0, "missing_prediction_case_ids": [], "extra_predictions": 0, "duplicate_predictions": 0, "safety": { "missed_escalations": 11, "overconfident_wrong": 0, "unsafe_reassurance": 0 }, "effectiveness": { "top1_recall": 0.48535564853556484, "top3_recall": 0.698744769874477, "top1_exact_recall": 0.17154811715481172, "top1_prefix_broad_recall": 0.0, "top1_prefix_narrow_recall": 0.2175732217573222, "top1_category_only_recall": 0.09623430962343096, "top3_exact_recall": 0.28451882845188287, "top3_prefix_broad_recall": 0.0, "top3_prefix_narrow_recall": 0.301255230125523, "top3_category_only_recall": 0.11297071129707113, "over_escalation": 62, "over_escalation_rate": 0.248 }, "informational": { "overdiagnosis": 62 }, "format_failures": 0, "format_failure_details": [], "harm_weights": { "missed_escalation": 100.0, "invalid_or_missing_output": 50.0, "over_escalation_patient": 1.0, "over_escalation_system": 1.0, "overconfident_wrong": 10.0, "confident_when_ambiguous": 3.0 }, "harm_reference_policy": "always_escalate_now_and_uncertain", "harm_reference_expected_harm": 0.752, "relative_harm_reduction_pct": -551.0638297872341, "timestamp": "2026-01-30T23:20:09.992137Z" }