{
  "benchmark": "BEAM",
  "status": "internal_evidence",
  "dataset": {
    "source": "BEAM (100K split)",
    "license": null,
    "vendored": false
  },
  "run": {
    "commit": null,
    "packageVersion": null,
    "command": "eval:phase-63-live-closure (run-phase63-beam-100k-live-closure-gpt55-evidence-pack-answer-hardening-current)",
    "executionFailures": 0
  },
  "model": {
    "answerModel": "gpt-5.5",
    "judgeModel": "gpt-5.5",
    "sameModelJudge": true
  },
  "metrics": {
    "primary": "answer accuracy (100K, 400 cases)",
    "score": 0.695,
    "baseline": 0.56
  },
  "coverage": {
    "complete": true,
    "note": "fitted evidence-chat recall 0.9621 vs generalization floor 0.6822; baseline 0.56 is the internal no-pack ablation, not an external reference"
  },
  "claimBoundary": {
    "publicClaimAllowed": false,
    "reason": "Pipeline/gate evidence only. Blocked: same-model judge (gpt-5.5 answer + gpt-5.5 judge); fitted-vs-generalization recall gap (0.9621 vs 0.6822); below the 0.75+ improvement target; license/commit/package-version unverified. P67-D must raise answer accuracy to >=0.75 with an independent judge and always report fitted AND generalization recall."
  }
}