{ "benchmark": "LoCoMo", "status": "paused_boundary", "dataset": { "source": "https://github.com/snap-research/locomo", "license": "CC BY-NC 4.0", "vendored": false }, "run": { "commit": "2b2ec71", "packageVersion": null, "command": "eval:phase-65-smoke (representative conv-1, 199 questions, live --evidence-pack)", "executionFailures": 0 }, "model": { "answerModel": "gpt-5.5", "judgeModel": null, "sameModelJudge": false }, "metrics": { "primary": "answer accuracy (representative conv-1, 199 questions)", "score": 0.02, "baseline": null }, "coverage": { "complete": false, "note": "representative single-conversation pressure run, not the full benchmark; this is a banked retrieval-boundary finding, not a performance result" }, "claimBoundary": { "publicClaimAllowed": false, "reason": "Paused at a retrieval boundary. Deterministic scoring (no judge), but answer accuracy 0.020 with exact gold-turn recall ~0.07-0.08 and zero-retrieval ~0.92; positional windows, query expansion, and LLM turn-captioning were all ruled out under the lexical/rules substrate. Not claimable; reopen only as P65-R003 when a real neural embedding endpoint is available. CC BY-NC 4.0 is non-commercial." } }