{ "model_name": "Claude Opus 4.5", "model_organization": "Anthropic", "submitting_organization": "Sierra", "submission_date": "2026-02-26", "contact_info": { "email": "victor@sierra.ai, ben.s@sierra.ai", "name": "Sierra Research Team" }, "is_new": false, "submission_type": "standard", "trajectories_available": true, "trajectory_files": { "airline": "claude-opus-4-5_high_airline_gpt-5.2_4trials.json", "retail": "claude-opus-4-5_high_retail_gpt-5.2_4trials.json", "telecom": "claude-opus-4-5_high_telecom_gpt-5.2_4trials.json", "banking_knowledge": "claude-opus-4-5_high_banking_knowledge_gpt-5.2_4trials.json" }, "references": [], "results": { "airline": { "pass_1": 84.0, "pass_2": 77.67, "pass_3": 73.5, "pass_4": 70.0, "cost": 0.39919 }, "retail": { "pass_1": 79.61, "pass_2": 67.4, "pass_3": 58.77, "pass_4": 51.75, "cost": 0.38695 }, "telecom": { "pass_1": 92.32, "pass_2": 86.11, "pass_3": 81.36, "pass_4": 78.07, "cost": 0.71992 }, "banking_knowledge": { "pass_1": 21.39, "pass_2": 13.4, "pass_3": 10.31, "pass_4": 8.25, "cost": null, "retrieval_config": "alltools" } }, "reasoning_effort": "high", "methodology": { "evaluation_date": "2026-05-05", "tau2_bench_version": "0.2.1-dev", "user_simulator": "gpt-5.2", "notes": "Evaluated using AllTools retrieval (BM25 + dense OpenAI text-embedding-3-large + sandboxed shell). User simulator: gpt-5.2 with reasoning_effort: low. 4 trials. Seed: 300. AllTools retrieval applies to the banking_knowledge domain only — the other domains are intentionally excluded from this comparison; the AllTools setting standardizes retrieval across models.", "verification": { "modified_prompts": false, "omitted_questions": false, "details": "Verified evaluation with full trajectory data available. Standard tau-bench scaffold." } }, "model_release": { "release_date": "2025-11-24", "announcement_url": "https://www.anthropic.com/news/claude-opus-4-5", "announcement_title": "Introducing Claude Opus 4.5" } }