{ "model_name": "Claude Sonnet 4.5", "model_organization": "Anthropic", "submitting_organization": "Sierra", "submission_date": "2026-02-26", "contact_info": { "email": "victor@sierra.ai, ben.s@sierra.ai", "name": "Sierra Research Team" }, "is_new": false, "submission_type": "standard", "trajectories_available": true, "trajectory_files": { "airline": "claude-sonnet-4-5_enabled_airline_gpt-5.2_4trials.json", "retail": "claude-sonnet-4-5_enabled_retail_gpt-5.2_4trials.json", "telecom": "claude-sonnet-4-5_enabled_telecom_gpt-5.2_4trials.json", "banking_knowledge": "claude-sonnet-4-5_enabled_banking_knowledge_gpt-5.2_4trials.json" }, "references": [], "results": { "airline": { "pass_1": 72.0, "pass_2": 62.0, "pass_3": 54.5, "pass_4": 48.0, "cost": 0.29646 }, "retail": { "pass_1": 72.37, "pass_2": 57.46, "pass_3": 47.37, "pass_4": 39.47, "cost": 0.25494 }, "telecom": { "pass_1": 84.87, "pass_2": 75.44, "pass_3": 68.86, "pass_4": 64.04, "cost": 0.48769 }, "banking_knowledge": { "pass_1": 22.42, "pass_2": 14.09, "pass_3": 11.86, "pass_4": 10.31, "cost": 4.04814, "retrieval_config": "terminal" } }, "reasoning_effort": "enabled", "methodology": { "evaluation_date": "2026-02-24", "tau2_bench_version": "0.2.1-dev", "user_simulator": "gpt-5.2", "notes": "Evaluated using Claude Sonnet 4.5 (claude-sonnet-4-5-20250929) with extended thinking enabled. User simulator: gpt-5.2 with reasoning_effort: low. 4 trials. Seed: 300. Banking domain evaluated with terminal-based agentic search retrieval.", "verification": { "modified_prompts": false, "omitted_questions": false, "details": "Verified evaluation with full trajectory data available. Standard tau-bench scaffold." } }, "model_release": { "release_date": "2025-09-29", "announcement_url": "https://www.anthropic.com/news/claude-sonnet-4-5", "announcement_title": "Introducing Claude Sonnet 4.5" } }