{ "model_name": "GPT-5.2", "model_organization": "OpenAI", "submitting_organization": "Sierra", "submission_date": "2026-02-26", "contact_info": { "email": "victor@sierra.ai, ben.s@sierra.ai", "name": "Sierra Research Team" }, "is_new": false, "submission_type": "standard", "trajectories_available": true, "trajectory_files": { "airline": "gpt-5.2_high_airline_gpt-5.2_4trials.json", "retail": "gpt-5.2_high_retail_gpt-5.2_4trials.json", "telecom": "gpt-5.2_high_telecom_gpt-5.2_4trials.json", "banking_knowledge": "gpt-5.2_high_banking_knowledge_gpt-5.2_4trials.json" }, "references": [], "results": { "airline": { "pass_1": 83.0, "pass_2": 78.33, "pass_3": 75.0, "pass_4": 72.0, "cost": 0.11379 }, "retail": { "pass_1": 81.58, "pass_2": 69.59, "pass_3": 59.87, "pass_4": 51.75, "cost": 0.07156 }, "telecom": { "pass_1": 89.69, "pass_2": 82.46, "pass_3": 76.75, "pass_4": 71.93, "cost": 0.09312 }, "banking_knowledge": { "pass_1": 24.74, "pass_2": 16.49, "pass_3": 14.43, "pass_4": 11.34, "cost": null, "retrieval_config": "alltools" } }, "reasoning_effort": "high", "methodology": { "evaluation_date": "2026-05-05", "tau2_bench_version": "0.2.1-dev", "user_simulator": "gpt-5.2", "notes": "Evaluated using AllTools retrieval (BM25 + dense OpenAI text-embedding-3-large + sandboxed shell). User simulator: gpt-5.2 with reasoning_effort: low. 4 trials. Seed: 300. AllTools retrieval applies to the banking_knowledge domain only — the other domains are intentionally excluded from this retrieval comparison; the AllTools setting standardizes retrieval across models.", "verification": { "modified_prompts": false, "omitted_questions": false, "details": "Verified evaluation with full trajectory data available. Standard tau-bench scaffold." } }, "model_release": { "release_date": "2025-12-11", "announcement_url": "https://openai.com/index/introducing-gpt-5-2/", "announcement_title": "Introducing GPT-5.2" } }