{ "model_name": "Qwen3.5-397B-A17B", "model_organization": "Alibaba Cloud", "submitting_organization": "Sierra", "submission_date": "2026-03-02", "contact_info": { "email": "victor@sierra.ai, ben.s@sierra.ai", "name": "Sierra Research Team" }, "is_new": false, "submission_type": "standard", "trajectories_available": true, "trajectory_files": { "airline": "qwen3.5-397b-a17b_enabled_airline_gpt-5.2_4trials.json", "retail": "qwen3.5-397b-a17b_enabled_retail_gpt-5.2_4trials.json", "telecom": "qwen3.5-397b-a17b_enabled_telecom_gpt-5.2_4trials.json", "banking_knowledge": "qwen3.5-397b-a17b_enabled_banking_knowledge_gpt-5.2_4trials.json" }, "references": [], "results": { "airline": { "pass_1": 81.5, "pass_2": 75.67, "pass_3": 71.5, "pass_4": 68.0, "cost": null }, "retail": { "pass_1": 84.43, "pass_2": 74.42, "pass_3": 66.45, "pass_4": 59.65, "cost": null }, "telecom": { "pass_1": 97.81, "pass_2": 95.76, "pass_3": 93.86, "pass_4": 92.11, "cost": null }, "banking_knowledge": { "pass_1": 9.79, "pass_2": 6.36, "pass_3": 5.41, "pass_4": 5.15, "cost": null, "retrieval_config": "text-emb-3-large" } }, "reasoning_effort": "enabled", "methodology": { "evaluation_date": "2026-02-27", "tau2_bench_version": "0.2.1-dev", "user_simulator": "gpt-5.2", "notes": "Evaluated using Qwen3.5-397B-A17B with thinking enabled. User simulator: gpt-5.2 with reasoning_effort: low. 4 trials. Seed: 300. Temperature: 1.0, top_p: 0.95. Self-hosted via vLLM.", "verification": { "modified_prompts": false, "omitted_questions": false, "details": "Standard tau-bench scaffold. Clean 4-trial runs with no infrastructure errors across all domains." } }, "model_release": { "release_date": "2026-02-16", "announcement_url": "https://www.alibabacloud.com/blog/qwen3-5-towards-native-multimodal-agents_602894", "announcement_title": "Qwen3.5: Towards Native Multimodal Agents" } }