{ "model_name": "Gemini 3 Pro", "model_organization": "Google", "submitting_organization": "Sierra", "submission_date": "2026-03-02", "contact_info": { "email": "victor@sierra.ai, ben.s@sierra.ai", "name": "Sierra Research Team" }, "is_new": false, "submission_type": "standard", "trajectories_available": true, "trajectory_files": { "airline": "geminipro-airline.json", "retail": "geminipro-retail.json", "telecom": "geminipro-telecom.json", "banking_knowledge": "geminipro-terminal.json" }, "references": [], "results": { "airline": { "pass_1": 80.5, "pass_2": 74.67, "pass_3": 70.0, "pass_4": 66.0, "cost": null }, "retail": { "pass_1": 75.88, "pass_2": 63.45, "pass_3": 54.39, "pass_4": 47.37, "cost": null }, "telecom": { "pass_1": 91.01, "pass_2": 84.5, "pass_3": 79.17, "pass_4": 74.56, "cost": null }, "banking_knowledge": { "pass_1": 15.72, "pass_2": 8.08, "pass_3": 5.41, "pass_4": 4.12, "cost": null, "retrieval_config": "terminal" } }, "reasoning_effort": "high", "methodology": { "evaluation_date": "2026-03-02", "tau2_bench_version": "0.2.1-dev", "user_simulator": "gpt-5.2", "notes": "Evaluated using vertex_ai/gemini-3-pro-preview with reasoning_effort: high. User simulator: gpt-5.2 with reasoning_effort: low. 4 trials. Infrastructure errors treated as failures (reward 0): 3 in airline, 20 in retail, 2 in telecom. Banking domain had 42 simulations terminated due to max_steps (all scored as 0). Banking domain evaluated with terminal-based agentic search retrieval (terminal_use).", "verification": { "modified_prompts": false, "omitted_questions": false, "details": "Standard tau-bench scaffold." } }, "model_release": { "release_date": "2025-11-18", "announcement_url": "https://blog.google/products/gemini/gemini-3/", "announcement_title": "Gemini 3: Introducing the latest Gemini AI model from Google" } }