{
  "model_name": "Gemini 3 Flash",
  "model_organization": "Google",
  "submitting_organization": "Sierra",
  "submission_date": "2026-03-02",
  "contact_info": {
    "email": "victor@sierra.ai, ben.s@sierra.ai",
    "name": "Sierra Research Team"
  },
  "is_new": false,
  "submission_type": "standard",
  "trajectories_available": true,
  "trajectory_files": {
    "airline": "geminiflash-airline.json",
    "retail": "geminiflash-retail.json",
    "telecom": "geminiflash-telecom.json",
    "banking_knowledge": "geminiflash-terminal.json"
  },
  "references": [],
  "results": {
    "airline": {
      "pass_1": 82.5,
      "pass_2": 76.33,
      "pass_3": 72.0,
      "pass_4": 68.0,
      "cost": null
    },
    "retail": {
      "pass_1": 76.75,
      "pass_2": 65.94,
      "pass_3": 57.89,
      "pass_4": 51.75,
      "cost": null
    },
    "telecom": {
      "pass_1": 91.23,
      "pass_2": 83.48,
      "pass_3": 76.54,
      "pass_4": 70.18,
      "cost": null
    },
    "banking_knowledge": {
      "pass_1": 20.62,
      "pass_2": 10.31,
      "pass_3": 6.44,
      "pass_4": 4.12,
      "cost": null,
      "retrieval_config": "terminal"
    }
  },
  "reasoning_effort": "high",
  "methodology": {
    "evaluation_date": "2026-03-02",
    "tau2_bench_version": "0.2.1-dev",
    "user_simulator": "gpt-5.2",
    "notes": "Evaluated using vertex_ai/gemini-3-flash-preview with reasoning_effort: high. User simulator: gpt-5.2 with reasoning_effort: low. 4 trials. Seed: 123. Telecom domain had 20 simulations terminated due to too_many_errors (all scored as 0). Banking domain had 52 simulations terminated due to max_steps (all scored as 0). Banking domain evaluated with terminal-based agentic search retrieval (terminal_use).",
    "verification": {
      "modified_prompts": false,
      "omitted_questions": false,
      "details": "Standard tau-bench scaffold."
    }
  },
  "model_release": {
    "release_date": "2025-12-17",
    "announcement_url": "https://blog.google/products/gemini/gemini-3-flash/",
    "announcement_title": "Gemini 3 Flash: frontier intelligence built for speed"
  }
}