{ "webvoyager": [ { "rank": 1, "systemName": "Alumnium", "organization": "Alumnium", "scoreDisplay": "98.5%", "scoreValue": 98.5, "sourceUrl": "https://alumnium.ai/blog/webvoyager-benchmark/", "repoUrl": "https://github.com/alumnium-hq/alumnium", "notesShort": "Claude Code orchestrating Alumnium MCP with GPT-5 Nano + Selenium; accessibility-tree parsing with visual reasoning.", "reportedAt": "2026-03", "isNew": true }, { "rank": 2, "systemName": "Surfer 2", "organization": "H Company", "scoreDisplay": "97.1%", "scoreValue": 97.1, "sourceUrl": "https://hcompany.ai/surfer-2", "notesShort": "System-level orchestration with submitter-defined setup details.", "reportedAt": "2025-10" }, { "rank": 3, "systemName": "Magnitude", "organization": "Magnitude", "scoreDisplay": "93.9%", "scoreValue": 93.9, "sourceUrl": "https://magnitude.run/webvoyager", "repoUrl": "https://github.com/magnitudedev/browser-agent", "notesShort": "Open-source architecture utilizing a modular agentic stack.", "reportedAt": "2025-07" }, { "rank": 4, "systemName": "Surfer-H + Holo1", "organization": "H Company", "scoreDisplay": "92.2%", "scoreValue": 92.2, "sourceUrl": "https://arxiv.org/pdf/2506.02865", "repoUrl": "https://github.com/hcompai/surfer-h-cli", "notesShort": "Multi-modal action kernels integrated via H-Company research.", "reportedAt": "2025-06" }, { "rank": 5, "systemName": "Browserable", "organization": "Browserable", "scoreDisplay": "90.4%", "scoreValue": 90.4, "sourceUrl": "https://www.browserable.ai/blog/web-voyager-benchmark", "repoUrl": "https://github.com/browserable/browserable", "notesShort": "Fine-tuned browser control models within a commercial framework.", "reportedAt": "2025-04" },
{ "rank": 6, "systemName": "Browser Use", "organization": "Browser Use", "scoreDisplay": "89.1%", "scoreValue": 89.1, "sourceUrl": "https://browser-use.com/posts/sota-technical-report", "repoUrl": "https://github.com/browser-use/browser-use", "notesShort": "Multi-step orchestration framework for open-source automation.", "reportedAt": "2024-12" }, { "rank": 7, "systemName": "GLM-5V-Turbo", "organization": "Z.ai", "scoreDisplay": "88.5%", "scoreValue": 88.5, "sourceUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo", "notesShort": "Multimodal vision model optimized for GUI automation and coding tasks.", "reportedAt": "2026-04", "isNew": true }, { "rank": 8, "systemName": "Agent Kura", "organization": "Kura", "scoreDisplay": "87.0%", "scoreValue": 87.0, "sourceUrl": "https://www.trykura.com/benchmarks", "notesShort": "602/643 tasks (41 removed for invalid/auth issues); reported on trykura.com.", "reportedAt": "2024-11" }, { "rank": 8, "systemName": "Operator", "organization": "OpenAI", "scoreDisplay": "87%", "scoreValue": 87, "sourceUrl": "https://openai.com/index/introducing-operator/", "notesShort": "Native browser integration using proprietary vision-control models.", "reportedAt": "2025-01" }, { "rank": 10, "systemName": "Notte", "organization": "Notte", "scoreDisplay": "86.2%", "scoreValue": 86.2, "sourceUrl": "https://github.com/nottelabs/open-operator-evals", "repoUrl": "https://github.com/nottelabs/notte", "notesShort": "Self-reported score from Notte's open-operator-evals harness (79.0% under LLM evaluation).", "reportedAt": "2025-04" }, { "rank": 11, "systemName": "Skyvern 2.0", "organization": "Skyvern", "scoreDisplay": "85.85%", "scoreValue": 85.85, "sourceUrl": "https://www.skyvern.com/blog/skyvern-2-0-state-of-the-art-web-navigation-with-85-8-on-webvoyager-eval", "repoUrl": "https://github.com/Skyvern-AI/skyvern", "notesShort": "DOM-level reasoning coupled with real-time error-correction.", "reportedAt": "2025-01" }, { "rank": 12, "systemName": "Project Mariner", "organization": "Google", "scoreDisplay": "83.5%", "scoreValue": 83.5, "sourceUrl": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/", "notesShort": "Gemini-powered reasoning with precise visual grounding.", "reportedAt": "2024-12" },
{ "rank": 13, "systemName": "Agent-E", "organization": "Emergence AI", "scoreDisplay": "73.2%", "scoreValue": 73.2, "sourceUrl": "https://arxiv.org/abs/2407.13032", "repoUrl": "https://github.com/EmergenceAI/Agent-E", "notesShort": "Hierarchical planning modules within a multi-agent framework.", "reportedAt": "2024-07" }, { "rank": 14, "systemName": "WebSight", "organization": "Academic Research", "scoreDisplay": "68%", "scoreValue": 68, "sourceUrl": "https://arxiv.org/abs/2508.16987", "notesShort": "Navigation system prioritizing visual-only perceptual inputs.", "reportedAt": "2025-08" }, { "rank": 15, "systemName": "Runner H 0.1", "organization": "H Company", "scoreDisplay": "67%", "scoreValue": 67, "sourceUrl": "https://www.hcompany.ai/blog/a-research-update", "notesShort": "Foundational agent architecture for general web interaction.", "reportedAt": "2025-03" }, { "rank": 16, "systemName": "WebVoyager", "organization": "Academic Research", "scoreDisplay": "59.1%", "scoreValue": 59.1, "sourceUrl": "https://arxiv.org/abs/2401.13919", "repoUrl": "https://github.com/MinorJerry/WebVoyager", "notesShort": "Baseline implementation using standard multimodal LLM control.", "reportedAt": "2024-01" }, { "rank": 17, "systemName": "Anthropic Computer Use 3.5", "organization": "Anthropic", "scoreDisplay": "56.0%", "scoreValue": 56.0, "sourceUrl": "https://www.trykura.com/benchmarks", "notesShort": "Sampled 50/602 tasks for direct comparison; reported on trykura.com.", "reportedAt": "2024-11" }, { "rank": 18, "systemName": "WILBUR", "organization": "Bardeen / UC Berkeley", "scoreDisplay": "53%", "scoreValue": 53, "sourceUrl": "https://arxiv.org/abs/2404.05902", "notesShort": "Research implementation using black-box optimization techniques.", "reportedAt": "2024-04" },
{ "rank": 19, "systemName": "GPT-4 (All Tools)", "organization": "OpenAI", "scoreDisplay": "30.8%", "scoreValue": 30.8, "sourceUrl": "https://arxiv.org/abs/2401.13919", "notesShort": "ChatGPT integrated tool baseline from original WebVoyager paper; reported on arxiv.org.", "reportedAt": "2024-01" } ] }