rank,model,org,reasoning,avg_score,green_rate,red_rate,score_2,score_1,score_0,nonsense_count,error_count 1,anthropic/claude-sonnet-4.6@reasoning=high,anthropic,high,1.87,0.91,0.03,91,6,3,100,0 2,anthropic/claude-sonnet-4.6@reasoning=none,anthropic,none,1.86,0.89,0.02,89,9,2,100,0 3,anthropic/claude-opus-4.5@reasoning=high,anthropic,high,1.8433,0.9,0.02,90,8,2,100,0 4,anthropic/claude-opus-4.6@reasoning=high,anthropic,high,1.8367,0.87,0.03,87,10,3,100,0 5,anthropic/claude-opus-4.6@reasoning=none,anthropic,none,1.7933,0.83,0.03,83,14,3,100,0 6,qwen/qwen3.5-397b-a17b@reasoning=high,qwen,high,1.7033,0.78,0.05,78,17,5,100,0 7,anthropic/claude-sonnet-4.5@reasoning=high,anthropic,high,1.6767,0.79,0.08,79,13,8,100,0 8,anthropic/claude-opus-4.5@reasoning=none,anthropic,none,1.67,0.79,0.11,79,10,11,100,0 9,anthropic/claude-haiku-4.5@reasoning=high,anthropic,high,1.64,0.77,0.11,77,12,11,100,0 10,anthropic/claude-haiku-4.5@reasoning=none,anthropic,none,1.6267,0.71,0.08,71,21,8,100,0 11,anthropic/claude-sonnet-4.5@reasoning=none,anthropic,none,1.59,0.74,0.13,74,13,13,100,0 12,qwen/qwen3.5-397b-a17b@reasoning=none,qwen,none,1.5267,0.69,0.15,69,16,15,100,0 13,x-ai/grok-4.20-multi-agent-beta@reasoning=xhigh,x-ai,xhigh,1.4333,0.64,0.2,64,16,20,100,0 14,x-ai/grok-4.20-multi-agent-beta@reasoning=low,x-ai,low,1.43,0.67,0.19,67,14,19,100,0 15,x-ai/grok-4.20-beta@reasoning=xhigh,x-ai,xhigh,1.27,0.54,0.25,54,21,25,100,0 16,x-ai/grok-4.20-beta@reasoning=low,x-ai,low,1.2567,0.56,0.26,56,18,26,100,0 17,openai/gpt-5.4@reasoning=none,openai,none,1.25,0.48,0.16,48,36,16,100,0 18,moonshotai/kimi-k2.5@reasoning=none,moonshotai,none,1.2267,0.52,0.28,52,20,28,100,0 19,openai/gpt-5.4@reasoning=xhigh,openai,xhigh,1.1667,0.42,0.2,42,38,20,100,0 20,anthropic/claude-3.7-sonnet:thinking@reasoning=default,anthropic,default,1.1433,0.49,0.34,49,17,34,100,0 21,openai/gpt-5.2-codex@reasoning=low,openai,low,1.1367,0.45,0.29,45,26,29,100,0 22,google/gemini-3-pro-preview@reasoning=low,google,low,1.1267,0.48,0.37,48,15,37,100,0 23,anthropic/claude-opus-4.1@reasoning=none,anthropic,none,1.1233,0.43,0.29,43,28,29,100,0 24,anthropic/claude-opus-4.1@reasoning=high,anthropic,high,1.1167,0.42,0.29,42,29,29,100,0 25,anthropic/claude-3.5-haiku@reasoning=default,anthropic,default,1.1067,0.5,0.35,50,15,35,100,0 26,openai/gpt-5.2@reasoning=none,openai,none,1.1,0.38,0.23,38,39,23,100,0 27,anthropic/claude-3.5-sonnet@reasoning=default,anthropic,default,1.0633,0.45,0.36,45,19,36,100,0 28,openai/gpt-5.3-chat@reasoning=default,openai,default,1.06,0.4,0.31,40,29,31,100,0 29,openai/gpt-5.1-chat@reasoning=default,openai,default,1.0533,0.45,0.36,45,19,36,100,0 30,google/gemini-3.1-pro-preview@reasoning=low,google,low,1.0467,0.37,0.31,37,32,31,100,0 31,openai/gpt-5.4-mini@reasoning=xhigh,openai,xhigh,1.0467,0.31,0.22,31,47,22,100,0 32,anthropic/claude-3.7-sonnet@reasoning=default,anthropic,default,1.0267,0.43,0.38,43,19,38,100,0 33,openrouter/hunter-alpha@reasoning=none,stealth,none,1.02,0.43,0.38,43,19,38,100,0 34,openai/gpt-5.2@reasoning=high,openai,high,1.02,0.28,0.22,28,50,22,100,0 35,openai/gpt-5-codex@reasoning=default,openai,default,1.0067,0.39,0.35,39,26,35,100,0 36,openai/gpt-5.2-codex@reasoning=xhigh,openai,xhigh,1.0067,0.39,0.33,39,28,33,100,0 37,anthropic/claude-opus-4@reasoning=default,anthropic,default,0.99,0.34,0.3,34,36,30,100,0 38,openai/gpt-5.4-mini@reasoning=high,openai,high,0.9833,0.32,0.31,32,37,31,100,0 39,openai/gpt-5.2-codex@reasoning=high,openai,high,0.9733,0.37,0.36,37,27,36,100,0 40,moonshotai/kimi-k2.5@reasoning=high,moonshotai,high,0.9533,0.31,0.34,31,35,34,100,0 41,google/gemini-3-pro-preview@reasoning=high,google,high,0.9433,0.36,0.41,36,23,41,100,0 42,openai/gpt-5.4-mini@reasoning=none,openai,none,0.9433,0.32,0.32,32,36,32,100,0 43,openrouter/healer-alpha@reasoning=none,stealth,none,0.9167,0.37,0.43,37,20,43,100,0 44,openai/gpt-5.1@reasoning=default,openai,default,0.8767,0.25,0.31,25,44,31,100,0 45,anthropic/claude-sonnet-4@reasoning=none,anthropic,none,0.87,0.29,0.42,29,29,42,100,0 46,google/gemini-3.1-pro-preview@reasoning=high,google,high,0.8667,0.31,0.46,31,23,46,100,0 47,openrouter/hunter-alpha@reasoning=xhigh,stealth,xhigh,0.8633,0.35,0.45,35,20,45,100,0 48,anthropic/claude-sonnet-4@reasoning=high,anthropic,high,0.8533,0.3,0.43,30,27,43,100,0 49,openai/gpt-5.1-codex@reasoning=default,openai,default,0.8433,0.32,0.43,32,25,43,100,0 50,meta-llama/llama-4-maverick@reasoning=default,meta,default,0.8333,0.28,0.42,28,30,42,100,0 51,openai/gpt-5.2-chat@reasoning=default,openai,default,0.83,0.27,0.4,27,33,40,100,0 52,openai/gpt-5.3-codex@reasoning=low,openai,low,0.83,0.24,0.37,24,39,37,100,0 53,openai/gpt-5@reasoning=default,openai,default,0.82,0.21,0.37,21,42,37,100,0 54,z-ai/glm-5@reasoning=high,z-ai,high,0.7267,0.28,0.53,28,19,53,100,0 55,openai/gpt-5.3-codex@reasoning=high,openai,high,0.69,0.2,0.46,20,34,46,100,0 56,openai/gpt-5.3-codex@reasoning=xhigh,openai,xhigh,0.6767,0.19,0.5,19,31,50,100,0 57,openai/o3@reasoning=default,openai,default,0.6667,0.26,0.58,26,16,58,100,0 58,openrouter/healer-alpha@reasoning=xhigh,stealth,xhigh,0.6633,0.26,0.57,26,17,57,100,0 59,openai/gpt-5-chat@reasoning=default,openai,default,0.6367,0.18,0.52,18,30,52,100,0 60,google/gemini-2.5-pro@reasoning=default,google,default,0.5967,0.2,0.58,20,22,58,100,0 61,z-ai/glm-5@reasoning=none,z-ai,none,0.5867,0.2,0.58,20,22,58,100,0 62,x-ai/grok-4.1-fast@reasoning=high,x-ai,high,0.5767,0.19,0.59,19,22,59,100,0 63,meta-llama/llama-4-scout@reasoning=default,meta,default,0.5633,0.19,0.61,19,20,61,100,0 64,meta-llama/llama-3.1-8b-instruct@reasoning=default,meta,default,0.5633,0.14,0.61,14,25,61,100,0 65,xiaomi/mimo-v2-flash@reasoning=none,xiaomi,none,0.55,0.16,0.56,16,28,56,100,0 66,google/gemini-2.5-flash@reasoning=default,google,default,0.54,0.19,0.67,19,14,67,100,0 67,openai/gpt-5.4-nano@reasoning=high,openai,high,0.5233,0.14,0.56,14,30,56,100,0 68,openai/gpt-5.4-nano@reasoning=none,openai,none,0.4933,0.13,0.61,13,26,61,100,0 69,openai/gpt-4o-2024-08-06@reasoning=default,openai,default,0.4933,0.12,0.61,12,27,61,100,0 70,google/gemini-2.0-flash-001@reasoning=default,google,default,0.48,0.15,0.66,15,19,66,100,0 71,deepseek/deepseek-v3.2@reasoning=high,deepseek,high,0.46,0.13,0.64,13,23,64,100,0 72,minimax/minimax-m2.5@reasoning=low,minimax,low,0.4433,0.09,0.62,9,29,62,100,0 73,openai/gpt-4.1@reasoning=default,openai,default,0.4333,0.14,0.67,14,19,67,100,0 74,openai/gpt-5.4-nano@reasoning=xhigh,openai,xhigh,0.4167,0.1,0.7,10,20,70,100,0 75,google/gemini-3-flash-preview@reasoning=high,google,high,0.4067,0.1,0.71,10,19,71,100,0 76,bytedance-seed/seed-1.6@reasoning=high,bytedance-seed,high,0.4067,0.07,0.61,7,32,61,100,0 77,google/gemini-3.1-flash-lite-preview@reasoning=default,google,default,0.4033,0.11,0.74,11,15,74,100,0 78,xiaomi/mimo-v2-flash@reasoning=high,xiaomi,high,0.3967,0.13,0.72,13,15,72,100,0 79,bytedance-seed/seed-1.6@reasoning=none,bytedance-seed,none,0.3967,0.11,0.69,11,20,69,100,0 80,openai/gpt-oss-120b@reasoning=low,openai,low,0.3967,0.11,0.67,11,22,67,100,0 81,deepseek/deepseek-v3.2@reasoning=none,deepseek,none,0.3967,0.1,0.69,10,21,69,100,0 82,minimax/minimax-m2.5@reasoning=high,minimax,high,0.3933,0.08,0.66,8,26,66,100,0 83,anthropic/claude-3-haiku@reasoning=default,anthropic,default,0.3433,0.1,0.75,10,15,75,100,0 84,google/gemini-3-flash-preview@reasoning=none,google,none,0.3367,0.1,0.74,10,16,74,100,0 85,x-ai/grok-4.1-fast@reasoning=none,x-ai,none,0.3,0.1,0.8,10,10,80,100,0 86,openai/gpt-oss-120b@reasoning=high,openai,high,0.2967,0.05,0.72,5,23,72,100,0 87,openai/o4-mini@reasoning=high,openai,high,0.2933,0.04,0.71,4,25,71,100,0 88,openai/o4-mini@reasoning=low,openai,low,0.2767,0.08,0.79,8,13,79,100,0 89,baidu/ernie-4.5-300b-a47b@reasoning=default,baidu,default,0.2667,0.04,0.74,4,22,74,100,0 90,prime-intellect/intellect-3@reasoning=low,prime-intellect,low,0.23,0.07,0.82,7,11,82,100,0 91,prime-intellect/intellect-3@reasoning=high,prime-intellect,high,0.19,0.05,0.85,5,10,85,100,0 92,mistralai/mistral-large-2512@reasoning=default,mistralai,default,0.1733,0.02,0.85,2,13,85,100,0 93,openai/gpt-4o-mini-2024-07-18@reasoning=default,openai,default,0.1567,0.02,0.86,2,12,86,100,0 94,google/gemma-3-27b-it@reasoning=default,google,default,0.1467,0.03,0.88,3,9,88,100,0