model,AMPS_Hard,code_completion,code_generation,connections,consecutive_events,integrals_with_game,javascript,logic_with_navigation,math_comp,olympiad,paraphrase,plot_unscrambling,python,simplify,spatial,story_generation,summarize,tablejoin,tablereformat,theory_of_mind,typescript,typos,zebra_puzzle arcee-trinity-large-preview,61.0,60.87,70.423,34.833,6.556,8.0,0.0,2.0,47.059,63.669,12.55,31.617,10.0,11.0,46.0,10.6,14.617,20.308,94.118,26.923,0.0,60.0,7.5 claude-4-1-opus-20250805-base,89.0,76.087,76.056,86.0,2.223,10.0,45.0,42.0,65.686,86.619,19.15,60.257,75.0,25.583,62.0,31.017,27.917,37.827,96.078,48.077,40.0,84.0,11.5 claude-4-1-opus-20250805-thinking-32k,93.0,76.087,73.239,93.833,4.641,26.0,45.0,70.0,85.294,88.483,35.35,60.458,70.0,44.4,90.0,44.683,45.183,44.25,98.039,67.308,30.0,64.0,62.0 claude-4-sonnet-20250514-base,84.0,82.609,78.873,74.833,2.833,6.0,25.0,28.0,70.588,80.841,16.983,54.209,65.0,22.183,60.0,24.75,26.8,35.25,94.118,57.692,25.0,84.0,13.0 claude-4-sonnet-20250514-thinking-64k,86.0,76.087,78.873,82.833,24.21,27.0,30.0,66.0,85.294,83.702,41.35,55.898,60.0,46.0,96.0,48.733,41.267,41.635,98.039,61.538,30.0,80.0,52.5 claude-haiku-4-5-20251001,88.0,73.913,70.423,71.333,2.988,4.0,30.0,18.0,68.627,71.259,15.483,37.804,45.0,18.95,58.0,16.417,20.167,38.269,94.118,50.0,25.0,62.0,9.75 claude-haiku-4-5-20251001-thinking-64k,97.0,69.565,76.056,84.833,39.232,38.0,40.0,54.0,95.098,80.03,42.65,42.53,65.0,45.367,90.0,57.267,53.833,42.577,96.078,44.231,20.0,72.0,58.5 claude-opus-4-5-20251101-high-effort,90.0,73.913,81.69,91.0,1.951,8.0,55.0,62.0,80.392,87.388,22.917,62.275,75.0,25.2,80.0,25.267,32.983,38.808,96.078,59.615,55.0,78.0,18.0 claude-opus-4-5-20251101-low-effort,86.0,76.087,80.282,86.0,1.749,7.0,45.0,50.0,76.471,86.942,23.5,61.536,65.0,30.6,74.0,27.5,34.217,32.712,98.039,50.0,40.0,84.0,17.25 claude-opus-4-5-20251101-medium-effort,88.0,73.913,83.099,89.667,2.239,7.0,65.0,58.0,83.333,86.938,22.817,62.303,70.0,27.317,82.0,31.733,30.567,36.346,98.039,53.846,55.0,84.0,19.0 claude-opus-4-5-20251101-thinking-64k-high-effort,99.0,80.435,78.873,99.333,79.363,78.0,50.0,68.0,95.098,89.458,65.667,66.45,85.0,54.95,96.0,65.85,63.717,45.923,98.039,78.846,55.0,78.0,77.5 claude-opus-4-5-20251101-thinking-64k-low-effort,85.0,76.087,78.873,99.333,27.144,44.0,45.0,68.0,90.196,87.415,45.767,60.92,70.0,52.767,84.0,54.783,49.933,33.885,98.039,55.769,35.0,82.0,74.0 claude-opus-4-5-20251101-thinking-64k-medium-effort,98.0,76.087,78.873,98.333,78.026,62.0,55.0,74.0,92.157,88.805,62.817,62.718,70.0,56.217,98.0,66.8,56.233,40.981,98.039,73.077,50.0,80.0,83.75 claude-opus-4-6-thinking-auto-high-effort,97.0,76.087,80.282,99.333,63.467,73.0,60.0,80.0,95.098,92.17,62.817,66.476,80.0,59.15,98.0,66.133,65.15,48.173,98.039,82.692,45.0,84.0,94.0 claude-sonnet-4-5-20250929,91.0,76.087,76.056,80.667,6.341,8.0,45.0,32.0,68.627,82.852,19.633,57.328,80.0,19.367,70.0,28.183,26.9,36.635,98.039,51.923,20.0,90.0,15.25 claude-sonnet-4-5-20250929-thinking-64k,97.0,80.435,80.282,91.333,27.379,47.0,40.0,68.0,90.196,83.058,53.417,62.023,75.0,47.867,96.0,58.4,53.717,45.481,98.039,78.846,45.0,76.0,67.5 claude-sonnet-4-6-thinking-auto-high-effort,78.0,78.261,81.69,97.333,82.9,87.0,50.0,76.0,93.137,87.998,61.833,59.747,70.0,60.933,100.0,64.867,68.033,49.192,96.078,75.0,50.0,76.0,94.5 claude-sonnet-4-6-thinking-auto-medium-effort,76.0,78.261,80.282,99.333,92.683,90.0,50.0,70.0,94.118,87.858,59.4,56.975,75.0,62.933,100.0,68.9,61.65,45.077,96.078,73.077,55.0,72.0,96.0 claude-sonnet-4-6-thinking-auto-low-effort,75.0,73.913,74.648,96.833,88.391,72.0,60.0,68.0,89.216,85.312,43.9,53.327,75.0,46.733,96.0,59.033,56.867,37.404,98.039,55.769,55.0,64.0,90.0 claude-opus-4-7-high-effort,98.0,80.435,85.915,89.333,89.46,74.0,60.0,68.0,94.118,91.054,52.983,58.102,70.0,48.233,100.0,58.117,58.667,44.308,98.039,67.308,55.0,76.0,100.0 claude-opus-4-7-xhigh-effort,98.0,78.261,85.915,92.333,89.521,85.0,55.0,70.0,98.039,91.376,60.067,61.409,65.0,54.8,100.0,60.617,61.883,47.231,98.039,80.769,60.0,80.0,100.0 claude-opus-4-7-low-effort,99.0,78.261,74.648,87.833,87.619,31.0,60.0,66.0,87.255,88.107,45.883,59.837,65.0,46.083,90.0,48.2,44.283,41.135,98.039,53.846,55.0,76.0,89.5 claude-opus-4-7-medium-effort,99.0,78.261,81.69,89.333,86.326,63.0,55.0,66.0,93.137,90.38,50.217,60.256,70.0,46.233,96.0,50.217,56.8,42.885,98.039,65.385,45.0,80.0,92.75 claude-opus-4-8-high-effort,96.0,73.913,78.873,100.0,89.034,46.0,65.0,80.0,95.098,90.903,62.233,64.932,70.0,59.0,100.0,64.417,67.8,47.058,98.039,80.769,45.0,80.0,98.0 claude-opus-4-8-low-effort,98.0,78.261,74.648,97.0,85.614,38.0,35.0,70.0,95.098,90.851,54.25,63.043,40.0,52.917,100.0,63.35,65.95,45.558,98.039,67.308,40.0,72.0,96.75 claude-opus-4-8-medium-effort,98.0,76.087,81.69,95.833,87.689,45.0,65.0,76.0,93.137,90.378,54.5,61.345,65.0,54.317,100.0,60.933,62.75,45.904,98.039,63.462,45.0,84.0,98.5 claude-opus-4-8-xhigh-effort,98.0,78.261,80.282,97.833,89.205,51.0,55.0,82.0,97.059,91.207,64.633,70.419,60.0,62.85,98.0,68.317,73.983,47.788,98.039,78.846,65.0,76.0,100.0 deepseek-v3.2,95.0,73.913,77.465,81.833,1.678,10.0,40.0,42.0,76.471,74.346,20.6,44.885,70.0,23.967,76.0,20.683,27.0,35.385,98.039,50.0,30.0,66.0,9.0 deepseek-v3.2-exp,92.0,71.739,74.648,79.333,2.602,13.0,30.0,40.0,75.49,77.04,16.083,45.455,55.0,20.85,80.0,16.717,23.667,32.135,98.039,50.0,25.0,72.0,12.0 deepseek-v3.2-exp-thinking,96.0,73.913,66.197,87.667,8.947,59.0,25.0,70.0,92.157,82.424,34.333,47.526,50.0,41.0,88.0,42.517,47.233,45.558,100.0,69.231,20.0,78.0,30.25 deepseek-v3.2-thinking,97.0,63.043,66.197,93.5,8.402,63.0,30.0,64.0,95.098,85.037,41.817,45.726,60.0,48.217,98.0,55.117,47.6,41.596,100.0,76.923,30.0,72.0,69.75 deepseek-v4-pro,98.0,69.565,70.423,98.0,75.421,78.0,40.0,64.0,96.078,90.624,56.217,56.391,80.0,55.933,94.0,67.75,69.5,48.192,100.0,80.769,50.0,80.0,92.0 deepseek-v4-flash,98.0,65.217,73.239,89.5,59.915,37.0,40.0,64.0,97.059,86.531,62.383,46.872,70.0,54.05,96.0,67.267,68.85,46.115,98.039,73.077,40.0,74.0,49.25 devstral-2512,80.0,67.391,66.197,34.333,3.517,7.0,40.0,16.0,60.784,62.279,12.35,38.674,70.0,13.217,56.0,11.533,16.9,21.75,92.157,38.462,20.0,64.0,0.5 elephant-alpha,94.0,50.0,63.38,33.5,4.42,8.0,0.0,20.0,75.49,52.493,24.883,13.736,5.0,27.083,80.0,35.4,31.233,22.942,88.235,46.154,0.0,36.0,13.75 gemini-2.5-flash-06-05-highthinking,85.0,63.043,69.014,70.167,2.566,21.0,10.0,32.0,87.255,81.762,25.433,52.657,40.0,22.467,86.0,32.917,33.167,41.327,98.039,42.308,0.0,64.0,18.25 gemini-2.5-flash-lite-highthinking,93.0,65.217,67.606,61.833,6.622,11.0,5.0,42.0,73.529,66.633,16.6,32.108,10.0,23.35,84.0,27.75,24.6,36.462,98.039,34.615,0.0,62.0,12.75 gemini-2.5-flash-lite-preview-09-2025-highthinking,93.0,67.391,63.38,63.0,8.346,17.0,5.0,24.0,83.333,66.279,27.183,32.803,0.0,24.283,68.0,32.733,28.233,37.25,98.039,40.385,0.0,62.0,12.25 gemini-2.5-flash-preview-09-2025-highthinking,96.0,67.391,67.606,77.667,37.524,32.0,15.0,60.0,89.216,84.188,24.467,52.362,40.0,26.95,80.0,30.767,28.533,47.385,98.039,42.308,15.0,66.0,23.5 gemini-2.5-pro-06-05-highthinking,74.0,73.913,77.465,87.5,11.844,24.0,30.0,68.0,89.216,86.047,28.6,62.987,45.0,34.467,98.0,32.15,37.05,44.962,98.039,69.231,25.0,76.0,48.0 gemini-3-flash-preview-high,98.0,71.739,76.056,100.0,74.837,56.0,40.0,72.0,93.137,89.56,72.983,65.694,45.0,70.133,96.0,71.467,84.867,51.442,98.039,82.692,35.0,88.0,47.5 gemini-3-flash-preview-minimal,94.0,78.261,78.873,96.5,7.224,12.0,40.0,44.0,84.314,82.081,19.767,59.439,55.0,30.033,82.0,30.7,32.783,39.654,98.039,51.923,35.0,80.0,18.75 gemini-3-pro-preview-11-2025-high,97.0,71.739,77.465,100.0,73.342,45.0,60.0,70.0,95.098,90.25,64.317,69.863,70.0,58.167,98.0,66.8,74.1,51.788,98.039,76.923,35.0,84.0,64.75 gemini-3-pro-preview-11-2025-low,97.0,65.217,76.056,99.0,58.141,32.0,50.0,68.0,92.157,89.779,16.133,61.475,70.0,27.717,98.0,28.267,35.4,44.558,98.039,63.462,45.0,78.0,53.0 gemini-3.1-pro-preview-high,98.0,78.261,74.648,100.0,85.239,78.0,55.0,72.0,96.078,92.102,79.867,74.128,85.0,71.05,98.0,77.25,88.233,52.346,98.039,80.769,55.0,82.0,85.25 gemini-3.1-flash-lite-preview-high,97.0,65.217,71.831,91.0,23.767,27.0,35.0,60.0,89.216,81.021,61.95,46.548,40.0,65.867,88.0,67.3,79.35,42.885,98.039,65.385,25.0,82.0,25.25 gemini-3.5-flash-high,98.0,76.087,80.282,100.0,47.994,68.0,45.0,74.0,95.098,91.877,75.517,67.753,60.0,69.1,96.0,75.2,82.583,50.5,96.078,80.769,50.0,86.0,77.25 gemma-4-31b-it,100.0,58.696,61.972,92.5,29.896,26.316,20.0,64.0,87.255,82.175,69.467,47.513,65.0,61.3,92.0,71.683,67.867,46.385,100.0,51.923,35.0,74.0,29.75 glm-4.6,98.0,67.391,74.648,71.0,12.362,50.0,30.0,60.0,95.098,81.414,22.767,49.959,55.0,20.817,90.0,29.367,31.817,43.481,100.0,50.0,20.0,56.0,48.25 glm-4.6v,95.0,60.87,67.606,57.5,6.251,10.0,0.0,12.0,84.314,60.687,16.983,29.713,10.0,14.933,84.0,14.6,21.733,34.942,98.039,34.615,0.0,62.0,18.25 glm-4.7,97.0,67.391,78.873,70.5,18.149,35.0,25.0,62.0,92.157,79.918,31.183,53.179,65.0,36.817,82.0,37.85,36.783,47.365,100.0,76.923,35.0,72.0,18.0 glm-5,97.0,78.261,69.014,91.0,59.957,57.0,50.0,66.0,95.098,84.758,52.35,63.584,75.0,52.85,92.0,57.833,58.283,45.692,98.039,82.692,40.0,78.0,35.75 glm-5v-turbo,98.837,71.739,76.056,86.0,20.865,13.0,0.0,60.0,85.294,84.514,22.767,46.826,10.0,25.917,94.0,28.383,31.733,43.5,98.039,51.923,0.0,54.0,18.5 glm-5.1,97.0,76.087,74.648,79.0,53.866,60.0,55.0,68.0,94.118,88.459,65.883,60.337,65.0,68.567,94.0,69.65,69.717,49.538,86.275,78.846,45.0,76.0,49.25 gpt-5-mini,98.0,76.087,76.056,96.0,5.103,32.0,30.0,48.0,89.216,78.295,61.933,37.437,55.0,56.45,86.0,71.633,66.85,43.712,100.0,59.615,20.0,74.0,41.0 gpt-5-mini-high,98.0,67.391,69.014,98.667,21.681,57.0,50.0,62.0,91.176,82.626,61.917,41.895,70.0,60.367,88.0,73.5,65.3,43.904,100.0,61.538,20.0,86.0,61.75 gpt-5-mini-low,95.0,63.043,76.056,90.5,0.0,12.0,35.0,40.0,79.412,66.545,45.633,28.737,50.0,48.367,84.0,59.5,49.35,36.923,98.039,53.846,25.0,62.0,5.75 gpt-5-mini-minimal,76.0,69.565,71.831,48.5,0.478,6.0,25.0,26.0,56.863,51.259,15.533,25.866,30.0,22.867,62.0,19.583,25.267,29.923,94.118,32.692,10.0,60.0,8.5 gpt-5-nano,96.0,58.696,76.056,74.667,1.499,22.0,20.0,8.0,86.275,54.522,48.8,16.516,45.0,55.117,80.0,55.35,48.667,35.385,96.078,42.308,20.0,52.0,11.5 gpt-5-nano-high,96.0,54.348,70.423,81.167,5.809,31.0,25.0,14.0,90.196,56.429,50.183,17.358,35.0,58.75,94.0,55.233,58.633,34.212,90.196,46.154,10.0,42.0,7.0 gpt-5-nano-low,79.0,43.478,61.972,64.333,0.0,7.0,10.0,0.0,73.529,36.099,25.633,13.871,20.0,25.95,70.0,36.783,28.017,27.346,82.353,38.462,0.0,28.0,2.25 gpt-5-pro-2025-10-06,98.0,69.565,74.648,98.0,26.276,62.0,45.0,64.0,97.059,87.603,61.4,64.085,65.0,62.217,98.0,67.267,64.95,44.846,100.0,80.769,45.0,80.0,84.0 gpt-5.1-2025-11-13-high,97.0,71.739,73.239,97.333,64.684,67.0,50.0,66.0,95.098,88.5,58.333,60.448,65.0,60.133,92.0,76.117,61.033,44.135,100.0,76.923,45.0,80.0,80.25 gpt-5.1-2025-11-13-low,94.0,76.087,84.507,98.0,0.237,24.0,25.0,58.0,81.373,74.14,43.233,53.555,60.0,46.55,88.0,55.183,45.533,37.365,100.0,73.077,35.0,82.0,19.5 gpt-5.1-2025-11-13-medium,97.0,73.913,77.465,100.0,48.459,42.0,45.0,62.0,90.196,86.846,53.483,55.971,65.0,63.85,94.0,66.817,57.067,41.212,100.0,76.923,50.0,80.0,63.0 gpt-5.1-2025-11-13-nothinking,78.0,76.087,78.873,44.667,3.927,3.0,35.0,26.0,28.431,68.614,16.883,42.867,30.0,27.117,48.0,21.933,28.05,32.192,96.078,25.0,20.0,74.0,8.25 gpt-5.1-codex,95.0,71.739,71.831,96.5,44.644,45.0,45.0,64.0,96.078,82.244,61.0,41.937,65.0,60.317,98.0,67.55,64.683,41.519,96.078,76.923,50.0,70.0,89.0 gpt-5.1-codex-max,96.0,78.261,84.507,86.0,21.84,56.0,40.0,64.0,93.137,89.483,57.383,58.155,75.0,66.5,100.0,71.25,73.367,42.827,100.0,86.538,55.0,82.0,87.75 gpt-5.1-codex-max-high,90.0,78.261,83.099,90.0,65.12,61.0,45.0,64.0,96.078,85.787,77.95,57.441,65.0,65.45,98.0,65.967,72.167,45.25,100.0,84.615,50.0,82.0,88.0 gpt-5.1-codex-mini,98.0,65.217,74.648,97.0,8.528,32.0,25.0,58.0,94.118,80.904,45.467,30.039,65.0,59.783,94.0,68.383,62.45,40.577,100.0,53.846,30.0,62.0,53.0 gpt-5.2-2025-12-11-high,97.0,76.087,76.056,99.0,88.913,92.0,50.0,74.0,95.098,88.566,64.1,58.427,70.0,54.083,96.0,65.7,63.2,45.577,100.0,78.846,35.0,82.0,84.0 gpt-5.2-2025-12-11-low,97.0,71.739,76.056,97.5,16.119,61.0,35.0,66.0,94.118,86.739,46.117,47.042,65.0,59.9,100.0,55.833,56.333,42.096,100.0,76.923,50.0,66.0,42.0 gpt-5.2-2025-12-11-medium,97.0,69.565,74.648,97.5,66.166,88.0,50.0,72.0,94.118,89.158,58.817,51.33,65.0,53.367,96.0,62.917,55.0,44.942,100.0,82.692,40.0,76.0,86.0 gpt-5.2-2025-12-11-nothinking,77.0,78.261,74.648,48.833,5.485,7.0,35.0,54.0,67.647,81.356,21.567,45.067,50.0,25.183,74.0,30.25,31.8,37.558,100.0,32.692,35.0,56.0,10.5 gpt-5.2-codex,98.0,86.957,80.282,95.0,89.016,75.0,30.0,68.0,95.098,86.997,67.05,56.035,75.0,60.567,94.0,74.783,63.383,45.596,100.0,78.846,50.0,70.0,70.0 gpt-5.3-codex-high,98.0,76.087,80.282,100.0,39.699,73.0,50.0,70.0,93.137,87.206,64.85,64.26,65.0,61.083,98.0,74.75,60.817,48.365,100.0,84.615,50.0,76.0,68.0 gpt-5.3-codex-xhigh,97.0,76.087,78.873,98.0,6.383,66.0,60.0,68.0,93.137,86.661,70.9,57.54,75.0,65.8,100.0,73.733,74.933,42.654,100.0,57.692,65.0,82.0,60.0 gpt-5.3-instant,97.0,82.609,74.648,93.0,1.99,23.0,20.0,66.0,86.275,83.347,56.283,52.998,45.0,54.617,94.0,70.7,56.017,42.077,100.0,63.462,20.0,64.0,29.0 gpt-5.4-high,98.0,76.087,80.282,99.0,81.357,77.0,45.0,70.0,96.078,88.847,62.417,66.027,50.0,59.2,98.0,72.983,65.217,49.788,100.0,84.615,45.0,84.0,90.0 gpt-5.4-xhigh,98.0,80.435,74.648,100.0,86.19,93.0,65.0,68.0,94.118,91.474,63.433,65.901,85.0,70.0,98.0,79.017,68.417,51.75,100.0,88.462,60.0,82.0,98.0 gpt-5.4-mini,68.0,75.556,73.846,48.785,8.333,7.071,21.053,6.0,18.627,54.481,14.583,30.552,15.0,23.95,46.0,16.769,20.2,33.784,100.0,28.846,15.0,46.0,6.731 gpt-5.4-mini-high,97.0,71.111,70.769,94.792,59.291,22.222,36.842,68.0,91.176,85.996,54.41,40.484,60.0,61.117,86.0,49.116,60.5,48.51,100.0,59.615,20.0,62.0,65.064 gpt-5.4-mini-low,93.0,68.889,76.923,87.5,16.111,9.091,15.789,28.0,86.275,67.992,26.215,31.281,25.0,34.95,76.0,44.66,39.65,39.529,100.0,46.154,30.0,54.0,11.218 gpt-5.4-mini-medium,96.0,64.444,78.462,92.188,48.369,16.162,21.053,52.0,89.216,80.232,46.25,36.923,30.0,44.75,84.0,56.02,56.0,44.51,100.0,65.385,30.0,58.0,46.795 gpt-5.4-mini-xhigh,97.0,75.556,67.692,98.958,62.972,37.374,47.368,60.0,94.118,85.73,59.878,45.428,55.0,54.067,88.0,67.483,59.633,49.863,100.0,76.923,40.0,70.0,65.064 gpt-5.4-nano,62.0,62.222,61.538,25.694,1.549,9.091,26.316,0.0,32.353,40.589,20.799,14.352,35.0,16.4,44.0,17.568,11.183,17.863,98.039,25.0,20.0,46.0,0.641 gpt-5.4-nano-high,95.0,64.444,72.308,85.417,19.897,81.818,47.368,42.0,93.137,84.419,46.597,34.835,65.0,59.417,88.0,57.432,51.533,42.0,96.078,69.231,35.0,44.0,88.782 gpt-5.4-nano-low,94.0,71.111,67.692,73.09,0.101,21.212,36.842,28.0,78.431,66.981,35.556,17.586,60.0,36.233,70.0,36.395,40.517,35.725,96.078,55.769,15.0,46.0,14.744 gpt-5.4-nano-medium,97.0,66.667,73.846,85.417,5.874,61.616,42.105,52.0,91.176,82.591,42.674,29.695,60.0,50.367,80.0,53.639,57.1,42.235,98.039,59.615,20.0,38.0,66.346 gpt-5.4-nano-xhigh,98.0,68.889,75.385,94.271,54.301,82.828,47.368,64.0,95.098,89.155,66.649,39.133,65.0,62.75,82.0,67.653,71.767,50.588,98.039,80.769,35.0,54.0,97.436 gpt-5.5-high,98.0,84.783,80.282,100.0,86.013,100.0,30.0,72.0,95.098,91.179,74.367,76.282,25.0,69.9,98.0,70.85,72.65,52.423,100.0,78.846,35.0,86.0,96.0 gpt-5.5-xhigh,98.0,80.435,84.507,100.0,88.741,100.0,45.0,74.0,96.078,91.203,76.767,74.977,75.0,69.267,98.0,74.233,71.9,54.5,100.0,78.846,50.0,88.0,100.0 gpt-5.5-medium,98.0,82.609,74.648,100.0,78.23,85.0,5.0,76.0,96.078,0.0,63.6,70.711,35.0,58.733,98.0,67.2,73.417,52.731,100.0,78.846,10.0,86.0,96.25 gpt-oss-120b,92.0,50.0,70.423,79.167,2.87,32.0,20.0,8.0,92.157,59.338,44.833,18.605,20.0,55.5,80.0,56.467,44.367,29.231,84.314,48.077,10.0,48.0,20.75 grok-4-0709,86.0,67.391,78.873,98.667,51.072,65.0,20.0,74.0,94.118,86.975,19.817,60.497,50.0,25.867,96.0,38.783,31.833,41.019,98.039,61.538,20.0,70.0,85.0 grok-4-1-fast-non-reasoning,47.0,52.174,56.338,44.167,5.308,5.0,10.0,8.0,45.098,58.582,12.617,41.855,15.0,17.117,36.0,18.717,19.45,26.327,90.196,40.385,5.0,64.0,9.0 grok-4-1-fast-reasoning,95.0,67.391,71.831,99.333,29.941,65.0,25.0,70.0,96.078,78.802,23.733,47.648,50.0,25.8,94.0,32.283,31.0,38.538,88.235,61.538,20.0,76.0,95.25 grok-code-fast-1-0825,87.0,50.0,78.873,55.667,8.931,8.0,30.0,42.0,61.765,67.286,18.55,38.014,55.0,20.133,74.0,24.833,25.567,38.038,100.0,38.462,15.0,52.0,14.75 grok-4.20-beta-0309-non-reasoning,49.0,56.522,60.563,14.833,3.506,27.0,25.0,16.0,80.392,25.695,21.067,37.291,65.0,26.483,48.0,27.967,21.883,26.942,100.0,30.769,25.0,74.0,7.75 grok-4.20-beta-0309-reasoning,98.0,67.391,64.789,100.0,50.802,81.0,25.0,70.0,94.118,75.104,58.917,51.152,70.0,60.65,98.0,67.917,66.067,37.769,100.0,65.385,35.0,82.0,67.75 grok-4.3,97.0,65.217,74.648,94.0,28.029,63.0,45.0,66.0,95.098,82.258,64.383,48.74,70.0,57.6,98.0,70.6,58.417,41.25,98.039,61.538,35.0,78.0,57.75 kimi-k2-instruct,82.0,73.913,74.648,78.167,3.216,7.0,35.0,34.0,72.549,71.058,13.033,45.892,45.0,20.267,62.0,24.283,23.867,32.692,94.118,51.923,15.0,76.0,21.0 kimi-k2-thinking,96.0,63.043,71.831,88.0,15.719,52.0,40.0,58.0,96.078,80.305,63.383,43.36,55.0,50.867,92.0,67.417,66.467,45.058,96.078,57.692,20.0,68.0,46.25 kimi-k2.5-thinking,96.0,78.261,77.465,98.0,42.186,61.0,35.0,66.0,94.118,88.354,54.6,54.998,75.0,59.8,94.0,57.9,57.35,47.769,94.118,73.077,35.0,80.0,70.75 kimi-k2.6-thinking,97.0,78.261,78.873,89.333,51.268,54.0,50.0,70.0,96.078,90.027,61.633,58.091,80.0,61.417,94.0,66.883,67.5,46.096,98.039,75.0,45.0,78.0,78.5 minimax-m2.5,98.0,69.565,71.831,66.0,12.218,39.0,40.0,62.0,94.118,78.509,55.983,31.303,75.0,60.6,90.0,62.25,50.1,40.519,96.078,63.462,40.0,68.0,21.75 mimo-v2-pro,91.0,63.043,74.648,81.667,13.819,41.0,15.0,64.0,93.137,82.704,41.767,43.555,50.0,41.75,84.0,43.1,46.25,37.731,96.078,67.308,25.0,82.0,63.5 minimax-m2.7,97.0,47.826,61.972,88.333,28.563,49.0,45.0,60.0,92.157,84.012,58.05,33.996,75.0,58.5,96.0,68.867,59.05,42.404,98.039,76.923,30.0,78.0,66.25 nemotron-3-super-120b-a12b,87.0,39.13,69.014,24.0,0.0,6.0,0.0,60.784,23.582,29.133,10.23,36.083,46.0,36.75,42.133,7.827,58.824,7.692,56.0,0.25 qwen3-235b-a22b-instruct-2507,84.0,67.391,71.831,91.0,3.665,21.0,10.0,62.0,90.196,76.931,17.967,39.209,20.0,21.883,84.0,19.383,27.65,34.404,96.078,44.231,10.0,68.0,43.5 qwen3-235b-a22b-thinking-2507,86.0,71.739,66.197,83.0,7.24,40.0,5.0,60.0,86.275,81.304,37.617,47.571,10.0,38.117,86.0,39.217,47.617,49.308,100.0,53.846,5.0,78.0,37.75 qwen3-30b-a3b-thinking,91.0,45.652,52.113,68.167,1.575,22.0,0.0,10.0,84.314,64.075,17.933,31.228,5.0,20.383,82.0,25.05,21.067,37.115,96.078,38.462,0.0,64.0,16.25 qwen3-32b-thinking,87.0,63.043,69.014,61.0,3.043,25.0,0.0,30.0,84.314,73.446,15.75,37.627,10.0,20.133,86.0,14.517,20.683,38.538,98.039,55.769,0.0,68.0,21.25 qwen3-next-80b-a3b-instruct,97.0,67.391,69.014,89.0,12.099,26.0,5.0,52.0,86.275,71.437,16.4,40.013,20.0,18.4,90.0,17.55,24.4,45.096,92.157,44.231,5.0,70.0,32.75 qwen3-next-80b-a3b-thinking,98.0,56.522,64.789,70.167,14.423,38.0,15.0,58.0,88.235,72.805,33.167,40.768,10.0,39.067,96.0,48.317,45.617,46.327,100.0,46.154,0.0,58.0,32.5 qwen3.6-plus,97.0,76.087,80.282,90.5,67.042,62.0,45.0,70.0,93.137,82.762,60.617,52.468,75.0,51.567,98.0,56.867,64.317,44.654,98.039,67.308,45.0,82.0,68.0 qwen3.6-27b,93.0,71.739,71.831,77.167,70.395,52.0,35.0,62.0,92.157,82.317,49.433,42.746,70.0,50.733,100.0,60.133,52.617,42.846,98.039,65.385,45.0,70.0,53.75 qwen3.6-flash,98.0,60.87,69.014,76.333,39.277,44.0,45.0,60.0,93.137,80.372,49.55,43.115,65.0,41.35,90.0,46.933,50.967,43.115,94.118,50.0,30.0,70.0,51.75 qwen3.7-max,98.0,69.565,78.873,96.5,71.8,59.0,45.0,84.0,97.059,86.934,72.0,58.717,70.0,64.083,96.0,76.283,83.8,45.519,98.039,78.846,40.0,84.0,74.5