[ { "model": "xiaomi/mimo-v2-pro", "provider": "nous", "quantization": null, "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9545, "altered_accuracy": 0.4156666666666667, "pattern_override_rate": 0.326, "conditioned_override_rate": 0.2873015873015873, "conditioned_override_total": 181, "total_input_tokens": 1036449, "total_output_tokens": 5159785, "avg_input_tokens_per_riddle": 345.483, "avg_output_tokens_per_riddle": 1719.9283333333333, "avg_samples_per_riddle": 3, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 210, "conditioned_override_ci95": 0.054, "altered_accuracy_ci95": 0.035, "rank": 1, "rank_best": 1, "rank_worst": 7 }, { "model": "openai/gpt-oss-20b", "provider": "local", "quantization": "MXFP4", "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.8318, "altered_accuracy": 0.355, "pattern_override_rate": 0.301, "conditioned_override_rate": 0.2896174863387978, "conditioned_override_total": 53, "total_input_tokens": 187076, "total_output_tokens": 3002659, "avg_input_tokens_per_riddle": 187.076, "avg_output_tokens_per_riddle": 3002.659, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 183, "conditioned_override_ci95": 0.0656, "altered_accuracy_ci95": 0.0357, "rank": 2, "rank_best": 1, "rank_worst": 8 }, { "model": "openai/gpt-5.4-mini", "provider": "nous", "quantization": null, "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9364, "altered_accuracy": 0.428, "pattern_override_rate": 0.323, "conditioned_override_rate": 0.3058252427184466, "conditioned_override_total": 63, "total_input_tokens": 95355, "total_output_tokens": 1858202, "avg_input_tokens_per_riddle": 95.355, "avg_output_tokens_per_riddle": 1858.202, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 206, "conditioned_override_ci95": 0.0631, "altered_accuracy_ci95": 0.0377, "rank": 3, "rank_best": 1, "rank_worst": 11 }, { "model": "minimaxai/minimax-m2.7", "provider": "together", "quantization": "FP4", "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.8864, "altered_accuracy": 0.363, "pattern_override_rate": 0.321, "conditioned_override_rate": 0.3230769230769231, "conditioned_override_total": 63, "total_input_tokens": 131954, "total_output_tokens": 4991738, "avg_input_tokens_per_riddle": 131.954, "avg_output_tokens_per_riddle": 4991.738, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 195, "conditioned_override_ci95": 0.0667, "altered_accuracy_ci95": 0.0363, "rank": 4, "rank_best": 1, "rank_worst": 12 }, { "model": "zai-org/glm-5.1", "provider": "together", "quantization": "FP4", "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9682, "altered_accuracy": 0.372, "pattern_override_rate": 0.343, "conditioned_override_rate": 0.323943661971831, "conditioned_override_total": 69, "total_input_tokens": 95936, "total_output_tokens": 4030631, "avg_input_tokens_per_riddle": 95.936, "avg_output_tokens_per_riddle": 4030.631, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 213, "conditioned_override_ci95": 0.0634, "altered_accuracy_ci95": 0.0373, "rank": 5, "rank_best": 1, "rank_worst": 11 }, { "model": "openai/gpt-oss-120b", "provider": "together", "quantization": "MXFP4", "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.8909, "altered_accuracy": 0.368, "pattern_override_rate": 0.342, "conditioned_override_rate": 0.33163265306122447, "conditioned_override_total": 65, "total_input_tokens": 169996, "total_output_tokens": 750930, "avg_input_tokens_per_riddle": 169.996, "avg_output_tokens_per_riddle": 750.93, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 196, "conditioned_override_ci95": 0.0663, "altered_accuracy_ci95": 0.035, "rank": 6, "rank_best": 1, "rank_worst": 10 }, { "model": "xiaomi/mimo-v2-omni", "provider": "nous", "quantization": null, "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9364, "altered_accuracy": 0.38666666666666666, "pattern_override_rate": 0.327, "conditioned_override_rate": 0.3365695792880259, "conditioned_override_total": 208, "total_input_tokens": 1040060, "total_output_tokens": 4893327, "avg_input_tokens_per_riddle": 346.68666666666667, "avg_output_tokens_per_riddle": 1631.109, "avg_samples_per_riddle": 3, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 206, "conditioned_override_ci95": 0.0566, "altered_accuracy_ci95": 0.0342, "rank": 7, "rank_best": 2, "rank_worst": 10 }, { "model": "zai-org/glm-5", "provider": "together", "quantization": "FP4", "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9591, "altered_accuracy": 0.364, "pattern_override_rate": 0.38, "conditioned_override_rate": 0.3459715639810427, "conditioned_override_total": 73, "total_input_tokens": 95936, "total_output_tokens": 3539312, "avg_input_tokens_per_riddle": 95.936, "avg_output_tokens_per_riddle": 3539.312, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 211, "conditioned_override_ci95": 0.064, "altered_accuracy_ci95": 0.0367, "rank": 8, "rank_best": 3, "rank_worst": 12 }, { "model": "mistralai/mistral-small-2603", "provider": "mistral", "quantization": null, "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.8824, "altered_accuracy": 0.355, "pattern_override_rate": 0.323, "conditioned_override_rate": 0.3487179487179487, "conditioned_override_total": 68, "total_input_tokens": 109514, "total_output_tokens": 1924520, "avg_input_tokens_per_riddle": 109.514, "avg_output_tokens_per_riddle": 1924.52, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 195, "conditioned_override_ci95": 0.0664, "altered_accuracy_ci95": 0.0351, "rank": 9, "rank_best": 1, "rank_worst": 12 }, { "model": "google/gemma-4-31b-it", "provider": "together", "quantization": "FP8", "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9545, "altered_accuracy": 0.402, "pattern_override_rate": 0.342, "conditioned_override_rate": 0.3619047619047619, "conditioned_override_total": 76, "total_input_tokens": 108264, "total_output_tokens": 1597477, "avg_input_tokens_per_riddle": 108.264, "avg_output_tokens_per_riddle": 1597.477, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 210, "conditioned_override_ci95": 0.0643, "altered_accuracy_ci95": 0.0397, "rank": 10, "rank_best": 3, "rank_worst": 13 }, { "model": "openai/gpt-5.4-mini", "provider": "nous", "quantization": null, "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.8727, "altered_accuracy": 0.271, "pattern_override_rate": 0.401, "conditioned_override_rate": 0.4010416666666667, "conditioned_override_total": 77, "total_input_tokens": 95576, "total_output_tokens": 13929, "avg_input_tokens_per_riddle": 95.576, "avg_output_tokens_per_riddle": 13.929, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 192, "conditioned_override_ci95": 0.0703, "altered_accuracy_ci95": 0.0334, "rank": 11, "rank_best": 7, "rank_worst": 17 }, { "model": "qwen/qwen3.5-27b", "provider": "local", "quantization": "UD-Q4_K_XL", "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.8409, "altered_accuracy": 0.27599999999999997, "pattern_override_rate": 0.379, "conditioned_override_rate": 0.42342342342342343, "conditioned_override_total": 235, "total_input_tokens": 283593, "total_output_tokens": 26555, "avg_input_tokens_per_riddle": 94.531, "avg_output_tokens_per_riddle": 8.851666666666667, "avg_samples_per_riddle": 3, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 185, "conditioned_override_ci95": 0.0649, "altered_accuracy_ci95": 0.0296, "rank": 12, "rank_best": 10, "rank_worst": 16 }, { "model": "moonshotai/kimi-k2.5", "provider": "together", "quantization": null, "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9682, "altered_accuracy": 0.35, "pattern_override_rate": 0.428, "conditioned_override_rate": 0.431924882629108, "conditioned_override_total": 92, "total_input_tokens": 120059, "total_output_tokens": 2697962, "avg_input_tokens_per_riddle": 120.059, "avg_output_tokens_per_riddle": 2697.962, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 213, "conditioned_override_ci95": 0.0657, "altered_accuracy_ci95": 0.0342, "rank": 13, "rank_best": 11, "rank_worst": 21 }, { "model": "moonshotai/kimi-k2.6", "provider": "nous", "quantization": null, "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9682, "altered_accuracy": 0.352, "pattern_override_rate": 0.453, "conditioned_override_rate": 0.4413145539906103, "conditioned_override_total": 94, "total_input_tokens": 95997, "total_output_tokens": 2861383, "avg_input_tokens_per_riddle": 95.80538922155688, "avg_output_tokens_per_riddle": 2855.6716566866266, "avg_samples_per_riddle": 1.002, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 213, "conditioned_override_ci95": 0.0657, "altered_accuracy_ci95": 0.0362, "rank": 14, "rank_best": 11, "rank_worst": 21 }, { "model": "google/gemma-4-31b-it", "provider": "local", "quantization": "UD-Q6_K_XL", "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.9455, "altered_accuracy": 0.321, "pattern_override_rate": 0.431, "conditioned_override_rate": 0.46153846153846156, "conditioned_override_total": 288, "total_input_tokens": 280760, "total_output_tokens": 52127, "avg_input_tokens_per_riddle": 93.58666666666667, "avg_output_tokens_per_riddle": 17.375666666666667, "avg_samples_per_riddle": 3, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 208, "conditioned_override_ci95": 0.0657, "altered_accuracy_ci95": 0.0337, "rank": 15, "rank_best": 12, "rank_worst": 21 }, { "model": "qwen/qwen3.6-27b", "provider": "local", "quantization": "UD-Q6_K_XL", "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.8227, "altered_accuracy": 0.2593333333333333, "pattern_override_rate": 0.421, "conditioned_override_rate": 0.47513812154696133, "conditioned_override_total": 258, "total_input_tokens": 283593, "total_output_tokens": 25550, "avg_input_tokens_per_riddle": 94.531, "avg_output_tokens_per_riddle": 8.516666666666667, "avg_samples_per_riddle": 3, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 181, "conditioned_override_ci95": 0.0663, "altered_accuracy_ci95": 0.0306, "rank": 16, "rank_best": 12, "rank_worst": 22 }, { "model": "anthropic/claude-sonnet-4.6", "provider": "nous", "quantization": null, "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9773, "altered_accuracy": 0.331, "pattern_override_rate": 0.457, "conditioned_override_rate": 0.48372093023255813, "conditioned_override_total": 104, "total_input_tokens": 146204, "total_output_tokens": 164677, "avg_input_tokens_per_riddle": 146.204, "avg_output_tokens_per_riddle": 164.677, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 215, "conditioned_override_ci95": 0.0674, "altered_accuracy_ci95": 0.0367, "rank": 17, "rank_best": 12, "rank_worst": 23 }, { "model": "google/gemma-4-26b-a4b-it", "provider": "local", "quantization": "UD-Q6_K_XL", "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.8591, "altered_accuracy": 0.30633333333333335, "pattern_override_rate": 0.43, "conditioned_override_rate": 0.48853615520282184, "conditioned_override_total": 277, "total_input_tokens": 280760, "total_output_tokens": 25829, "avg_input_tokens_per_riddle": 93.58666666666667, "avg_output_tokens_per_riddle": 8.609666666666667, "avg_samples_per_riddle": 3, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 189, "conditioned_override_ci95": 0.067, "altered_accuracy_ci95": 0.0328, "rank": 18, "rank_best": 13, "rank_worst": 22 }, { "model": "qwen/qwen3.6-35b-a3b", "provider": "local", "quantization": "UD-Q6_K_XL", "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.7636, "altered_accuracy": 0.22266666666666665, "pattern_override_rate": 0.422, "conditioned_override_rate": 0.5238095238095238, "conditioned_override_total": 264, "total_input_tokens": 283593, "total_output_tokens": 26382, "avg_input_tokens_per_riddle": 94.531, "avg_output_tokens_per_riddle": 8.794, "avg_samples_per_riddle": 3, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 168, "conditioned_override_ci95": 0.0675, "altered_accuracy_ci95": 0.0284, "rank": 19, "rank_best": 14, "rank_worst": 23 }, { "model": "mistralai/mistral-small-2603", "provider": "mistral", "quantization": null, "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.8778, "altered_accuracy": 0.203, "pattern_override_rate": 0.461, "conditioned_override_rate": 0.5257731958762887, "conditioned_override_total": 102, "total_input_tokens": 111086, "total_output_tokens": 9763, "avg_input_tokens_per_riddle": 111.086, "avg_output_tokens_per_riddle": 9.763, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 194, "conditioned_override_ci95": 0.0696, "altered_accuracy_ci95": 0.0276, "rank": 20, "rank_best": 14, "rank_worst": 23 }, { "model": "anthropic/claude-opus-4.7", "provider": "nous", "quantization": null, "reasoning_enabled": true, "reasoning_effort": "high", "original_accuracy": 0.9727, "altered_accuracy": 0.279, "pattern_override_rate": 0.497, "conditioned_override_rate": 0.5280373831775701, "conditioned_override_total": 113, "total_input_tokens": 139993, "total_output_tokens": 39862, "avg_input_tokens_per_riddle": 139.993, "avg_output_tokens_per_riddle": 39.862, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 214, "conditioned_override_ci95": 0.0678, "altered_accuracy_ci95": 0.0332, "rank": 21, "rank_best": 16, "rank_worst": 23 }, { "model": "liquidai/lfm2-24b-a2b", "provider": "local", "quantization": "Q8_0", "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.4909, "altered_accuracy": 0.209, "pattern_override_rate": 0.254, "conditioned_override_rate": 0.5308641975308642, "conditioned_override_total": 172, "total_input_tokens": 282047, "total_output_tokens": 29243, "avg_input_tokens_per_riddle": 94.01566666666666, "avg_output_tokens_per_riddle": 9.747666666666667, "avg_samples_per_riddle": 3, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 108, "conditioned_override_ci95": 0.0926, "altered_accuracy_ci95": 0.0273, "rank": 22, "rank_best": 5, "rank_worst": 20 }, { "model": "moonshotai/kimi-k2.5", "provider": "together", "quantization": null, "reasoning_enabled": false, "reasoning_effort": null, "original_accuracy": 0.9727, "altered_accuracy": 0.23, "pattern_override_rate": 0.548, "conditioned_override_rate": 0.5373831775700935, "conditioned_override_total": 115, "total_input_tokens": 121279, "total_output_tokens": 19462, "avg_input_tokens_per_riddle": 121.279, "avg_output_tokens_per_riddle": 19.462, "avg_samples_per_riddle": 1, "unique_altered_riddles": 1000, "conditioned_unique_altered_riddles": 214, "conditioned_override_ci95": 0.0654, "altered_accuracy_ci95": 0.0305, "rank": 23, "rank_best": 19, "rank_worst": 23 } ]