{ "Athene-70B": { "model": "Athene-70B", "score": 7.970645792563601, "adjusted_score": 5.9412915851272015, "task_macro_score": 5.953736733195851, "adjusted_task_macro_score": 5.953736733195851, "task_categorized_scores": { "Creative Tasks": 6.036175710594314, "Coding & Debugging": 5.895734597156398, "Planning & Reasoning": 6.095952023988005, "Information/Advice seeking": 6.079207920792079, "Math & Data Analysis": 5.713147410358566 }, "total": 1022, "avg_len": 3175.1438356164385 }, "gpt-4o-2024-05-13": { "model": "gpt-4o-2024-05-13", "score": 7.940371456500489, "adjusted_score": 5.880742913000978, "task_macro_score": 5.929817880351956, "adjusted_task_macro_score": 5.929817880351956, "task_categorized_scores": { "Creative Tasks": 5.912144702842378, "Coding & Debugging": 6.0473933649289116, "Planning & Reasoning": 6.020958083832337, "Information/Advice seeking": 5.861386138613861, "Math & Data Analysis": 5.729083665338646 }, "total": 1023, "avg_len": 3723.516129032258 }, "gpt-4o-mini-2024-07-18": { "model": "gpt-4o-mini-2024-07-18", "score": 7.86328125, "adjusted_score": 5.7265625, "task_macro_score": 5.713689403451416, "adjusted_task_macro_score": 5.713689403451416, "task_categorized_scores": { "Creative Tasks": 6.00516795865633, "Coding & Debugging": 5.716981132075471, "Planning & Reasoning": 5.823617339312406, "Information/Advice seeking": 5.742574257425742, "Math & Data Analysis": 5.404761904761905 }, "total": 1024, "avg_len": 3648.126953125 }, "gpt-4-turbo-2024-04-09": { "model": "gpt-4-turbo-2024-04-09", "score": 7.804496578690127, "adjusted_score": 5.6089931573802545, "task_macro_score": 5.522122481039269, "adjusted_task_macro_score": 5.522122481039269, "task_categorized_scores": { "Creative Tasks": 5.865633074935401, "Coding & Debugging": 5.507109004739336, "Planning & Reasoning": 5.6203288490284, "Information/Advice seeking": 5.717821782178218, "Math & Data Analysis": 5.099601593625499 }, "total": 1023, "avg_len": 3093.1700879765394 }, "Mistral-Large-2": { "model": "Mistral-Large-2", "score": 7.7900390625, "adjusted_score": 5.580078125, "task_macro_score": 5.556833516154802, "adjusted_task_macro_score": 5.556833516154802, "task_categorized_scores": { "Planning & Reasoning": 5.721556886227544, "Information/Advice seeking": 5.737623762376238, "Coding & Debugging": 5.383886255924171, "Math & Data Analysis": 5.266932270916335, "Creative Tasks": 5.8860103626943 }, "total": 1024, "avg_len": 3503.6262230919765 }, "yi-large-preview": { "model": "yi-large-preview", "score": 7.741935483870968, "adjusted_score": 5.483870967741936, "task_macro_score": 5.529462523202478, "adjusted_task_macro_score": 5.529462523202478, "task_categorized_scores": { "Planning & Reasoning": 5.66066066066066, "Information/Advice seeking": 5.772277227722773, "Coding & Debugging": 5.428571428571429, "Math & Data Analysis": 5.192, "Creative Tasks": 5.7643979057591626 }, "total": 1023, "avg_len": 3512.678149606299 }, "claude-3-5-sonnet-20240620": { "model": "claude-3-5-sonnet-20240620", "score": 7.7265625, "adjusted_score": 5.453125, "task_macro_score": 5.469508456618439, "adjusted_task_macro_score": 5.469508456618439, "task_categorized_scores": { "Creative Tasks": 5.560723514211887, "Coding & Debugging": 5.650943396226415, "Planning & Reasoning": 5.563527653213752, "Information/Advice seeking": 5.554455445544555, "Math & Data Analysis": 5.015873015873016 }, "total": 1024, "avg_len": 2911.845703125 }, "gemma-2-9b-it-DPO": { "model": "gemma-2-9b-it-DPO", "score": 7.712890625, "adjusted_score": 5.42578125, "task_macro_score": 5.322295446230848, "adjusted_task_macro_score": 5.322295446230848, "task_categorized_scores": { "Planning & Reasoning": 5.547226386806596, "Information/Advice seeking": 5.821782178217822, "Coding & Debugging": 5.052132701421801, "Math & Data Analysis": 4.712, "Creative Tasks": 5.9067357512953365 }, "total": 1024, "avg_len": 3982.628795298727 }, "gemma-2-9b-it-SimPO": { "model": "gemma-2-9b-it-SimPO", "score": 7.703812316715543, "adjusted_score": 5.407624633431086, "task_macro_score": 5.327923406955029, "adjusted_task_macro_score": 5.327923406955029, "task_categorized_scores": { "Planning & Reasoning": 5.564564564564565, "Information/Advice seeking": 5.648514851485148, "Coding & Debugging": 5.085714285714285, "Math & Data Analysis": 4.859437751004016, "Creative Tasks": 5.797927461139896 }, "total": 1023, "avg_len": 4277.667647058824 }, "deepseek-v2-chat-0628": { "model": "deepseek-v2-chat-0628", "score": 7.6904296875, "adjusted_score": 5.380859375, "task_macro_score": 5.399428041165569, "adjusted_task_macro_score": 5.399428041165569, "task_categorized_scores": { "Creative Tasks": 5.643410852713178, "Coding & Debugging": 5.5, "Planning & Reasoning": 5.482810164424514, "Information/Advice seeking": 5.272277227722773, "Math & Data Analysis": 5.142857142857142 }, "total": 1024, "avg_len": 3252.376953125 }, "gpt-4-0125-preview": { "model": "gpt-4-0125-preview", "score": 7.6640625, "adjusted_score": 5.328125, "task_macro_score": 5.227753918256898, "adjusted_task_macro_score": 5.227753918256898, "task_categorized_scores": { "Creative Tasks": 5.757105943152455, "Coding & Debugging": 5.2924528301886795, "Planning & Reasoning": 5.345291479820627, "Information/Advice seeking": 5.435643564356436, "Math & Data Analysis": 4.579365079365079 }, "total": 1024, "avg_len": 3335.638671875 }, "claude-3-opus-20240229": { "model": "claude-3-opus-20240229", "score": 7.60546875, "adjusted_score": 5.2109375, "task_macro_score": 5.171404760028754, "adjusted_task_macro_score": 5.171404760028754, "task_categorized_scores": { "Creative Tasks": 5.302325581395349, "Coding & Debugging": 5.330188679245284, "Planning & Reasoning": 5.252615844544096, "Information/Advice seeking": 5.346534653465346, "Math & Data Analysis": 4.674603174603174 }, "total": 1024, "avg_len": 2685.9794921875 }, "deepseekv2-chat": { "model": "deepseekv2-chat", "score": 7.502443792766374, "adjusted_score": 5.004887585532748, "task_macro_score": 4.821191935259587, "adjusted_task_macro_score": 4.821191935259587, "task_categorized_scores": { "Creative Tasks": 5.359173126614987, "Coding & Debugging": 4.443396226415095, "Planning & Reasoning": 5.062874251497005, "Information/Advice seeking": 5.181141439205955, "Math & Data Analysis": 4.4523809523809526 }, "total": 1023, "avg_len": 2896.965786901271 }, "Meta-Llama-3-70B-Instruct": { "model": "Meta-Llama-3-70B-Instruct", "score": 7.478983382209188, "adjusted_score": 4.9579667644183765, "task_macro_score": 4.777080449630633, "adjusted_task_macro_score": 4.777080449630633, "task_categorized_scores": { "Creative Tasks": 5.430051813471502, "Coding & Debugging": 4.471698113207546, "Planning & Reasoning": 5.0074738415545585, "Information/Advice seeking": 5.227722772277227, "Math & Data Analysis": 4.206349206349206 }, "total": 1023, "avg_len": 3046.6383186705766 }, "gemma-2-27b-it@together": { "model": "gemma-2-27b-it@together", "score": 7.4697265625, "adjusted_score": 4.939453125, "task_macro_score": 4.854019672452688, "adjusted_task_macro_score": 4.854019672452688, "task_categorized_scores": { "Planning & Reasoning": 5.055472263868065, "Information/Advice seeking": 5.049504950495049, "Coding & Debugging": 4.701421800947868, "Math & Data Analysis": 4.3919999999999995, "Creative Tasks": 5.362694300518134 }, "total": 1024, "avg_len": 2924.5455435847207 }, "yi-large": { "model": "yi-large", "score": 7.446725317693059, "adjusted_score": 4.8934506353861185, "task_macro_score": 4.892726960200772, "adjusted_task_macro_score": 4.892726960200772, "task_categorized_scores": { "Planning & Reasoning": 5.133834586466165, "Information/Advice seeking": 5.096774193548388, "Coding & Debugging": 4.771428571428572, "Math & Data Analysis": 4.446215139442231, "Creative Tasks": 5.180156657963446 }, "total": 1023, "avg_len": 3095.335952848723 }, "deepseek-coder-v2": { "model": "deepseek-coder-v2", "score": 7.4447702834799605, "adjusted_score": 4.889540566959921, "task_macro_score": 4.739521235239142, "adjusted_task_macro_score": 4.739521235239142, "task_categorized_scores": { "Creative Tasks": 5.449350649350649, "Coding & Debugging": 4.485714285714286, "Planning & Reasoning": 4.924698795180722, "Information/Advice seeking": 5.154228855721392, "Math & Data Analysis": 4.159362549800797 }, "total": 1023, "avg_len": 2795.3091265947005 }, "nemotron-4-340b-instruct": { "model": "nemotron-4-340b-instruct", "score": 7.4423828125, "adjusted_score": 4.884765625, "task_macro_score": 4.767250981186394, "adjusted_task_macro_score": 4.767250981186394, "task_categorized_scores": { "Planning & Reasoning": 4.912912912912914, "Information/Advice seeking": 5.300248138957816, "Coding & Debugging": 4.625592417061611, "Math & Data Analysis": 4.0803212851405615, "Creative Tasks": 5.33160621761658 }, "total": 1024, "avg_len": 2754.0098039215686 }, "gemini-1.5-pro": { "model": "gemini-1.5-pro", "score": 7.369140625, "adjusted_score": 4.73828125, "task_macro_score": 5.295184246265066, "adjusted_task_macro_score": 5.295184246265066, "task_categorized_scores": { "Planning & Reasoning": 5.373271889400922, "Information/Advice seeking": 5.222506393861893, "Coding & Debugging": 5.522388059701493, "Math & Data Analysis": 4.859437751004016, "Creative Tasks": 5.512465373961218 }, "total": 1024, "avg_len": 3247.9673135852913 }, "Yi-1.5-34B-Chat": { "model": "Yi-1.5-34B-Chat", "score": 7.367546432062561, "adjusted_score": 4.7350928641251215, "task_macro_score": 4.561346347759096, "adjusted_task_macro_score": 4.561346347759096, "task_categorized_scores": { "Planning & Reasoning": 4.8108108108108105, "Information/Advice seeking": 5.029702970297029, "Coding & Debugging": 4.208530805687204, "Math & Data Analysis": 3.9437751004016057, "Creative Tasks": 5.352331606217616 }, "total": 1023, "avg_len": 3523.557843137255 }, "Mistral-Nemo-Instruct-2407": { "model": "Mistral-Nemo-Instruct-2407", "score": 7.343108504398827, "adjusted_score": 4.686217008797653, "task_macro_score": 4.437513167010813, "adjusted_task_macro_score": 4.437513167010813, "task_categorized_scores": { "Creative Tasks": 5.457364341085272, "Coding & Debugging": 3.971563981042655, "Planning & Reasoning": 4.741405082212257, "Information/Advice seeking": 5.193069306930694, "Math & Data Analysis": 3.5634920634920633 }, "total": 1023, "avg_len": 3318.2130987292276 }, "Qwen2-72B-Instruct": { "model": "Qwen2-72B-Instruct", "score": 7.3203125, "adjusted_score": 4.640625, "task_macro_score": 4.44976912962341, "adjusted_task_macro_score": 4.44976912962341, "task_categorized_scores": { "Creative Tasks": 4.992248062015504, "Coding & Debugging": 3.981132075471699, "Planning & Reasoning": 4.684603886397609, "Information/Advice seeking": 4.950495049504951, "Math & Data Analysis": 4.095238095238095 }, "total": 1024, "avg_len": 2856.4482421875 }, "gemma-2-9b-it": { "model": "gemma-2-9b-it", "score": 7.268101761252447, "adjusted_score": 4.536203522504893, "task_macro_score": 4.2696193124381026, "adjusted_task_macro_score": 4.2696193124381026, "task_categorized_scores": { "Creative Tasks": 5.10077519379845, "Coding & Debugging": 3.666666666666666, "Planning & Reasoning": 4.665667166416792, "Information/Advice seeking": 4.896039603960396, "Math & Data Analysis": 3.6428571428571423 }, "total": 1022, "avg_len": 2802.8923679060667 }, "claude-3-sonnet-20240229": { "model": "claude-3-sonnet-20240229", "score": 7.262230919765166, "adjusted_score": 4.524461839530332, "task_macro_score": 4.548145776375293, "adjusted_task_macro_score": 4.548145776375293, "task_categorized_scores": { "Creative Tasks": 4.630490956072352, "Coding & Debugging": 4.609523809523809, "Planning & Reasoning": 4.742514970059879, "Information/Advice seeking": 4.7128712871287135, "Math & Data Analysis": 4.063745019920319 }, "total": 1022, "avg_len": 2670.243639921722 }, "gemini-1.5-flash": { "model": "gemini-1.5-flash", "score": 7.2074363992172215, "adjusted_score": 4.414872798434443, "task_macro_score": 4.885062170599165, "adjusted_task_macro_score": 4.885062170599165, "task_categorized_scores": { "Planning & Reasoning": 5.078582434514638, "Information/Advice seeking": 4.866666666666667, "Coding & Debugging": 4.872549019607844, "Math & Data Analysis": 4.53225806451613, "Creative Tasks": 5.165745856353592 }, "total": 1022, "avg_len": 3654.3993871297243 }, "Qwen1.5-72B-Chat-greedy": { "model": "Qwen1.5-72B-Chat-greedy", "score": 7.173359451518119, "adjusted_score": 4.346718903036239, "task_macro_score": 3.992771366582465, "adjusted_task_macro_score": 3.992771366582465, "task_categorized_scores": { "Creative Tasks": 5.036269430051814, "Coding & Debugging": 3.5355450236966828, "Planning & Reasoning": 4.345345345345345, "Information/Advice seeking": 4.821782178217822, "Math & Data Analysis": 2.9800796812748995 }, "total": 1021, "avg_len": 2392.364348677767 }, "deepseek-v2-coder-0628": { "model": "deepseek-v2-coder-0628", "score": 7.171875, "adjusted_score": 4.34375, "task_macro_score": 4.566459211926647, "adjusted_task_macro_score": 4.566459211926647, "task_categorized_scores": { "Creative Tasks": 4.077519379844961, "Coding & Debugging": 4.886792452830189, "Planning & Reasoning": 4.7174887892376685, "Information/Advice seeking": 4.0049504950495045, "Math & Data Analysis": 4.642857142857142 }, "total": 1024, "avg_len": 2580.181640625 }, "Llama-3-8B-Magpie-Align-v0.1": { "model": "Llama-3-8B-Magpie-Align-v0.1", "score": 7.1223091976516635, "adjusted_score": 4.244618395303327, "task_macro_score": 3.9290196827463255, "adjusted_task_macro_score": 3.9290196827463255, "task_categorized_scores": { "Creative Tasks": 4.919896640826874, "Coding & Debugging": 3.374407582938389, "Planning & Reasoning": 4.27245508982036, "Information/Advice seeking": 4.891089108910892, "Math & Data Analysis": 2.976000000000001 }, "total": 1022, "avg_len": 3107.77397260274 }, "mistral-large-2402": { "model": "mistral-large-2402", "score": 7.114369501466276, "adjusted_score": 4.228739002932551, "task_macro_score": 3.889367833445423, "adjusted_task_macro_score": 3.889367833445423, "task_categorized_scores": { "Creative Tasks": 4.966408268733851, "Coding & Debugging": 3.374407582938389, "Planning & Reasoning": 4.179910044977511, "Information/Advice seeking": 4.613861386138614, "Math & Data Analysis": 3.087999999999999 }, "total": 1023, "avg_len": 2514.9814090019568 }, "command-r-plus": { "model": "command-r-plus", "score": 7.078277886497065, "adjusted_score": 4.15655577299413, "task_macro_score": 3.676236856767293, "adjusted_task_macro_score": 3.676236856767293, "task_categorized_scores": { "Creative Tasks": 5.2558139534883725, "Coding & Debugging": 2.843601895734597, "Planning & Reasoning": 4.194902548725636, "Information/Advice seeking": 4.915841584158416, "Math & Data Analysis": 2.3492063492063497 }, "total": 1022, "avg_len": 3293.812133072407 }, "Llama-3-Instruct-8B-SimPO-v0.2": { "model": "Llama-3-Instruct-8B-SimPO-v0.2", "score": 7.075268817204301, "adjusted_score": 4.150537634408602, "task_macro_score": 3.7155419825936797, "adjusted_task_macro_score": 3.7155419825936797, "task_categorized_scores": { "Creative Tasks": 5.183462532299741, "Coding & Debugging": 3.150943396226415, "Planning & Reasoning": 4.071856287425149, "Information/Advice seeking": 4.7871287128712865, "Math & Data Analysis": 2.438247011952191 }, "total": 1023, "avg_len": 2533.764418377322 }, "Llama-3-Instruct-8B-SimPO": { "model": "Llama-3-Instruct-8B-SimPO", "score": 7.058651026392962, "adjusted_score": 4.117302052785924, "task_macro_score": 3.7049721402304923, "adjusted_task_macro_score": 3.7049721402304923, "task_categorized_scores": { "Creative Tasks": 5.064599483204134, "Coding & Debugging": 3.1753554502369674, "Planning & Reasoning": 4.086696562032884, "Information/Advice seeking": 4.7871287128712865, "Math & Data Analysis": 2.3984063745019917 }, "total": 1023, "avg_len": 2541.9257086999023 }, "glm-4-9b-chat": { "model": "glm-4-9b-chat", "score": 7.058651026392962, "adjusted_score": 4.117302052785924, "task_macro_score": 3.909896797431742, "adjusted_task_macro_score": 3.909896797431742, "task_categorized_scores": { "Creative Tasks": 4.775193798449612, "Coding & Debugging": 3.537735849056604, "Planning & Reasoning": 4.248502994011975, "Information/Advice seeking": 4.628712871287128, "Math & Data Analysis": 2.9800796812748995 }, "total": 1023, "avg_len": 3692.043010752688 }, "reka-core-20240501": { "model": "reka-core-20240501", "score": 7.0517578125, "adjusted_score": 4.103515625, "task_macro_score": 4.590279465292558, "adjusted_task_macro_score": 4.590279465292558, "task_categorized_scores": { "Planning & Reasoning": 4.800632911392405, "Information/Advice seeking": 5.225464190981432, "Coding & Debugging": 4.060301507537689, "Math & Data Analysis": 4.034188034188034, "Creative Tasks": 5.548746518105849 }, "total": 1024, "avg_len": 2592.589397089397 }, "claude-3-haiku-20240307": { "model": "claude-3-haiku-20240307", "score": 7.0126953125, "adjusted_score": 4.025390625, "task_macro_score": 3.8893606666167266, "adjusted_task_macro_score": 3.8893606666167266, "task_categorized_scores": { "Creative Tasks": 4.294573643410853, "Coding & Debugging": 3.69811320754717, "Planning & Reasoning": 4.128550074738415, "Information/Advice seeking": 4.534653465346535, "Math & Data Analysis": 3.1428571428571423 }, "total": 1024, "avg_len": 2601.029296875 }, "SELM-Llama-3-8B-Instruct-iter-3": { "model": "SELM-Llama-3-8B-Instruct-iter-3", "score": 6.9980392156862745, "adjusted_score": 3.996078431372549, "task_macro_score": 3.525906077680738, "adjusted_task_macro_score": 3.525906077680738, "task_categorized_scores": { "Creative Tasks": 5.105943152454781, "Coding & Debugging": 2.7333333333333325, "Planning & Reasoning": 3.9789789789789793, "Information/Advice seeking": 4.605459057071961, "Math & Data Analysis": 2.3505976095617527 }, "total": 1020, "avg_len": 2913.1470588235293 }, "Yi-1.5-9B-Chat": { "model": "Yi-1.5-9B-Chat", "score": 6.992179863147605, "adjusted_score": 3.98435972629521, "task_macro_score": 3.8665353515172316, "adjusted_task_macro_score": 3.8665353515172316, "task_categorized_scores": { "Planning & Reasoning": 4.237237237237236, "Information/Advice seeking": 4.262376237623762, "Coding & Debugging": 3.4976303317535553, "Math & Data Analysis": 3.2208835341365454, "Creative Tasks": 4.5595854922279795 }, "total": 1023, "avg_len": 3468.23431372549 }, "Llama-3-Instruct-8B-SimPO-ExPO": { "model": "Llama-3-Instruct-8B-SimPO-ExPO", "score": 6.98435972629521, "adjusted_score": 3.9687194525904204, "task_macro_score": 3.501502977266739, "adjusted_task_macro_score": 3.501502977266739, "task_categorized_scores": { "Creative Tasks": 4.9147286821705425, "Coding & Debugging": 2.8584905660377355, "Planning & Reasoning": 3.9461077844311383, "Information/Advice seeking": 4.732673267326733, "Math & Data Analysis": 2.1195219123505975 }, "total": 1023, "avg_len": 2480.6490713587486 }, "dbrx-instruct@together": { "model": "dbrx-instruct@together", "score": 6.777126099706745, "adjusted_score": 3.55425219941349, "task_macro_score": 3.2598891595850845, "adjusted_task_macro_score": 3.2598891595850845, "task_categorized_scores": { "Creative Tasks": 4.232558139534884, "Coding & Debugging": 2.644549763033176, "Planning & Reasoning": 3.6227544910179645, "Information/Advice seeking": 4.108910891089108, "Math & Data Analysis": 2.4523809523809526 }, "total": 1023, "avg_len": 2576.5190615835777 }, "command-r": { "model": "command-r", "score": 6.7529296875, "adjusted_score": 3.505859375, "task_macro_score": 2.9533143228506247, "adjusted_task_macro_score": 2.9533143228506247, "task_categorized_scores": { "Creative Tasks": 4.7441860465116275, "Coding & Debugging": 1.933962264150944, "Planning & Reasoning": 3.461883408071749, "Information/Advice seeking": 4.410891089108912, "Math & Data Analysis": 1.6031746031746028 }, "total": 1024, "avg_len": 2919.423828125 }, "Mixtral-8x7B-Instruct-v0.1": { "model": "Mixtral-8x7B-Instruct-v0.1", "score": 6.75146771037182, "adjusted_score": 3.50293542074364, "task_macro_score": 3.147027304895869, "adjusted_task_macro_score": 3.147027304895869, "task_categorized_scores": { "Creative Tasks": 4.275324675324676, "Coding & Debugging": 2.5023696682464447, "Planning & Reasoning": 3.458646616541353, "Information/Advice seeking": 4.193548387096774, "Math & Data Analysis": 2.2142857142857135 }, "total": 1022, "avg_len": 2653.5813725490198 }, "Starling-LM-7B-beta-ExPO": { "model": "Starling-LM-7B-beta-ExPO", "score": 6.750733137829912, "adjusted_score": 3.5014662756598245, "task_macro_score": 3.1559353823619887, "adjusted_task_macro_score": 3.1559353823619887, "task_categorized_scores": { "Planning & Reasoning": 3.631736526946108, "Information/Advice seeking": 4.2871287128712865, "Coding & Debugging": 2.5308056872037916, "Math & Data Analysis": 1.8571428571428577, "Creative Tasks": 4.430051813471502 }, "total": 1023, "avg_len": 2835.826810176125 }, "reka-flash-20240226": { "model": "reka-flash-20240226", "score": 6.730205278592376, "adjusted_score": 3.460410557184751, "task_macro_score": 3.0363615402031146, "adjusted_task_macro_score": 3.0363615402031146, "task_categorized_scores": { "Planning & Reasoning": 3.501501501501501, "Information/Advice seeking": 4.153465346534654, "Coding & Debugging": 2.2085308056872037, "Math & Data Analysis": 2.048, "Creative Tasks": 4.244155844155845 }, "total": 1023, "avg_len": 2103.0098039215686 }, "Starling-LM-7B-beta": { "model": "Starling-LM-7B-beta", "score": 6.70869990224829, "adjusted_score": 3.417399804496579, "task_macro_score": 3.016944980829014, "adjusted_task_macro_score": 3.016944980829014, "task_categorized_scores": { "Planning & Reasoning": 3.405082212257101, "Information/Advice seeking": 4.188118811881187, "Coding & Debugging": 2.436018957345972, "Math & Data Analysis": 1.6984126984126977, "Creative Tasks": 4.379220779220779 }, "total": 1023, "avg_len": 2797.807240704501 }, "Nous-Hermes-2-Mixtral-8x7B-DPO": { "model": "Nous-Hermes-2-Mixtral-8x7B-DPO", "score": 6.6611165523996085, "adjusted_score": 3.322233104799217, "task_macro_score": 3.071140030667612, "adjusted_task_macro_score": 3.071140030667612, "task_categorized_scores": { "Creative Tasks": 3.792207792207792, "Coding & Debugging": 2.6037735849056602, "Planning & Reasoning": 3.424287856071963, "Information/Advice seeking": 3.9752475247524757, "Math & Data Analysis": 2.1752988047808763 }, "total": 1021, "avg_len": 2874.541625857003 }, "Meta-Llama-3-8B-Instruct": { "model": "Meta-Llama-3-8B-Instruct", "score": 6.658846529814272, "adjusted_score": 3.317693059628544, "task_macro_score": 2.920277208638918, "adjusted_task_macro_score": 2.920277208638918, "task_categorized_scores": { "Creative Tasks": 4.356589147286822, "Coding & Debugging": 2.19811320754717, "Planning & Reasoning": 3.4401197604790426, "Information/Advice seeking": 3.9306930693069315, "Math & Data Analysis": 1.6972111553784863 }, "total": 1023, "avg_len": 2975.1876832844573 }, "Hermes-2-Theta-Llama-3-8B": { "model": "Hermes-2-Theta-Llama-3-8B", "score": 6.64711632453568, "adjusted_score": 3.2942326490713594, "task_macro_score": 2.9635207776375476, "adjusted_task_macro_score": 2.9635207776375476, "task_categorized_scores": { "Creative Tasks": 3.9793281653746764, "Coding & Debugging": 2.3113207547169807, "Planning & Reasoning": 3.365269461077844, "Information/Advice seeking": 4.158415841584159, "Math & Data Analysis": 1.8725099601593627 }, "total": 1023, "avg_len": 2742.169110459433 }, "tulu-2-dpo-70b": { "model": "tulu-2-dpo-70b", "score": 6.6412512218963835, "adjusted_score": 3.282502443792767, "task_macro_score": 2.7983756123225105, "adjusted_task_macro_score": 2.7983756123225105, "task_categorized_scores": { "Planning & Reasoning": 3.230538922155688, "Information/Advice seeking": 4.0693069306930685, "Coding & Debugging": 2.0663507109004744, "Math & Data Analysis": 1.4841269841269842, "Creative Tasks": 4.270129870129869 }, "total": 1023, "avg_len": 2908.0714285714284 }, "gemma-2-2b-it": { "model": "gemma-2-2b-it", "score": 6.636007827788649, "adjusted_score": 3.272015655577299, "task_macro_score": 2.7826043214654264, "adjusted_task_macro_score": 2.7826043214654264, "task_categorized_scores": { "Creative Tasks": 4.361757105943152, "Coding & Debugging": 1.7904761904761912, "Planning & Reasoning": 3.3811659192825108, "Information/Advice seeking": 3.990099009900991, "Math & Data Analysis": 1.579365079365079 }, "total": 1022, "avg_len": 3589.3894324853227 }, "gpt-3.5-turbo-0125": { "model": "gpt-3.5-turbo-0125", "score": 6.613880742913001, "adjusted_score": 3.2277614858260026, "task_macro_score": 3.0015986071959313, "adjusted_task_macro_score": 3.0015986071959313, "task_categorized_scores": { "Creative Tasks": 3.7416020671834627, "Coding & Debugging": 2.654028436018958, "Planning & Reasoning": 3.3393124065769797, "Information/Advice seeking": 3.6485148514851478, "Math & Data Analysis": 2.158730158730158 }, "total": 1023, "avg_len": 1844.13880742913 }, "SELM-Zephyr-7B-iter-3": { "model": "SELM-Zephyr-7B-iter-3", "score": 6.576171875, "adjusted_score": 3.15234375, "task_macro_score": 2.5061899136983596, "adjusted_task_macro_score": 2.5061899136983596, "task_categorized_scores": { "Creative Tasks": 4.470284237726098, "Coding & Debugging": 1.1037735849056602, "Planning & Reasoning": 3.158682634730539, "Information/Advice seeking": 4.099009900990099, "Math & Data Analysis": 1.2669322709163353 }, "total": 1024, "avg_len": 2823.7800586510266 }, "Mistral-7B-Instruct-v0.2": { "model": "Mistral-7B-Instruct-v0.2", "score": 6.534701857282503, "adjusted_score": 3.0694037145650057, "task_macro_score": 2.563372831895388, "adjusted_task_macro_score": 2.563372831895388, "task_categorized_scores": { "Creative Tasks": 4.207253886010363, "Coding & Debugging": 1.8396226415094343, "Planning & Reasoning": 3.0059880239520957, "Information/Advice seeking": 4.009925558312656, "Math & Data Analysis": 1.007936507936508 }, "total": 1023, "avg_len": 2832.3440860215055 }, "neo_7b_instruct_v0.1": { "model": "neo_7b_instruct_v0.1", "score": 6.4599609375, "adjusted_score": 2.919921875, "task_macro_score": 2.5019233576987165, "adjusted_task_macro_score": 2.5019233576987165, "task_categorized_scores": { "Planning & Reasoning": 3.144992526158445, "Information/Advice seeking": 3.6336633663366342, "Coding & Debugging": 1.402843601895734, "Math & Data Analysis": 1.5, "Creative Tasks": 3.948186528497409 }, "total": 1024, "avg_len": 3735.800586510264 }, "neo_7b_instruct_v0.1-ExPO": { "model": "neo_7b_instruct_v0.1-ExPO", "score": 6.381231671554252, "adjusted_score": 2.7624633431085037, "task_macro_score": 2.3114172189706186, "adjusted_task_macro_score": 2.3114172189706186, "task_categorized_scores": { "Planning & Reasoning": 2.8669656203288483, "Information/Advice seeking": 3.4851485148514847, "Coding & Debugging": 1.276190476190477, "Math & Data Analysis": 1.2589641434262955, "Creative Tasks": 3.8549222797927456 }, "total": 1023, "avg_len": 4107.917808219178 }, "Qwen1.5-7B-Chat@together": { "model": "Qwen1.5-7B-Chat@together", "score": 6.36852394916911, "adjusted_score": 2.7370478983382203, "task_macro_score": 2.342316313940188, "adjusted_task_macro_score": 2.342316313940188, "task_categorized_scores": { "Creative Tasks": 3.829457364341085, "Coding & Debugging": 1.488151658767773, "Planning & Reasoning": 2.8878923766816147, "Information/Advice seeking": 3.400990099009901, "Math & Data Analysis": 1.1904761904761898 }, "total": 1023, "avg_len": 2519.4203323558163 }, "Llama-2-70b-chat-hf": { "model": "Llama-2-70b-chat-hf", "score": 6.345703125, "adjusted_score": 2.69140625, "task_macro_score": 2.0659636912866643, "adjusted_task_macro_score": 2.0659636912866643, "task_categorized_scores": { "Planning & Reasoning": 2.684684684684685, "Information/Advice seeking": 3.830845771144279, "Coding & Debugging": 0.9333333333333336, "Math & Data Analysis": 0.41767068273092356, "Creative Tasks": 4.0 }, "total": 1024, "avg_len": 3138.3179587831205 }, "Yi-1.5-6B-Chat": { "model": "Yi-1.5-6B-Chat", "score": 6.263929618768328, "adjusted_score": 2.5278592375366564, "task_macro_score": 2.3318116689149884, "adjusted_task_macro_score": 2.3318116689149884, "task_categorized_scores": { "Planning & Reasoning": 2.72972972972973, "Information/Advice seeking": 3.1414392059553347, "Coding & Debugging": 1.6587677725118475, "Math & Data Analysis": 1.6799999999999997, "Creative Tasks": 3.108808290155441 }, "total": 1023, "avg_len": 3899.4686274509804 }, "reka-edge": { "model": "reka-edge", "score": 6.159335288367546, "adjusted_score": 2.3186705767350926, "task_macro_score": 2.1252257932999665, "adjusted_task_macro_score": 2.1252257932999665, "task_categorized_scores": { "Planning & Reasoning": 2.5007727975270484, "Information/Advice seeking": 3.4389610389610397, "Coding & Debugging": 1.3526570048309186, "Math & Data Analysis": 0.8897959183673461, "Creative Tasks": 3.618037135278515 }, "total": 1023, "avg_len": 2417.351106639839 }, "Llama-2-7b-chat-hf": { "model": "Llama-2-7b-chat-hf", "score": 5.761252446183953, "adjusted_score": 1.5225048923679054, "task_macro_score": 0.8262075264042464, "adjusted_task_macro_score": 0.8262075264042464, "task_categorized_scores": { "Planning & Reasoning": 1.5428571428571427, "Information/Advice seeking": 2.766169154228855, "Coding & Debugging": -0.6794258373205739, "Math & Data Analysis": -0.7177419354838701, "Creative Tasks": 2.976623376623376 }, "total": 1022, "avg_len": 2985.1052114060963 }, "gemma-7b-it": { "model": "gemma-7b-it", "score": 5.5087890625, "adjusted_score": 1.017578125, "task_macro_score": 0.661975914869064, "adjusted_task_macro_score": 0.661975914869064, "task_categorized_scores": { "Planning & Reasoning": 1.0164424514200299, "Information/Advice seeking": 1.272277227722773, "Coding & Debugging": 0.18009478672985857, "Math & Data Analysis": -0.36507936507936556, "Creative Tasks": 2.119170984455959 }, "total": 1024, "avg_len": 1726.3440860215053 }, "gemma-2b-it": { "model": "gemma-2b-it", "score": 4.737512242899118, "adjusted_score": -0.5249755142017634, "task_macro_score": -0.9691930072258819, "adjusted_task_macro_score": -0.9691930072258819, "task_categorized_scores": { "Planning & Reasoning": -0.5795795795795797, "Information/Advice seeking": -0.2133995037220835, "Coding & Debugging": -1.7725118483412317, "Math & Data Analysis": -1.8645418326693228, "Creative Tasks": 0.7220779220779221 }, "total": 1021, "avg_len": 1590.0833333333333 } }