# Run entries: every element of `entries` is a mapping with two keys,
# a `description` string and an integer `priority`.
#
# Layout rules for this file:
#   * One entry per line. A `#` comment runs to the end of the physical line,
#     so a group label must never share a line with an entry.
#   * Entries are comma-separated flow mappings. That form is accepted by
#     both YAML and HOCON parsers.
#   * `description` strings stay double-quoted: they contain `:`, `=` and `,`.
#
# NOTE(review): the `description` values look like run-spec strings of the
# form `<scenario>:<key>=<value>,...` - confirm against the consuming loader
# before changing the layout further (e.g. converting to YAML block style).
entries: [
  # ---------------------------------------------------------------------
  # big_bench
  # ---------------------------------------------------------------------

  # auto_debugging
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=auto_debugging,subtask=,max_eval_instances=18", priority: 1},

  # code_line_description
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=code_line_description,subtask=,max_eval_instances=19", priority: 1},

  # conceptual_combinations
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=contradictions,max_eval_instances=3", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=emergent_properties,max_eval_instances=3", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=fanciful_fictional_combinations,max_eval_instances=4", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=homonyms,max_eval_instances=4", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=invented_words,max_eval_instances=4", priority: 1},

  # emoji_movie
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=emoji_movie,subtask=,max_eval_instances=19", priority: 1},

  # formal_fallacies_syllogisms_negation
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=formal_fallacies_syllogisms_negation,subtask=,max_eval_instances=19", priority: 1},

  # known_unknowns
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=known_unknowns,subtask=,max_eval_instances=19", priority: 1},

  # linguistics_puzzles
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=linguistics_puzzles,subtask=,max_eval_instances=18", priority: 1},

  # logic_grid_puzzle
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logic_grid_puzzle,subtask=,max_eval_instances=18", priority: 1},

  # logical_deduction
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=three_objects,max_eval_instances=6", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=five_objects,max_eval_instances=6", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=seven_objects,max_eval_instances=6", priority: 1},

  # novel_concepts
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=novel_concepts,subtask=,max_eval_instances=18", priority: 1},

  # operators
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=operators,subtask=,max_eval_instances=18", priority: 1},

  # play_dialog_same_or_different
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=play_dialog_same_or_different,subtask=,max_eval_instances=18", priority: 1},

  # repeat_copy_logic
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=repeat_copy_logic,subtask=,max_eval_instances=18", priority: 1},

  # strange_stories
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strange_stories,subtask=boolean,max_eval_instances=9", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strange_stories,subtask=multiple_choice,max_eval_instances=9", priority: 1},

  # strategyqa
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strategyqa,subtask=,max_eval_instances=18", priority: 1},

  # symbol_interpretation
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=adversarial,max_eval_instances=3", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=emoji_agnostic,max_eval_instances=3", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=name_agnostic,max_eval_instances=4", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=plain,max_eval_instances=4", priority: 1},
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=tricky,max_eval_instances=4", priority: 1},

  # vitaminc_fact_verification
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=vitaminc_fact_verification,subtask=,max_eval_instances=18", priority: 1},

  # winowhy
  {description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=winowhy,subtask=,max_eval_instances=19", priority: 1},

  # ---------------------------------------------------------------------
  # mmlu (labels below group the `subject=` values by topic area)
  # ---------------------------------------------------------------------

  # medicine_biology
  {description: "mmlu:model=neurips/local,subject=anatomy,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=college_medicine,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=college_biology,data_augmentation=canonical,max_eval_instances=5", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_biology,data_augmentation=canonical,max_eval_instances=5", priority: 1},

  # computer_science
  {description: "mmlu:model=neurips/local,subject=college_computer_science,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_computer_science,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=computer_security,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=electrical_engineering,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=machine_learning,data_augmentation=canonical,max_eval_instances=4", priority: 1},

  # math
  {description: "mmlu:model=neurips/local,subject=high_school_mathematics,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=college_mathematics,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=abstract_algebra,data_augmentation=canonical,max_eval_instances=5", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_statistics,data_augmentation=canonical,max_eval_instances=5", priority: 1},

  # physics_chemistry
  {description: "mmlu:model=neurips/local,subject=college_chemistry,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_chemistry,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_physics,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=college_physics,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=astronomy,data_augmentation=canonical,max_eval_instances=4", priority: 1},

  # formal_reasoning
  {description: "mmlu:model=neurips/local,subject=formal_logic,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=logical_fallacies,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=philosophy,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=moral_disputes,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=moral_scenarios,data_augmentation=canonical,max_eval_instances=4", priority: 1},

  # law
  {description: "mmlu:model=neurips/local,subject=professional_law,data_augmentation=canonical,max_eval_instances=6", priority: 1},
  {description: "mmlu:model=neurips/local,subject=international_law,data_augmentation=canonical,max_eval_instances=6", priority: 1},
  {description: "mmlu:model=neurips/local,subject=jurisprudence,data_augmentation=canonical,max_eval_instances=6", priority: 1},

  # history
  {description: "mmlu:model=neurips/local,subject=high_school_european_history,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_us_history,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_world_history,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=prehistory,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=world_religions,data_augmentation=canonical,max_eval_instances=4", priority: 1},

  # business
  {description: "mmlu:model=neurips/local,subject=business_ethics,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=global_facts,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=management,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=marketing,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=miscellaneous,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=professional_accounting,data_augmentation=canonical,max_eval_instances=3", priority: 1},

  # health
  {description: "mmlu:model=neurips/local,subject=nutrition,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=human_aging,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=clinical_knowledge,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=medical_genetics,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=professional_medicine,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=virology,data_augmentation=canonical,max_eval_instances=3", priority: 1},

  # social_studies
  {description: "mmlu:model=neurips/local,subject=high_school_government_and_politics,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_geography,data_augmentation=canonical,max_eval_instances=3", priority: 1},
  {description: "mmlu:model=neurips/local,subject=us_foreign_policy,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=public_relations,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=security_studies,data_augmentation=canonical,max_eval_instances=4", priority: 1},

  # human_behavior
  {description: "mmlu:model=neurips/local,subject=high_school_psychology,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=human_sexuality,data_augmentation=canonical,max_eval_instances=4", priority: 1},
  {description: "mmlu:model=neurips/local,subject=professional_psychology,data_augmentation=canonical,max_eval_instances=5", priority: 1},
  {description: "mmlu:model=neurips/local,subject=sociology,data_augmentation=canonical,max_eval_instances=5", priority: 1},

  # economics
  {description: "mmlu:model=neurips/local,subject=high_school_microeconomics,data_augmentation=canonical,max_eval_instances=6", priority: 1},
  {description: "mmlu:model=neurips/local,subject=econometrics,data_augmentation=canonical,max_eval_instances=6", priority: 1},
  {description: "mmlu:model=neurips/local,subject=high_school_macroeconomics,data_augmentation=canonical,max_eval_instances=6", priority: 1},

  # ---------------------------------------------------------------------
  # other scenarios
  # ---------------------------------------------------------------------

  # truthful_qa
  {description: "truthful_qa:task=mc_single,model=neurips/local,max_eval_instances=9", priority: 1},

  # summarization_cnndm
  {description: "summarization_cnndm:model=neurips/local,max_eval_instances=9", priority: 1},

  # gsm
  {description: "gsm:model=neurips/local,max_eval_instances=19", priority: 1},

  # bbq
  {description: "bbq:subject=all,model=neurips/local,max_eval_instances=18", priority: 1}
]