- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - *. files (419): tests/testdata/qa4mre_2011-v0-loglikelihood tests/testdata/pile_freelaw-v0-loglikelihood_rolling tests/testdata/hendrycksTest-formal_logic-v0-loglikelihood tests/testdata/openbookqa-v0-loglikelihood tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood tests/testdata/hendrycksTest-astronomy-v0-loglikelihood tests/testdata/cb-v1-loglikelihood tests/testdata/sciq-v0-loglikelihood tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood tests/testdata/crows_pairs_english_gender-v0-loglikelihood tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling tests/testdata/math_num_theory-v1-greedy_until tests/testdata/lambada_openai_mt_it-v0-loglikelihood tests/testdata/blimp_drop_argument-v0-loglikelihood tests/testdata/blimp_only_npi_scope-v0-loglikelihood tests/testdata/wmt20-de-en-v0-greedy_until tests/testdata/triviaqa-v0-loglikelihood tests/testdata/arithmetic_3da-v0-loglikelihood tests/testdata/pile_philpapers-v0-loglikelihood_rolling tests/testdata/wmt20-en-zh-v1-greedy_until tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling tests/testdata/wmt20-ta-en-v0-greedy_until tests/testdata/blimp_existential_there_subject_raising-v0-loglikelihood tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood tests/testdata/webqs-v0-loglikelihood tests/testdata/crows_pairs_french_religion-v0-loglikelihood tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood tests/testdata/blimp_transitive-v0-loglikelihood tests/testdata/arithmetic_1dc-v0-loglikelihood tests/testdata/hendrycksTest-professional_accounting-v0-loglikelihood tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling tests/testdata/wmt20-en-ja-v0-greedy_until tests/testdata/pile_github-v1-loglikelihood_rolling tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling tests/testdata/headqa_es-v0-loglikelihood tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood tests/testdata/coqa-v1-greedy_until tests/testdata/pile_wikipedia-v0-loglikelihood_rolling tests/testdata/hendrycksTest-computer_security-v0-loglikelihood tests/testdata/pile_arxiv-v0-loglikelihood_rolling tests/testdata/piqa-v0-loglikelihood tests/testdata/race-v0-loglikelihood tests/testdata/mutual_plus-v1-loglikelihood tests/testdata/drop-v0-greedy_until tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood tests/testdata/pile_opensubtitles-v0-loglikelihood_rolling tests/testdata/lambada_openai_mt_es-v0-loglikelihood tests/testdata/hendrycksTest-college_biology-v0-loglikelihood tests/testdata/crows_pairs_french_sexual_orientation-v0-loglikelihood tests/testdata/crows_pairs_english_autre-v0-loglikelihood tests/testdata/blimp_npi_present_2-v0-loglikelihood tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood tests/testdata/arithmetic_5da-v0-loglikelihood tests/testdata/wmt20-pl-en-v0-greedy_until tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood tests/testdata/arithmetic_2da-v0-loglikelihood tests/testdata/hendrycksTest-abstract_algebra-v0-loglikelihood tests/testdata/blimp_adjunct_island-v0-loglikelihood tests/testdata/blimp_existential_there_quantifiers_1-v0-loglikelihood tests/testdata/squad2-v0-loglikelihood tests/testdata/blimp_tough_vs_raising_1-v0-loglikelihood tests/testdata/lambada_standard_cloze-v0-loglikelihood tests/testdata/random_insertion-v0-greedy_until tests/testdata/sst-v0-loglikelihood tests/testdata/blimp_expletive_it_object_raising-v0-loglikelihood tests/testdata/hendrycksTest-global_facts-v0-loglikelihood tests/testdata/blimp_principle_A_domain_1-v0-loglikelihood tests/testdata/pubmedqa-v0-loglikelihood tests/testdata/math_prealgebra-v0-greedy_until tests/testdata/arithmetic_3ds-v0-loglikelihood tests/testdata/drop-v1-greedy_until tests/testdata/blimp_animate_subject_trans-v0-loglikelihood tests/testdata/squad2-v1-loglikelihood tests/testdata/wmt20-en-pl-v0-greedy_until tests/testdata/pile_enron-v0-loglikelihood_rolling tests/testdata/math_precalc-v0-greedy_until tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood tests/testdata/math_geometry-v1-greedy_until tests/testdata/lambada_openai_mt_en-v0-loglikelihood tests/testdata/pile_wikipedia-v1-loglikelihood_rolling tests/testdata/wmt20-en-cs-v0-greedy_until tests/testdata/pile_pubmed-abstracts-v1-loglikelihood_rolling tests/testdata/blimp_sentential_negation_npi_scope-v0-loglikelihood tests/testdata/squad2-v0-greedy_until tests/testdata/mc_taco-v0-loglikelihood tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood tests/testdata/wmt16-en-ro-v0-greedy_until tests/testdata/wikitext-v1-loglikelihood_rolling tests/testdata/pile_github-v0-loglikelihood_rolling tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-loglikelihood tests/testdata/ethics_cm-v0-loglikelihood tests/testdata/blimp_principle_A_case_1-v0-loglikelihood tests/testdata/blimp_inchoative-v0-loglikelihood tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood tests/testdata/blimp_distractor_agreement_relational_noun-v0-loglikelihood tests/testdata/rte-v0-loglikelihood tests/testdata/lambada_mt_de-v0-loglikelihood tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood tests/testdata/crows_pairs_english_socioeconomic-v0-loglikelihood tests/testdata/blimp_passive_1-v0-loglikelihood tests/testdata/math_algebra-v1-greedy_until tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood tests/testdata/crows_pairs_english_physical_appearance-v0-loglikelihood tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood tests/testdata/crows_pairs_english-v0-loglikelihood tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood tests/testdata/wmt20-ps-en-v0-greedy_until tests/testdata/math_geometry-v0-greedy_until tests/testdata/wmt16-en-de-v0-greedy_until tests/testdata/hendrycksTest-logical_fallacies-v0-loglikelihood tests/testdata/crows_pairs_english_religion-v0-loglikelihood tests/testdata/mutual_plus-v0-loglikelihood tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling tests/testdata/hendrycksTest-prehistory-v0-loglikelihood tests/testdata/hendrycksTest-medical_genetics-v0-loglikelihood tests/testdata/pile_dm-mathematics-v1-loglikelihood_rolling tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood tests/testdata/logiqa-v0-loglikelihood tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood tests/testdata/blimp_wh_questions_subject_gap-v0-loglikelihood tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood tests/testdata/hendrycksTest-human_aging-v0-loglikelihood tests/testdata/wmt20-ja-en-v0-greedy_until tests/testdata/wmt20-en-ps-v0-greedy_until tests/testdata/truthfulqa_gen-v1-greedy_until tests/testdata/ethics_deontology-v0-loglikelihood tests/testdata/blimp_npi_present_1-v0-loglikelihood tests/testdata/crows_pairs_french-v0-loglikelihood tests/testdata/pile_freelaw-v1-loglikelihood_rolling tests/testdata/arithmetic_2ds-v0-loglikelihood tests/testdata/hendrycksTest-professional_medicine-v0-loglikelihood tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood tests/testdata/pile_hackernews-v1-loglikelihood_rolling tests/testdata/cola-v0-loglikelihood tests/testdata/gsm8k-v0-greedy_until tests/testdata/wmt14-en-fr-v0-greedy_until tests/testdata/crows_pairs_english_disability-v0-loglikelihood tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood tests/testdata/wmt20-en-zh-v0-greedy_until tests/testdata/crows_pairs_english_sexual_orientation-v0-loglikelihood tests/testdata/wsc-v0-loglikelihood tests/testdata/mnli_mismatched-v0-loglikelihood tests/testdata/blimp_sentential_subject_island-v0-loglikelihood tests/testdata/wikitext-v0-loglikelihood_rolling tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood tests/testdata/hendrycksTest-anatomy-v0-loglikelihood tests/testdata/toxigen-v0-loglikelihood tests/testdata/mnli-v0-loglikelihood tests/testdata/pile_dm-mathematics-v0-loglikelihood_rolling tests/testdata/qqp-v0-loglikelihood tests/testdata/pile_arxiv-v1-loglikelihood_rolling tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood tests/testdata/pile_books3-v1-loglikelihood_rolling tests/testdata/wmt20-en-de-v0-greedy_until tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood tests/testdata/blimp_only_npi_licensor_present-v0-loglikelihood tests/testdata/hendrycksTest-public_relations-v0-loglikelihood tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood tests/testdata/blimp_matrix_question_npi_licensor_present-v0-loglikelihood tests/testdata/arc_easy-v0-loglikelihood tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood tests/testdata/math_prealgebra-v1-greedy_until tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood tests/testdata/lambada_openai-v0-loglikelihood tests/testdata/wmt14-fr-en-v0-greedy_until tests/testdata/hendrycksTest-us_foreign_policy-v0-loglikelihood tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood tests/testdata/math_intermediate_algebra-v0-greedy_until tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood tests/testdata/math_counting_and_prob-v0-greedy_until tests/testdata/qnli-v0-loglikelihood tests/testdata/wmt20-km-en-v0-greedy_until tests/testdata/crows_pairs_english_age-v0-loglikelihood tests/testdata/wmt20-en-iu-v0-greedy_until tests/testdata/pile_europarl-v0-loglikelihood_rolling tests/testdata/wnli-v0-loglikelihood tests/testdata/wic-v0-loglikelihood tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood tests/testdata/coqa-v0-greedy_until tests/testdata/arc_challenge-v0-loglikelihood tests/testdata/ethics_utilitarianism_original-v0-loglikelihood tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood tests/testdata/wmt16-ro-en-v0-greedy_until tests/testdata/anli_r3-v0-loglikelihood tests/testdata/cb-v0-loglikelihood tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling tests/testdata/wmt20-ru-en-v0-greedy_until tests/testdata/truthfulqa_mc-v0-loglikelihood tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood tests/testdata/blimp_principle_A_c_command-v0-loglikelihood tests/testdata/hendrycksTest-human_sexuality-v0-loglikelihood tests/testdata/pile_books3-v0-loglikelihood_rolling tests/testdata/blimp_animate_subject_passive-v0-loglikelihood tests/testdata/pile_youtubesubtitles-v0-loglikelihood_rolling tests/testdata/blimp_intransitive-v0-loglikelihood tests/testdata/arithmetic_4ds-v0-loglikelihood tests/testdata/hendrycksTest-security_studies-v0-loglikelihood tests/testdata/pile_pile-cc-v0-loglikelihood_rolling tests/testdata/wmt20-en-ta-v0-greedy_until tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood tests/testdata/arithmetic_2dm-v0-loglikelihood tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood tests/testdata/pile_gutenberg-v1-loglikelihood_rolling tests/testdata/truthfulqa_mc-v1-loglikelihood tests/testdata/crows_pairs_french_autre-v0-loglikelihood tests/testdata/anagrams1-v0-greedy_until tests/testdata/crows_pairs_french_socioeconomic-v0-loglikelihood tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood tests/testdata/hendrycksTest-elementary_mathematics-v0-loglikelihood tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood tests/testdata/wnli-v1-loglikelihood tests/testdata/anagrams2-v0-greedy_until tests/testdata/mathqa-v0-loglikelihood tests/testdata/prost-v0-loglikelihood tests/testdata/lambada_mt_it-v0-loglikelihood tests/testdata/crows_pairs_french_disability-v0-loglikelihood tests/testdata/blimp_causative-v0-loglikelihood tests/testdata/crows_pairs_english_nationality-v0-loglikelihood tests/testdata/hendrycksTest-econometrics-v0-loglikelihood tests/testdata/hendrycksTest-college_chemistry-v0-loglikelihood tests/testdata/crows_pairs_french_physical_appearance-v0-loglikelihood tests/testdata/hendrycksTest-high_school_psychology-v0-loglikelihood tests/testdata/hendrycksTest-philosophy-v0-loglikelihood tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood tests/testdata/pile_europarl-v1-loglikelihood_rolling tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood tests/testdata/ethics_justice-v0-loglikelihood tests/testdata/blimp_complex_NP_island-v0-loglikelihood tests/testdata/hellaswag-v0-loglikelihood tests/testdata/blimp_principle_A_case_2-v0-loglikelihood tests/testdata/wmt20-cs-en-v0-greedy_until tests/testdata/hendrycksTest-college_physics-v0-loglikelihood tests/testdata/pile_stackexchange-v1-loglikelihood_rolling tests/testdata/record-v0-loglikelihood tests/testdata/lambada_mt_fr-v0-loglikelihood tests/testdata/swag-v0-loglikelihood tests/testdata/boolq-v1-loglikelihood tests/testdata/pile_opensubtitles-v1-loglikelihood_rolling tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood tests/testdata/pile_uspto-v0-loglikelihood_rolling tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling tests/testdata/wsc273-v0-loglikelihood tests/testdata/cycle_letters-v0-greedy_until tests/testdata/pile_stackexchange-v0-loglikelihood_rolling tests/testdata/pile_hackernews-v0-loglikelihood_rolling tests/testdata/headqa-v0-loglikelihood tests/testdata/math_intermediate_algebra-v1-greedy_until tests/testdata/blimp_wh_vs_that_no_gap-v0-loglikelihood tests/testdata/mutual-v0-loglikelihood tests/testdata/hendrycksTest-world_religions-v0-loglikelihood tests/testdata/squad2-v1-greedy_until tests/testdata/ethics_utilitarianism-v0-loglikelihood tests/testdata/hendrycksTest-professional_law-v0-loglikelihood tests/testdata/triviaqa-v1-loglikelihood tests/testdata/pile_bookcorpus2-v1-loglikelihood_rolling tests/testdata/pile_pile-cc-v1-loglikelihood_rolling tests/testdata/math_precalc-v1-greedy_until tests/testdata/crows_pairs_english_race_color-v0-loglikelihood tests/testdata/copa-v0-loglikelihood tests/testdata/blimp_tough_vs_raising_2-v0-loglikelihood tests/testdata/blimp_passive_2-v0-loglikelihood tests/testdata/pile_openwebtext2-v0-loglikelihood_rolling tests/testdata/crows_pairs_french_race_color-v0-loglikelihood tests/testdata/ethics_virtue-v0-loglikelihood tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood tests/testdata/wmt16-de-en-v0-greedy_until tests/testdata/pile_youtubesubtitles-v1-loglikelihood_rolling tests/testdata/hendrycksTest-management-v0-loglikelihood tests/testdata/pile_enron-v1-loglikelihood_rolling tests/testdata/pile_uspto-v1-loglikelihood_rolling tests/testdata/iwslt17-en-ar-v0-greedy_until tests/testdata/lambada_standard-v0-loglikelihood tests/testdata/blimp_wh_island-v0-loglikelihood tests/testdata/iwslt17-ar-en-v0-greedy_until tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling tests/testdata/multirc-v1-loglikelihood tests/testdata/crows_pairs_french_nationality-v0-loglikelihood tests/testdata/lambada_mt_en-v0-loglikelihood tests/testdata/qa4mre_2012-v0-loglikelihood tests/testdata/boolq-v0-loglikelihood tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood tests/testdata/headqa_en-v0-loglikelihood tests/testdata/lambada_openai_cloze-v0-loglikelihood tests/testdata/math_algebra-v0-greedy_until tests/testdata/hendrycksTest-sociology-v0-loglikelihood tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood tests/testdata/lambada-v0-loglikelihood tests/testdata/pile_gutenberg-v0-loglikelihood_rolling tests/testdata/wmt20-iu-en-v0-greedy_until tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood tests/testdata/mrpc-v0-loglikelihood tests/testdata/wmt20-de-fr-v0-greedy_until tests/testdata/lambada_mt_es-v0-loglikelihood tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood tests/testdata/multirc-v0-loglikelihood tests/testdata/anli_r1-v0-loglikelihood tests/testdata/lambada_openai_mt_fr-v0-loglikelihood tests/testdata/math_num_theory-v0-greedy_until tests/testdata/hendrycksTest-marketing-v0-loglikelihood tests/testdata/crows_pairs_french_gender-v0-loglikelihood tests/testdata/hendrycksTest-high_school_physics-v0-loglikelihood tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood tests/testdata/crows_pairs_french_age-v0-loglikelihood tests/testdata/blimp_irregular_past_participle_adjectives-v0-loglikelihood tests/testdata/wmt20-fr-de-v0-greedy_until tests/testdata/pile_philpapers-v1-loglikelihood_rolling tests/testdata/math_counting_and_prob-v1-greedy_until tests/testdata/lambada_cloze-v0-loglikelihood tests/testdata/hendrycksTest-conceptual_physics-v0-loglikelihood tests/testdata/hendrycksTest-machine_learning-v0-loglikelihood tests/testdata/hendrycksTest-high_school_microeconomics-v0-loglikelihood tests/testdata/arithmetic_5ds-v0-loglikelihood tests/testdata/mutual-v1-loglikelihood tests/testdata/hendrycksTest-international_law-v0-loglikelihood tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood tests/testdata/hendrycksTest-virology-v0-loglikelihood tests/testdata/truthfulqa_gen-v0-greedy_until tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood tests/testdata/hendrycksTest-nutrition-v0-loglikelihood tests/testdata/lambada_openai_mt_de-v0-loglikelihood tests/testdata/wmt20-en-ja-v1-greedy_until tests/testdata/wmt20-zh-en-v0-greedy_until tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood tests/testdata/pile_pubmed-central-v0-loglikelihood_rolling tests/testdata/anli_r2-v0-loglikelihood tests/testdata/reversed_words-v0-greedy_until tests/testdata/winogrande-v0-loglikelihood tests/testdata/wmt20-en-ru-v0-greedy_until tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood tests/testdata/wmt20-en-km-v0-greedy_until tests/testdata/arithmetic_4da-v0-loglikelihood tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood tests/testdata/qa4mre_2013-v0-loglikelihood lm_eval/tasks/translation/wmt_common_yaml lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml lm_eval/tasks/eus_exams/eus_exams lm_eval/tasks/eus_exams/eus_exams_eu lm_eval/tasks/eus_exams/eus_exams_es lm_eval/tasks/model_written_evals/persona/_template_yaml lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml lm_eval/tasks/arabicmmlu/_default_arabicmmlu_template_yaml lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml lm_eval/tasks/haerae/_default_haerae_yaml lm_eval/tasks/benchmarks/flan/_held_in_template_yaml lm_eval/tasks/xnli/xnli_common_yaml lm_eval/tasks/tmmluplus/default/_default_template_yaml lm_eval/tasks/leaderboard/musr/_template_yaml lm_eval/tasks/leaderboard/math/_template_yaml lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml lm_eval/tasks/leaderboard/gpqa/_template_yaml lm_eval/tasks/paws-x/pawsx_template_yaml lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml lm_eval/tasks/mmlu/default/_default_template_yaml lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml lm_eval/tasks/mmlu/continuation/_continuation_template_yaml lm_eval/tasks/mmlu/generative/_default_template_yaml lm_eval/tasks/csatqa/_default_csatqa_yaml lm_eval/tasks/unitxt/unitxt lm_eval/tasks/french_bench/_default_template_yaml lm_eval/tasks/blimp/_template_yaml lm_eval/tasks/ceval/_default_ceval_yaml lm_eval/tasks/afrimgsm/translate/translate_direct_yaml lm_eval/tasks/afrimgsm/direct/direct_yaml lm_eval/tasks/afrimgsm/en_cot/cot_yaml lm_eval/tasks/xwinograd/xwinograd_common_yaml lm_eval/tasks/bigbench/multiple_choice_template_b_yaml lm_eval/tasks/bigbench/generate_until_template_yaml lm_eval/tasks/bigbench/multiple_choice_template_a_yaml lm_eval/tasks/mgsm/direct/direct_yaml lm_eval/tasks/mgsm/native_cot/cot_yaml lm_eval/tasks/mgsm/en_cot/cot_yaml lm_eval/tasks/okapi/hellaswag_multilingual/_hellaswag_yaml lm_eval/tasks/okapi/arc_multilingual/_arc_yaml lm_eval/tasks/okapi/mmlu_multilingual/_default_yaml lm_eval/tasks/okapi/truthfulqa_multilingual/_truthfulqa_mc2_yaml lm_eval/tasks/okapi/truthfulqa_multilingual/_truthfulqa_mc1_yaml lm_eval/tasks/mmlusr/question_only/_mmlusr_q_yml lm_eval/tasks/mmlusr/question_and_answer/_mmlusr_qna_yml lm_eval/tasks/mmlusr/answer_only/_mmlusr_a_yml lm_eval/tasks/aclue/_default_template_yaml lm_eval/tasks/xnli_eu/xnli_common_yaml lm_eval/tasks/belebele/_default_template_yaml lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml lm_eval/tasks/paloma/_paloma_template lm_eval/tasks/inverse_scaling/_some_results lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml lm_eval/tasks/bertaqa/_bertaqa_template lm_eval/tasks/wmdp/_default_template_yaml lm_eval/tasks/med_concepts_qa/_default_template_yaml lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml lm_eval/tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml lm_eval/tasks/aexams/_default_template_yaml lm_eval/tasks/cmmlu/_default_template_yaml lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_yaml lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_yaml lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_yaml lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_yaml lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_yaml lm_eval/tasks/gpqa/cot_zeroshot/_gpqa_cot_zeroshot_yaml lm_eval/tasks/gpqa/cot_n_shot/_gpqa_cot_n_shot_yaml lm_eval/tasks/gpqa/zeroshot/_gpqa_zeroshot_yaml lm_eval/tasks/gpqa/n_shot/_gpqa_n_shot_yaml lm_eval/tasks/gpqa/generative/_gpqa_generative_n_shot_yaml CODEOWNERS - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - *.yaml files (3): .github/workflows/unit_tests.yml .github/workflows/publish.yml .github/workflows/new_tasks.yml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - *.0-loglikelihood files (2): tests/testdata/arc_challenge-v2.0-loglikelihood tests/testdata/lambada_openai-v2.0-loglikelihood - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - *.tsv files (1): lm_eval/tasks/tmmluplus/subject.tsv - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - *.png files (1): docs/img/fewshot_example_gpt3.png - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - *.coveragerc files (1): .coveragerc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - *.flake8 files (1): .flake8 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -