# lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml
# Name under which the subtasks listed below are grouped.
group: bbh

# Members of the group: the 27 `bbh_cot_fewshot_*` subtasks, referenced by
# task name. Each name must resolve to a task defined elsewhere.
task:
  - bbh_cot_fewshot_boolean_expressions
  - bbh_cot_fewshot_causal_judgement
  - bbh_cot_fewshot_date_understanding
  - bbh_cot_fewshot_disambiguation_qa
  - bbh_cot_fewshot_dyck_languages
  - bbh_cot_fewshot_formal_fallacies
  - bbh_cot_fewshot_geometric_shapes
  - bbh_cot_fewshot_hyperbaton
  - bbh_cot_fewshot_logical_deduction_five_objects
  - bbh_cot_fewshot_logical_deduction_seven_objects
  - bbh_cot_fewshot_logical_deduction_three_objects
  - bbh_cot_fewshot_movie_recommendation
  - bbh_cot_fewshot_multistep_arithmetic_two
  - bbh_cot_fewshot_navigate
  - bbh_cot_fewshot_object_counting
  - bbh_cot_fewshot_penguins_in_a_table
  - bbh_cot_fewshot_reasoning_about_colored_objects
  - bbh_cot_fewshot_ruin_names
  - bbh_cot_fewshot_salient_translation_error_detection
  - bbh_cot_fewshot_snarks
  - bbh_cot_fewshot_sports_understanding
  - bbh_cot_fewshot_temporal_sequences
  - bbh_cot_fewshot_tracking_shuffled_objects_five_objects
  - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects
  - bbh_cot_fewshot_tracking_shuffled_objects_three_objects
  - bbh_cot_fewshot_web_of_lies
  - bbh_cot_fewshot_word_sorting
# Group-level score definition. Each list entry is ONE mapping describing how
# a single metric is combined across the subtasks; the fields below must be
# nested under the entry (not at the top level of the file), otherwise they
# are parsed as unrelated top-level keys.
aggregate_metric_list:
  - metric: exact_match
    # Combine the per-subtask scores with a mean ...
    aggregation: mean
    # ... weighted by subtask size (presumably number of documents — confirm
    # against the consumer's documentation).
    weight_by_size: true
    # Name of the filter whose scores are aggregated; expected to match a
    # filter defined by the subtasks — verify against their configs.
    filter_list: get-answer
# Free-form metadata for this group. `version` must be nested under
# `metadata`; at column 0 it would leave `metadata` null and create a separate
# top-level `version` key.
metadata:
  # Kept unquoted (numeric) as in the original; quoting it would change the
  # value's type for the consumer.
  version: 2.0