lm_eval/tasks/sciq/sciq.yaml:

task: sciq
dataset_path: sciq
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
doc_to_target: 3
doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
should_decontaminate: true
doc_to_decontamination_query: "{{support}} {{question}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
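
Below is a minimal Python sketch, not part of the YAML, of how the templates above render one SciQ record: `doc_to_text` builds the prompt, `doc_to_choice` lists the three distractors followed by the correct answer, and `doc_to_target: 3` therefore always points at `correct_answer`. The record shown is a made-up example; only the field names are taken from the dataset columns referenced in the config.

```python
# Sketch only: mirrors the Jinja templates in sciq.yaml with plain Python
# f-strings. The record below is a hypothetical SciQ-style example.
doc = {
    "support": "  Mesophiles grow best at moderate temperatures.",
    "question": "What term is used for organisms that grow best at moderate temperatures?",
    "distractor1": "thermophiles",
    "distractor2": "psychrophiles",
    "distractor3": "halophiles",
    "correct_answer": "mesophiles",
}

# doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
prompt = f"{doc['support'].lstrip()}\nQuestion: {doc['question']}\nAnswer:"

# doc_to_choice: the three distractors followed by the correct answer,
# which is why doc_to_target can be the fixed index 3.
choices = [doc["distractor1"], doc["distractor2"],
           doc["distractor3"], doc["correct_answer"]]
target_index = 3  # doc_to_target

print(prompt)
print(choices[target_index])  # "mesophiles"
```

With the config registered under its `task` name, the task is typically selected as `--tasks sciq` when invoking the harness; `acc` and `acc_norm` are then aggregated as the mean over the test split, per the `metric_list` above.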