lm_eval/tasks/piqa/piqa.yaml (23 lines of code) (raw):

# Task configuration for `piqa`, scored as a multiple-choice task.
task: piqa
dataset_path: piqa
dataset_name: null
output_type: multiple_choice

# Train and validation splits are named; no test split is configured.
training_split: train
validation_split: validation
test_split: null

# Prompt template. Double quotes are required so that `\n` is read as a
# newline escape. `goal` is presumably a field of each dataset record --
# TODO confirm against the dataset schema.
doc_to_text: "Question: {{goal}}\nAnswer:"
doc_to_target: label
# Quoted so the leading `{` is not parsed as a YAML flow mapping.
# `sol1` / `sol2` are presumably the two candidate-answer fields -- TODO confirm.
doc_to_choice: "{{[sol1, sol2]}}"

should_decontaminate: true
doc_to_decontamination_query: goal

# Both metrics are mean-aggregated, and higher values are better.
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true

metadata:
  # Unquoted, so this loads as the float 1.0 rather than the string "1.0".
  version: 1.0

dataset_kwargs:
  # NOTE(review): presumably forwarded to the dataset loader, where it would
  # allow code shipped with the dataset to run -- confirm this is intended.
  trust_remote_code: true