assets/aml-benchmark/scripts/data

# Copyright (c) Microsoft Corporation. # Licensed under the MIT License. """Data Loading Script for MMLU.""" import csv import datasets _CITATION = """\ @article{hendryckstest2021, title={Measuring Massive Multitask Language Understanding}, author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and \ Jacob Steinhardt}, journal={Proceedings of the International Conference on Learning Representations (ICLR)}, year={2021} } """ _DESCRIPTION = """\ This is a massive multitask test consisting of multiple-choice questions from various \ branches of knowledge, covering 57 subjects including elementary mathematics, US history, \ computer science, law, and more.\ """ _HOMEPAGE = "https://github.com/hendrycks/test" _LICENSE = "MIT" _URL = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" _SUBJECTS = [ "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", "high_school_european_history", "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics", "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history", "high_school_world_history", "human_aging", "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing", "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting", "professional_law", "professional_medicine", "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions", ] class MMLU(datasets.GeneratorBasedBuilder): """MMLU dataset.""" BUILDER_CONFIGS = [ datasets.BuilderConfig( name=sub, version=datasets.Version("1.0.0"), description=f"MMLU - Subject: {sub}", ) for sub in _SUBJECTS ] def _info(self): features = datasets.Features( { "question": datasets.Value("string"), "subject": datasets.Value("string"), "choices": datasets.features.Sequence(datasets.Value("string")), "answer": datasets.features.ClassLabel( num_classes=4, names=["A", "B", "C", "D"] ), } ) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) def _split_generators(self, dl_manager): """Return SplitGenerators.""" archive = dl_manager.download(_URL) iter_archive = dl_manager.iter_archive(archive) return [ datasets.SplitGenerator( name=datasets.Split("auxiliary_train"), gen_kwargs={ "iter_archive": iter_archive, "split": "auxiliary_train", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "iter_archive": iter_archive, "split": "test", }, ), datasets.SplitGenerator( name=datasets.Split("val"), gen_kwargs={ "iter_archive": iter_archive, "split": "val", }, ), datasets.SplitGenerator( name=datasets.Split("dev"), gen_kwargs={ "iter_archive": iter_archive, "split": "dev", }, ), ] def _generate_examples(self, iter_archive, split): """Yield examples as a tuple.""" for id_file, (path, file) in enumerate(iter_archive): if f"data/{split}/" in path: if split == "auxiliary_train" or f"{self.config.name}_{split}.csv" in path: subject = self.config.name if split != "auxiliary_train" else "" lines = (line.decode("utf-8") for line in file) reader = csv.reader(lines) for id_line, data in enumerate(reader): yield f"{id_file}_{id_line}", { "question": data[0], "choices": data[1:5], "answer": data[5], "subject": subject, }

assets/aml-benchmark/scripts/data_loaders/mmlu.py (140 lines of code) (raw):