in tensorflow_datasets/text/gem/gem.py [0:0]
# Dispatches on self.builder_config.name and builds the standard
# train/validation/test splits plus the GEM "challenge set" splits for
# configs that define them. Every branch returns its own list of
# tfds.core.SplitGenerator objects; the gen_kwargs dicts are consumed by
# _generate_examples (not visible here) — presumably it selects/parses
# rows according to "set_name"; confirm against that method.
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
"""Returns SplitGenerators.

Downloads and extracts every URL declared in the active builder config
(`self.builder_config.data_urls`), then assembles one SplitGenerator per
standard split and one per challenge set. Keys used on `files` below
("data", "train", "challenge_set", ...) must match the keys of
`data_urls` for the corresponding config.
"""
files = dl_manager.download_and_extract(self.builder_config.data_urls)
# --- common_gen: single archive under files["data"] / files["challenge_set"].
if self.builder_config.name == "common_gen":
# (split_name, filename) pairs; files live under
# <challenge_set>/<config_name>/<filename>.
challenge_sets = [
("challenge_train_sample", "train_common_gen_RandomSample500.json"),
("challenge_validation_sample",
"validation_common_gen_RandomSample500.json"),
("challenge_test_scramble",
"test_common_gen_ScrambleInputStructure500.json"),
]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath":
os.path.join(files["data"], "commongen.train.jsonl"),
"set_name":
"train",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath":
os.path.join(files["data"], "commongen.dev.jsonl"),
"set_name":
"validation",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
# NOTE: test file is the no-reference variant.
"filepath":
os.path.join(files["data"], "commongen.test_noref.jsonl"),
"set_name":
"test",
},
),
] + challenge_splits
# --- cs_restaurants: one downloaded file per standard split.
elif self.builder_config.name == "cs_restaurants":
challenge_sets = [
("challenge_train_sample",
"train_cs_restaurants_RandomSample500.json"),
("challenge_validation_sample",
"validation_cs_restaurants_RandomSample500.json"),
("challenge_test_scramble",
"test_cs_restaurants_ScrambleInputStructure500.json"),
]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath": files["train"],
"set_name": "train",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath": files["validation"],
"set_name": "validation",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
"filepath": files["test"],
"set_name": "test",
},
),
] + challenge_splits
# --- dart: standard splits only, no challenge sets.
elif self.builder_config.name == "dart":
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath": files["train"],
"set_name": "train",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath": files["validation"],
"set_name": "validation",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
"filepath": files["test"],
"set_name": "test",
},
),
]
# --- e2e_nlg: same shape as cs_restaurants.
elif self.builder_config.name == "e2e_nlg":
challenge_sets = [
("challenge_train_sample", "train_e2e_nlg_RandomSample500.json"),
("challenge_validation_sample",
"validation_e2e_nlg_RandomSample500.json"),
("challenge_test_scramble",
"test_e2e_nlg_ScrambleInputStructure500.json"),
]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath": files["train"],
"set_name": "train",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath": files["validation"],
"set_name": "validation",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
"filepath": files["test"],
"set_name": "test",
},
),
] + challenge_splits
# --- mlsum_{de,es}: language-suffixed configs; standard splits also pass
# "lang" and a "filepaths" entry pointing at a bad-ids file (presumably a
# denylist applied in _generate_examples — confirm there).
elif self.builder_config.name.startswith("mlsum"):
# Can be either _de or _es.
lang = self.builder_config.name.split("_")[1]
challenge_sets = [
("challenge_train_sample",
f"train_mlsum_{lang}_RandomSample500.json"),
("challenge_validation_sample",
f"validation_mlsum_{lang}_RandomSample500.json"),
("challenge_test_covid", f"{lang}_test_covid19_cleaned.jsonl"),
]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath":
os.path.join(files["train"], lang + "_train.jsonl"),
"set_name":
"train",
"lang":
lang,
"filepaths":
files["bad_ids"]
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath":
os.path.join(files["validation"], lang + "_val.jsonl"),
"set_name":
"validation",
"lang":
lang,
"filepaths":
files["bad_ids"]
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
"filepath": os.path.join(files["test"], lang + "_test.jsonl"),
"set_name": "test",
"lang": lang,
"filepaths": files["bad_ids"]
},
),
] + challenge_splits
# --- schema_guided_dialog: many perturbation-based challenge sets; all
# three standard splits read the SAME file (gem_sgd.json) and differ only
# by "set_name".
elif self.builder_config.name == "schema_guided_dialog":
challenge_sets = [
("challenge_train_sample",
"train_schema_guided_dialog_RandomSample500_reformatted.json"),
("challenge_validation_sample",
"validation_schema_guided_dialog_RandomSample500_reformatted.json"),
("challenge_test_backtranslation",
"test_schema_guided_dialog_BackTranslation500_reformatted.json"),
(
"challenge_test_bfp02",
"test_schema_guided_dialog_ButterFingersPerturbation_p=0.02_500_reformatted.json",
),
(
"challenge_test_bfp05",
"test_schema_guided_dialog_ButterFingersPerturbation_p=0.05_500_reformatted.json",
),
("challenge_test_nopunc",
"test_schema_guided_dialog_WithoutPunctuation500_reformatted.json"),
("challenge_test_scramble",
"test_schema_guided_dialog_ScrambleInputStructure500_reformatted.json"
),
]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
generators = []
for tfds_spl, spl in zip(
[tfds.Split.TRAIN, tfds.Split.VALIDATION, tfds.Split.TEST],
["train", "validation", "test"]):
generators.append(
tfds.core.SplitGenerator(
name=tfds_spl,
gen_kwargs={
"filepath": os.path.join(files["data"], "gem_sgd.json"),
"set_name": spl
}))
return generators + challenge_splits
# --- totto: standard splits live under a totto_data/ subdirectory of the
# extracted archive; test set is unlabeled.
elif self.builder_config.name == "totto":
challenge_sets = [
("challenge_train_sample", "train_totto_RandomSample500.json"),
("challenge_validation_sample",
"validation_totto_RandomSample500.json"),
("challenge_test_scramble",
"test_totto_ScrambleInputStructure500.json"),
]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath":
os.path.join(files["data"],
"totto_data/totto_train_data.jsonl"),
"set_name":
"train",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath":
os.path.join(files["data"],
"totto_data/totto_dev_data.jsonl"),
"set_name":
"validation",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
"filepath":
os.path.join(
files["data"],
"totto_data/unlabeled_totto_test_data.jsonl"),
"set_name":
"test",
},
),
] + challenge_splits
# --- web_nlg_{en,ru}: English additionally gets a numbers-replacement
# challenge set.
elif self.builder_config.name.startswith("web_nlg"):
# Can be either _en or _ru.
ln = self.builder_config.name.split("_")[2]
challenge_sets = [
("challenge_train_sample",
f"train_web_nlg_{ln}_RandomSample500.json"),
("challenge_validation_sample",
f"validation_web_nlg_{ln}_RandomSample500.json"),
("challenge_test_scramble",
f"test_web_nlg_{ln}_ScrambleInputStructure500.json"),
]
if ln == "en":
challenge_sets += [("challenge_test_numbers",
f"test_web_nlg_{ln}_replace_numbers_500.json")]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath": files["train"],
"set_name": "train"
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath": files["validation"],
"set_name": "validation"
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
"filepath": files["test"],
"set_name": "test"
},
),
] + challenge_splits
# --- wiki_auto_asset_turk: two distinct test sets (ASSET, TURK), each with
# its own perturbation challenge sets.
elif self.builder_config.name == "wiki_auto_asset_turk":
challenge_sets = [
("challenge_train_sample",
"train_wiki_auto_asset_turk_RandomSample500.json"),
("challenge_validation_sample",
"validation_wiki_auto_asset_turk_RandomSample500.json"),
("challenge_test_asset_backtranslation",
"test_asset_wiki_auto_asset_turk_BackTranslation.json"),
(
"challenge_test_asset_bfp02",
"test_asset_wiki_auto_asset_turk_ButterFingersPerturbation_p=0.02.json",
),
(
"challenge_test_asset_bfp05",
"test_asset_wiki_auto_asset_turk_ButterFingersPerturbation_p=0.05.json",
),
("challenge_test_asset_nopunc",
"test_asset_wiki_auto_asset_turk_WithoutPunctuation.json"),
("challenge_test_turk_backtranslation",
"detok_test_turk_wiki_auto_asset_turk_BackTranslation.json"),
(
"challenge_test_turk_bfp02",
"detok_test_turk_wiki_auto_asset_turk_ButterFingersPerturbation_p=0.02.json",
),
(
"challenge_test_turk_bfp05",
"detok_test_turk_wiki_auto_asset_turk_ButterFingersPerturbation_p=0.05.json",
),
("challenge_test_turk_nopunc",
"detok_test_turk_wiki_auto_asset_turk_WithoutPunctuation.json"),
]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath": files["train"],
"set_name": "train",
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath": files["validation"],
"set_name": "validation",
},
),
tfds.core.SplitGenerator(
name="test_asset",
gen_kwargs={
# ASSET references come as 10 separate downloads; they are
# passed through "filepaths" and "filepath" is unused here.
"filepath":
"",
"set_name":
"test_asset",
"filepaths":
[files["test_asset_" + str(i)] for i in range(10)],
},
),
tfds.core.SplitGenerator(
name="test_turk",
gen_kwargs={
"filepath": files["test_turk"],
"set_name": "test_turk",
},
),
] + challenge_splits
# --- wiki_lingua_<lang_name>_<lang>: all splits share one directory; the
# split is selected by "set_name" (note validation uses "val" here).
elif self.builder_config.name.startswith("wiki_lingua"):
lang_name = self.builder_config.name.split("_")[-2]
lang = self.builder_config.name.split("_")[-1]
base_dir = os.path.join(files["data"], lang_name)
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath": base_dir,
"set_name": "train",
"lang": lang,
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath": base_dir,
"set_name": "val",
"lang": lang,
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
"filepath": base_dir,
"set_name": "test",
"lang": lang,
},
),
]
# --- xsum: "filepath" is the split-ids file and "filepaths" the directory
# of BBC summary articles; presumably _generate_examples joins the two —
# confirm there.
elif self.builder_config.name == "xsum":
challenge_sets = [
("challenge_train_sample", "train_xsum_RandomSample500.json"),
("challenge_validation_sample",
"validation_xsum_RandomSample500.json"),
("challenge_test_backtranslation",
"test_xsum_BackTranslation500.json"),
("challenge_test_bfp_02",
"test_xsum_ButterFingersPerturbation_p=0.02_500.json"),
("challenge_test_bfp_05",
"test_xsum_ButterFingersPerturbation_p=0.05_500.json"),
("challenge_test_nopunc", "test_xsum_WithoutPunctuation500.json"),
("challenge_test_covid", "en_test_covid19.jsonl"),
]
challenge_splits = []
for challenge_split, filename in challenge_sets:
challenge_splits.append(
tfds.core.SplitGenerator(
name=challenge_split,
gen_kwargs={
"filepath":
os.path.join(files["challenge_set"],
self.builder_config.name, filename),
"set_name":
challenge_split,
},
))
return [
tfds.core.SplitGenerator(
name=tfds.Split.TRAIN,
gen_kwargs={
"filepath": files["splits"],
"set_name": "train",
"filepaths": os.path.join(files["data"], "bbc-summary-data"),
},
),
tfds.core.SplitGenerator(
name=tfds.Split.VALIDATION,
gen_kwargs={
"filepath": files["splits"],
"set_name": "validation",
"filepaths": os.path.join(files["data"], "bbc-summary-data"),
},
),
tfds.core.SplitGenerator(
name=tfds.Split.TEST,
gen_kwargs={
"filepath": files["splits"],
"set_name": "test",
"filepaths": os.path.join(files["data"], "bbc-summary-data"),
},
),
] + challenge_splits