in tensorflow_datasets/translate/wmt.py [0:0]
def _generate_examples(self, split_subsets, extraction_map):
"""Returns the examples in the raw (text) form."""
source, _ = self.builder_config.language_pair
def _get_local_paths(ds, extract_dirs):
rel_paths = ds.get_path(source)
if len(extract_dirs) == 1:
extract_dirs = extract_dirs * len(rel_paths)
return [
os.path.join(ex_dir, rel_path) if rel_path else ex_dir
for ex_dir, rel_path in zip(extract_dirs, rel_paths)
]
for ss_name in split_subsets:
logging.info("Generating examples from: %s", ss_name)
ds = DATASET_MAP[ss_name]
extract_dirs = extraction_map[ss_name]
files = _get_local_paths(ds, extract_dirs)
if ss_name.startswith("czeng"):
if ss_name.endswith("16pre"):
sub_generator = functools.partial(
_parse_tsv, language_pair=("en", "cs"))
elif ss_name.endswith("17"):
filter_path = _get_local_paths(
_CZENG17_FILTER, extraction_map[_CZENG17_FILTER.name])[0]
sub_generator = functools.partial(
_parse_czeng, filter_path=filter_path)
else:
sub_generator = _parse_czeng
elif ss_name == "hindencorp_01":
sub_generator = _parse_hindencorp
elif len(files) == 2:
if ss_name.endswith("_frde"):
sub_generator = _parse_frde_bitext
else:
sub_generator = _parse_parallel_sentences
elif len(files) == 1:
fname = files[0]
# Note: Due to formatting used by `download_manager`, the file
# extension may not be at the end of the file path.
if ".tsv" in fname:
sub_generator = _parse_tsv
elif ss_name.startswith("newscommentary_v14"):
sub_generator = functools.partial(
_parse_tsv, language_pair=self.builder_config.language_pair)
elif "tmx" in fname:
sub_generator = _parse_tmx
elif ss_name.startswith("wikiheadlines"):
sub_generator = _parse_wikiheadlines
else:
raise ValueError("Unsupported file format: %s" % fname)
else:
raise ValueError("Invalid number of files: %d" % len(files))
for sub_key, ex in sub_generator(*files):
if not all(ex.values()):
continue
# TODO(adarob): Add subset feature.
# ex["subset"] = subset
key = "{}/{}".format(ss_name, sub_key)
yield key, ex