def _generate_examples()

in tensorflow_datasets/translate/wmt.py [0:0]


  def _generate_examples(self, split_subsets, extraction_map):
    """Returns the examples in the raw (text) form."""
    source, _ = self.builder_config.language_pair

    def _resolve_paths(dataset, extracted_roots):
      """Joins each extraction dir with the dataset's relative path(s)."""
      rel_paths = dataset.get_path(source)
      # One extraction dir may serve several relative paths: broadcast it.
      if len(extracted_roots) == 1:
        extracted_roots = extracted_roots * len(rel_paths)
      paths = []
      for root, rel in zip(extracted_roots, rel_paths):
        paths.append(os.path.join(root, rel) if rel else root)
      return paths

    for ss_name in split_subsets:
      logging.info("Generating examples from: %s", ss_name)
      dataset = DATASET_MAP[ss_name]
      files = _resolve_paths(dataset, extraction_map[ss_name])

      # Select the parser for this subset. Branch order matters: the
      # czeng/hindencorp name checks take precedence over file counts,
      # and the ".tsv" substring check runs before the subset-name ones.
      if ss_name.startswith("czeng"):
        if ss_name.endswith("16pre"):
          sub_generator = functools.partial(
              _parse_tsv, language_pair=("en", "cs"))
        elif ss_name.endswith("17"):
          filter_path = _resolve_paths(
              _CZENG17_FILTER, extraction_map[_CZENG17_FILTER.name])[0]
          sub_generator = functools.partial(
              _parse_czeng, filter_path=filter_path)
        else:
          sub_generator = _parse_czeng
      elif ss_name == "hindencorp_01":
        sub_generator = _parse_hindencorp
      elif len(files) == 2:
        sub_generator = (
            _parse_frde_bitext
            if ss_name.endswith("_frde") else _parse_parallel_sentences)
      elif len(files) == 1:
        fname = files[0]
        # Note: Due to formatting used by `download_manager`, the file
        # extension may not be at the end of the file path.
        if ".tsv" in fname:
          sub_generator = _parse_tsv
        elif ss_name.startswith("newscommentary_v14"):
          sub_generator = functools.partial(
              _parse_tsv, language_pair=self.builder_config.language_pair)
        elif "tmx" in fname:
          sub_generator = _parse_tmx
        elif ss_name.startswith("wikiheadlines"):
          sub_generator = _parse_wikiheadlines
        else:
          raise ValueError("Unsupported file format: %s" % fname)
      else:
        raise ValueError("Invalid number of files: %d" % len(files))

      for sub_key, ex in sub_generator(*files):
        # Drop examples with any empty field (e.g. a missing translation).
        if not all(ex.values()):
          continue
        # TODO(adarob): Add subset feature.
        # ex["subset"] = subset
        yield "{}/{}".format(ss_name, sub_key), ex