in tensorflow_datasets/text/gem/gem.py [0:0]
def _generate_examples(self, filepath, set_name, filepaths=None, lang=None):
  """Yields examples for the configured GEM sub-dataset.

  Dispatches on `self.builder_config.name` and parses the raw files of the
  corresponding sub-dataset (common_gen, cs_restaurants, dart, e2e_nlg,
  mlsum.*, schema_guided_dialog, totto, web_nlg.*, wiki_auto_asset_turk,
  wiki_lingua.*, xsum). Every yielded example carries a `gem_id` of the form
  "<config name>-<set_name>-<running id>"; challenge sets additionally keep
  the id the example had in its parent split as `gem_parent_id`.

  Args:
    filepath: Path to the main data file (a directory for wiki_lingua).
    set_name: Split name, e.g. "train", "validation", "test", or one of the
      "challenge_*" sets.
    filepaths: Optional auxiliary path(s): the bad-URL JSON file for mlsum,
      the per-article ".summary" directory for xsum, or the list of parallel
      source/reference files for wiki_auto_asset_turk's "test_asset".
    lang: Optional language code, used by the mlsum and wiki_lingua configs.

  Yields:
    `(key, example)` tuples where `example` is a feature dict for the
    config's schema.
  """
  if self.builder_config.name == "common_gen":
    with tf.io.gfile.GFile(filepath) as f:
      if set_name.startswith("challenge"):
        # Challenge sets ship as pre-built JSON examples, sometimes wrapped
        # in a single-entry dict keyed by the set name.
        exples = json.load(f)
        if isinstance(exples, dict):
          assert len(exples) == 1, "multiple entries found"
          exples = list(exples.values())[0]
        for id_, exple in enumerate(exples):
          if not exple:  # Skip empty placeholder entries.
            continue
          # Re-key: the id the example already had becomes the parent id.
          exple["gem_parent_id"] = exple["gem_id"]
          exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          yield id_, exple
      else:
        # Line-delimited JSON: one object per concept set. `i` counts
        # concept sets, `id_` counts yielded examples.
        id_ = -1
        i = -1
        for row in f:
          row = row.replace(", }", "}")  # Fix possible JSON format error.
          data = json.loads(row)
          concepts = [word for word in data["concept_set"].split("#")]
          if set_name == "train":
            # Train: fan out one example per target scene; no references.
            i += 1
            for scene in data["scene"]:
              id_ += 1
              yield id_, {
                  "gem_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "gem_parent_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "concept_set_id":
                      i,
                  "concepts":
                      concepts,
                  "target":
                      scene,
                  "references": [],
              }
          else:
            # Validation: all scenes are references and the first scene is
            # the target; test: empty target and references.
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "concept_set_id":
                    id_,
                "concepts":
                    concepts,
                "target":
                    "" if set_name == "test" else data["scene"][0],
                "references": [] if set_name == "test" else data["scene"],
            }
  elif self.builder_config.name == "cs_restaurants":
    with tf.io.gfile.GFile(filepath) as f:
      if set_name.startswith("challenge"):
        # Same challenge-set re-keying as above.
        exples = json.load(f)
        if isinstance(exples, dict):
          assert len(exples) == 1, "multiple entries found"
          exples = list(exples.values())[0]
        for id_, exple in enumerate(exples):
          if not exple:
            continue
          exple["gem_parent_id"] = exple["gem_id"]
          exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          yield id_, exple
      else:
        # A single JSON array of instances with (delexicalized) dialog acts
        # and texts.
        data = json.load(f)
        for id_, instance in enumerate(data):
          yield id_, {
              "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "dialog_act": instance["da"],
              "dialog_act_delexicalized": instance["delex_da"],
              "target": instance["text"],
              "target_delexicalized": instance["delex_text"],
              "references": [] if set_name == "train" else [instance["text"]],
          }
  elif self.builder_config.name == "dart":
    with tf.io.gfile.GFile(filepath) as f:
      # One JSON array; each entry holds a tripleset plus its annotations.
      # `i` counts triplesets, `id_` counts yielded examples.
      data = json.loads(f.read())
      id_ = -1
      i = -1
      for example in data:
        # Flatten each triple (a list of strings) to one space-joined string.
        flat_tripleset = [" ".join(ex) for ex in example["tripleset"]]
        if set_name == "train":
          # Train: one example per annotation; no references.
          i += 1
          for annotation in example["annotations"]:
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "dart_id":
                    i,
                "tripleset":
                    flat_tripleset,
                "subtree_was_extended":
                    example.get("subtree_was_extended",
                                None),  # some are missing.
                "target_sources": [
                    annotation["source"]
                    for annotation in example["annotations"]
                ],
                "target":
                    annotation["text"],
                "references": [],
            }
        else:
          # Validation/test: one example per tripleset; all annotation
          # texts become references and the first one the target.
          id_ += 1
          yield id_, {
              "gem_id":
                  f"{self.builder_config.name}-{set_name}-{id_}",
              "gem_parent_id":
                  f"{self.builder_config.name}-{set_name}-{id_}",
              "dart_id":
                  id_,
              "tripleset":
                  flat_tripleset,
              "subtree_was_extended":
                  example.get("subtree_was_extended",
                              None),  # some are missing.
              "target_sources": [
                  annotation["source"]
                  for annotation in example["annotations"]
              ],
              "target":
                  example["annotations"][0]["text"]
                  if example["annotations"] else "",
              "references": [
                  annotation["text"] for annotation in example["annotations"]
              ],
          }
  elif self.builder_config.name == "e2e_nlg":
    with tf.io.gfile.GFile(filepath) as f:
      if set_name.startswith("challenge"):
        # Same challenge-set re-keying as above.
        exples = json.load(f)
        if isinstance(exples, dict):
          assert len(exples) == 1, "multiple entries found"
          exples = list(exples.values())[0]
        for id_, exple in enumerate(exples):
          if not exple:
            continue
          exple["gem_parent_id"] = exple["gem_id"]
          exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          yield id_, exple
      else:
        # CSV with "mr" (meaning representation) and "ref" columns.
        reader = csv.DictReader(f)
        for id_, example in enumerate(reader):
          yield id_, {
              "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "meaning_representation": example["mr"],
              "target": example["ref"],
              "references": [] if set_name == "train" else [example["ref"]],
          }
  elif self.builder_config.name.startswith("mlsum"):
    if set_name in ["train", "validation", "test", "challenge_test_covid"]:
      if set_name == "challenge_test_covid":
        # The covid challenge set is not filtered against the bad-id list.
        bad_ids = {}
      else:
        # `filepaths` points to a JSON file mapping "<lang>-<split>" to
        # pairs whose second element is a URL to exclude (presumably
        # flagged as bad articles — verify against the data release).
        bad_ids_dct = json.load(tf.io.gfile.GFile(filepaths))
        bad_ids = dict((bad_url, True)
                       for _, bad_url in bad_ids_dct[f"{lang}-{set_name}"])
      with tf.io.gfile.GFile(filepath) as f:
        # Line-delimited JSON; skip articles whose URL is blacklisted so
        # `id_` stays contiguous over the kept examples.
        id_ = -1
        for line in f:
          data = json.loads(line)
          if data["url"] in bad_ids:
            continue
          else:
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "text":
                    data["text"],
                "target":
                    data["summary"],
                "references": []
                              if set_name == "train" else [data["summary"]],
                "topic":
                    data["topic"],
                "url":
                    data["url"],
                "title":
                    data["title"],
                "date":
                    data["date"],
            }
    else:
      # Remaining challenge sets: same re-keying pattern as above.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        if not exple:
          continue
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        yield id_, exple
  elif self.builder_config.name == "schema_guided_dialog":
    if "challenge" in set_name:
      # Same challenge-set re-keying as above.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        if not exple:
          continue
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        yield id_, exple
    else:
      # One JSON file keyed by split name; each turn's "da" is a list of
      # (act id, slot, values) triples that we expand into feature dicts.
      examples = json.load(tf.io.gfile.GFile(filepath))[set_name]
      for id_, example in enumerate(examples):
        dialog_acts = []
        for act_id, slot, values in example["da"]:
          dialog_acts.append({
              "act": act_id,
              "slot": slot,
              "values": values,
          })
        yield id_, {
            "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "dialog_acts": dialog_acts,
            "context": example["context"],
            "dialog_id": example["dialog_id"],
            "service": example["service"],
            "turn_id": example["turn_ix"],
            "prompt": example["prompt"],
            "target": example["target"],
            "references": [] if set_name == "train" else [example["target"]],
        }
  elif self.builder_config.name == "totto":
    if "challenge" in set_name:
      # Same challenge-set re-keying as above.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        if not exple:
          continue
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        yield id_, exple
    else:
      with tf.io.gfile.GFile(filepath) as json_file:
        # Line-delimited JSON, one table per line. `i` counts tables,
        # `id_` counts yielded examples.
        json_list = list(json_file)
      id_ = -1
      i = -1
      for json_str in json_list:
        result = json.loads(json_str)
        if set_name == "train":
          # Train: one example per sentence annotation; no references.
          i += 1
          for sentence in result["sentence_annotations"]:
            id_ += 1
            response = {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "totto_id":
                    i,
                "table_page_title":
                    result["table_page_title"],
                "table_webpage_url":
                    result["table_webpage_url"],
                "table_section_title":
                    result["table_section_title"],
                "table_section_text":
                    result["table_section_text"],
                "table":
                    result["table"],
                "highlighted_cells":
                    result["highlighted_cells"],
                "example_id":
                    str(result["example_id"]),
                "overlap_subset":
                    "none",
                "sentence_annotations": [sentence],
                "references": [],
                "target":
                    sentence["final_sentence"],
            }
            yield id_, response
        else:
          # Validation/test: one example per table; references are the
          # final sentences of the annotations (empty for test).
          id_ += 1
          response = {
              "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "totto_id": id_,
              "table_page_title": result["table_page_title"],
              "table_webpage_url": result["table_webpage_url"],
              "table_section_title": result["table_section_title"],
              "table_section_text": result["table_section_text"],
              "table": result["table"],
              "highlighted_cells": result["highlighted_cells"],
              "example_id": str(result["example_id"]),
              "overlap_subset": str(result["overlap_subset"]),
          }
          response[
              "sentence_annotations"] = [] if set_name == "test" else result[
                  "sentence_annotations"]
          response["references"] = [
              sentence["final_sentence"]
              for sentence in response["sentence_annotations"]
          ]
          # Target is the first reference, or "" when there are none.
          if response["references"]:
            response["target"] = response["references"][0]
          else:
            response["target"] = ""
          yield id_, response
  elif self.builder_config.name.startswith("web_nlg"):
    if "challenge" in set_name:
      # Same challenge-set re-keying as above.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        if not exple:
          continue
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        yield id_, exple
    else:
      with tf.io.gfile.GFile(filepath) as f:
        # JSON file with the entries under "values"; each entry maps one
        # input to a list of target verbalisations.
        examples = json.load(f)
        id_ = -1
        for example in examples["values"]:
          if set_name == "train":
            # Train: fan out one example per target; no references.
            for target in example["target"]:
              id_ += 1
              yield id_, {
                  "gem_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "gem_parent_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "input":
                      example["input"],
                  "target":
                      target,
                  "references": []
                                if set_name == "train" else example["target"],
                  "category":
                      example["category"],
                  "webnlg_id":
                      example["webnlg-id"],
              }
          else:
            # Validation/test: one example per input; all targets are
            # references and the first one the target (or "" if none).
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "input":
                    example["input"],
                "target":
                    example["target"][0] if example["target"] else "",
                "references":
                    example["target"],
                "category":
                    example["category"],
                "webnlg_id":
                    example["webnlg-id"],
            }
  elif self.builder_config.name == "wiki_auto_asset_turk":
    if set_name in ["train", "validation"]:
      # Tab-separated source/target pairs, one per line.
      keys = [
          "source",
          "target",
      ]
      with tf.io.gfile.GFile(filepath) as f:
        for id_, line in enumerate(f):
          values = line.strip().split("\t")
          assert len(
              values) == 2, f"Not enough fields in ---- {line} --- {values}"
          example = dict([(k, val) for k, val in zip(keys, values)])
          example["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          example[
              "gem_parent_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          example["references"] = [] if set_name == "train" else [
              example["target"]
          ]
          yield id_, example
    elif set_name == "test_turk":
      # Pre-built JSON examples; internal source/target ids are dropped.
      # NOTE(review): unlike the other challenge-style branches, gem_id is
      # kept exactly as loaded from the file — confirm this is intentional.
      examples = json.load(tf.io.gfile.GFile(filepath))
      for id_, example in enumerate(examples):
        example["gem_parent_id"] = example["gem_id"]
        for k in ["source_id", "target_id"]:
          if k in example:
            del example[k]
        yield id_, example
    elif set_name == "test_asset":
      # `filepaths` is a list of parallel line-aligned files: the first is
      # the source, the rest are reference simplifications (the first
      # reference doubles as the target).
      files = [tf.io.gfile.GFile(f_name) for f_name in filepaths]
      for id_, lines in enumerate(zip(*files)):
        yield id_, {
            "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "target": lines[1].strip(),
            "source": lines[0].strip(),
            "references": [line.strip() for line in lines[1:]],
        }
    else:
      # Remaining challenge sets: re-key and drop internal ids.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        for k in ["source_id", "target_id"]:
          if k in exple:
            del exple[k]
        yield id_, exple
  elif self.builder_config.name.startswith("wiki_lingua"):
    # `filepath` is a directory of four parallel line-aligned files:
    # source/target in `lang` and in English. The English target is used
    # as the example target.
    with tf.io.gfile.GFile(os.path.join(filepath,
                                        f"{set_name}.src.{lang}")) as f_in_ln:
      with tf.io.gfile.GFile(os.path.join(filepath,
                                          f"{set_name}.src.en")) as f_in_en:
        with tf.io.gfile.GFile(
            os.path.join(filepath, f"{set_name}.tgt.{lang}")) as f_out_ln:
          with tf.io.gfile.GFile(
              os.path.join(filepath, f"{set_name}.tgt.en")) as f_out_en:
            for id_, (src_ln, src_en, tgt_ln, tgt_en) in enumerate(
                zip(f_in_ln, f_in_en, f_out_ln, f_out_en)):
              yield id_, {
                  "gem_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "gem_parent_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "source_aligned": {
                      lang: src_ln.strip(),
                      "en": src_en.strip()
                  },
                  "target_aligned": {
                      lang: tgt_ln.strip(),
                      "en": tgt_en.strip()
                  },
                  "source":
                      src_ln.strip(),
                  "target":
                      tgt_en.strip(),
                  "references": []
                                if set_name == "train" else [tgt_en.strip()],
              }
  elif self.builder_config.name == "xsum":
    if "challenge" in set_name:
      if "covid" in set_name:
        # The covid challenge set is line-delimited JSON rather than a
        # pre-built example file.
        with tf.io.gfile.GFile(filepath) as f:
          id_ = -1
          for line in f:
            data = json.loads(line)
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "xsum_id":
                    data["url"],
                "document":
                    data["text"],
                "target":
                    data["summary"],
                "references": []
                              if set_name == "train" else [data["summary"]],
            }
      else:
        # Other challenge sets: same re-keying pattern as above.
        exples = json.load(tf.io.gfile.GFile(filepath))
        if isinstance(exples, dict):
          assert len(exples) == 1, "multiple entries found"
          exples = list(exples.values())[0]
        for id_, exple in enumerate(exples):
          exple["gem_parent_id"] = exple["gem_id"]
          exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          yield id_, exple
    else:
      # `filepath` holds the split->article-id mapping; `filepaths` is the
      # directory of per-article ".summary" files.
      with tf.io.gfile.GFile(filepath) as f:
        split_ids = json.load(f)
      for id_, i in enumerate(split_ids[set_name]):
        with tf.io.gfile.GFile(os.path.join(filepaths, i + ".summary")) as f:
          # Drop boilerplate lines (and blank lines) before segmenting.
          text = "".join([
              line for line in f.readlines()
              if line not in _XSUM_REMOVE_LINES and line.strip()
          ])
        # Articles are segmented by "[SN]" markers; this indexing takes
        # segment 8 as the document body and segment 6 as the one-sentence
        # summary — presumably fixed by the XSum file format; verify.
        segs = text.split("[SN]")
        yield id_, {
            "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "xsum_id": i,
            "document": segs[8].strip(),
            "target": segs[6].strip(),
            "references": [] if set_name == "train" else [segs[6].strip()],
        }