_generate_examples()

Extracted from tensorflow_datasets/question_answering/trivia_qa.py


  def _generate_examples(self, files, web_dir, wiki_dir):
    """Yields `(key, example)` pairs parsed from TriviaQA JSON files.

    Args:
      files: iterable of paths to pretty-printed JSON files, each holding an
        array of article records.
      web_dir: directory containing web search-result context files.
      wiki_dir: directory containing Wikipedia context files.

    Yields:
      Tuples of a unique string key ("<filename>_<question_id>") and the
      example dict.
    """

    def _clean(items):
      """Strips whitespace from every string in a collection."""
      return [s.strip() for s in items]

    def _build_answer(record):
      """Returns the answer feature dict, or placeholder values if absent."""
      if "Answer" not in record:
        # No gold answer for this article (e.g. unfiltered test split).
        return {
            "aliases": [],
            "normalized_aliases": [],
            "matched_wiki_entity_name": "<unk>",
            "normalized_matched_wiki_entity_name": "<unk>",
            "normalized_value": "<unk>",
            "type": "",
            "value": "<unk>",
        }
      ans = record["Answer"]
      return {
          "aliases": _clean(ans["Aliases"]),
          "normalized_aliases": _clean(ans["NormalizedAliases"]),
          "matched_wiki_entity_name":
              ans.get("MatchedWikiEntryName", "").strip(),
          "normalized_matched_wiki_entity_name":
              ans.get("NormalizedMatchedWikiEntryName", "").strip(),
          "normalized_value": ans["NormalizedValue"].strip(),
          "type": ans["Type"].strip(),
          "value": ans["Value"].strip(),
      }

    def _with_context(items, context_field, file_dir):
      """Returns copies of `items` with file contents attached.

      Entries without a "Filename" key, or whose file cannot be read, are
      dropped rather than raising.
      """
      kept = []
      for entry in items:
        if "Filename" not in entry:
          logging.info("Missing context 'Filename', skipping.")
          continue
        fname = entry["Filename"]
        enriched = entry.copy()
        try:
          with tf.io.gfile.GFile(os.path.join(file_dir, fname)) as f:
            enriched[context_field] = f.read()
        except (IOError, tf.errors.NotFoundError):
          logging.info("File does not exist, skipping: %s", fname)
          continue
        kept.append(enriched)
      return kept

    def _maybe_strip(value):
      """Strips `value` if it is a string; passes other types through."""
      if isinstance(value, six.string_types):
        return value.strip()
      return value

    def _to_columns(records, field_names):
      """Transposes a list of dicts into a dict of snake_cased columns."""
      columns = {}
      for field in field_names:
        key = tfds.core.naming.camelcase_to_snakecase(field)
        columns[key] = [_maybe_strip(rec[field]) for rec in records]
      return columns

    def _parse(article):
      """Converts one article JSON record into an example dict."""
      answer = _build_answer(article)

      if self.builder_config.exclude_context:
        # Drop the (large) raw contexts for context-free configs.
        article["SearchResults"] = []
        article["EntityPages"] = []

      search_results = _to_columns(
          _with_context(
              article.get("SearchResults", []), "SearchContext", web_dir),
          ["Description", "Filename", "Rank", "Title", "Url", "SearchContext"])
      entity_pages = _to_columns(
          _with_context(article.get("EntityPages", []), "WikiContext",
                        wiki_dir),
          ["DocSource", "Filename", "Title", "WikiContext"])

      return {
          "entity_pages": entity_pages,
          "search_results": search_results,
          "question": article["Question"].strip(),
          "question_id": article["QuestionId"],
          "question_source": article["QuestionSource"].strip(),
          "answer": answer,
      }

    for filepath in files:
      logging.info("generating examples from = %s", filepath)
      basename = os.path.basename(filepath)

      with tf.io.gfile.GFile(filepath) as f:
        # Records are scanned one-at-a-time out of a pretty-printed JSON
        # array, keyed on the file's exact 8-space indentation, so that the
        # whole (very large) file never has to be held in memory at once.
        buffered = ""
        for line in f:
          if line == "        {\n":
            buffered = line
          elif line.startswith("        }"):  # Handles final record as well.
            article = json.loads(buffered + "}")
            buffered = ""
            example = _parse(article)
            yield "%s_%s" % (basename, example["question_id"]), example
          else:
            buffered += line