in tensorflow_datasets/text/gem/gem.py [0:0]
def _generate_examples(self, filepath, set_name, filepaths=None, lang=None):
  """Yields examples for the configured GEM sub-dataset.

  Dispatches on `self.builder_config.name` and parses the raw files of the
  corresponding sub-dataset (common_gen, cs_restaurants, dart, e2e_nlg,
  mlsum.*, schema_guided_dialog, totto, web_nlg.*, wiki_auto_asset_turk,
  wiki_lingua.*, xsum). Every yielded example carries a `gem_id` of the form
  "<config name>-<set_name>-<running id>"; challenge sets additionally keep
  the id the example had in its parent split as `gem_parent_id`.

  Args:
    filepath: Path to the main data file (a directory for wiki_lingua).
    set_name: Split name, e.g. "train", "validation", "test", or one of the
      "challenge_*" sets.
    filepaths: Optional auxiliary path(s): the bad-URL JSON file for mlsum,
      the per-article ".summary" directory for xsum, or the list of parallel
      source/reference files for wiki_auto_asset_turk's "test_asset".
    lang: Optional language code, used by the mlsum and wiki_lingua configs.

  Yields:
    `(key, example)` tuples where `example` is a feature dict for the
    config's schema.
  """
  if self.builder_config.name == "common_gen":
    with tf.io.gfile.GFile(filepath) as f:
      if set_name.startswith("challenge"):
        # Challenge sets ship as pre-built JSON examples, sometimes wrapped
        # in a single-entry dict keyed by the set name.
        exples = json.load(f)
        if isinstance(exples, dict):
          assert len(exples) == 1, "multiple entries found"
          exples = list(exples.values())[0]
        for id_, exple in enumerate(exples):
          if not exple:  # Skip empty placeholder entries.
            continue
          # Re-key: the id the example already had becomes the parent id.
          exple["gem_parent_id"] = exple["gem_id"]
          exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          yield id_, exple
      else:
        # Line-delimited JSON: one object per concept set. `i` counts
        # concept sets, `id_` counts yielded examples.
        id_ = -1
        i = -1
        for row in f:
          row = row.replace(", }", "}")  # Fix possible JSON format error.
          data = json.loads(row)
          concepts = [word for word in data["concept_set"].split("#")]
          if set_name == "train":
            # Train: fan out one example per target scene; no references.
            i += 1
            for scene in data["scene"]:
              id_ += 1
              yield id_, {
                  "gem_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "gem_parent_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "concept_set_id":
                      i,
                  "concepts":
                      concepts,
                  "target":
                      scene,
                  "references": [],
              }
          else:
            # Validation: all scenes are references and the first scene is
            # the target; test: empty target and references.
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "concept_set_id":
                    id_,
                "concepts":
                    concepts,
                "target":
                    "" if set_name == "test" else data["scene"][0],
                "references": [] if set_name == "test" else data["scene"],
            }
  elif self.builder_config.name == "cs_restaurants":
    with tf.io.gfile.GFile(filepath) as f:
      if set_name.startswith("challenge"):
        # Same challenge-set re-keying as above.
        exples = json.load(f)
        if isinstance(exples, dict):
          assert len(exples) == 1, "multiple entries found"
          exples = list(exples.values())[0]
        for id_, exple in enumerate(exples):
          if not exple:
            continue
          exple["gem_parent_id"] = exple["gem_id"]
          exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          yield id_, exple
      else:
        # A single JSON array of instances with (delexicalized) dialog acts
        # and texts.
        data = json.load(f)
        for id_, instance in enumerate(data):
          yield id_, {
              "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "dialog_act": instance["da"],
              "dialog_act_delexicalized": instance["delex_da"],
              "target": instance["text"],
              "target_delexicalized": instance["delex_text"],
              "references": [] if set_name == "train" else [instance["text"]],
          }
  elif self.builder_config.name == "dart":
    with tf.io.gfile.GFile(filepath) as f:
      # One JSON array; each entry holds a tripleset plus its annotations.
      # `i` counts triplesets, `id_` counts yielded examples.
      data = json.loads(f.read())
      id_ = -1
      i = -1
      for example in data:
        # Flatten each triple (a list of strings) to one space-joined string.
        flat_tripleset = [" ".join(ex) for ex in example["tripleset"]]
        if set_name == "train":
          # Train: one example per annotation; no references.
          i += 1
          for annotation in example["annotations"]:
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "dart_id":
                    i,
                "tripleset":
                    flat_tripleset,
                "subtree_was_extended":
                    example.get("subtree_was_extended",
                                None),  # some are missing.
                "target_sources": [
                    annotation["source"]
                    for annotation in example["annotations"]
                ],
                "target":
                    annotation["text"],
                "references": [],
            }
        else:
          # Validation/test: one example per tripleset; all annotation
          # texts become references and the first one the target.
          id_ += 1
          yield id_, {
              "gem_id":
                  f"{self.builder_config.name}-{set_name}-{id_}",
              "gem_parent_id":
                  f"{self.builder_config.name}-{set_name}-{id_}",
              "dart_id":
                  id_,
              "tripleset":
                  flat_tripleset,
              "subtree_was_extended":
                  example.get("subtree_was_extended",
                              None),  # some are missing.
              "target_sources": [
                  annotation["source"]
                  for annotation in example["annotations"]
              ],
              "target":
                  example["annotations"][0]["text"]
                  if example["annotations"] else "",
              "references": [
                  annotation["text"] for annotation in example["annotations"]
              ],
          }
  elif self.builder_config.name == "e2e_nlg":
    with tf.io.gfile.GFile(filepath) as f:
      if set_name.startswith("challenge"):
        # Same challenge-set re-keying as above.
        exples = json.load(f)
        if isinstance(exples, dict):
          assert len(exples) == 1, "multiple entries found"
          exples = list(exples.values())[0]
        for id_, exple in enumerate(exples):
          if not exple:
            continue
          exple["gem_parent_id"] = exple["gem_id"]
          exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          yield id_, exple
      else:
        # CSV with "mr" (meaning representation) and "ref" columns.
        reader = csv.DictReader(f)
        for id_, example in enumerate(reader):
          yield id_, {
              "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "meaning_representation": example["mr"],
              "target": example["ref"],
              "references": [] if set_name == "train" else [example["ref"]],
          }
  elif self.builder_config.name.startswith("mlsum"):
    if set_name in ["train", "validation", "test", "challenge_test_covid"]:
      if set_name == "challenge_test_covid":
        # The covid challenge set is not filtered against the bad-id list.
        bad_ids = {}
      else:
        # `filepaths` points to a JSON file mapping "<lang>-<split>" to
        # pairs whose second element is a URL to exclude (presumably
        # flagged as bad articles — verify against the data release).
        bad_ids_dct = json.load(tf.io.gfile.GFile(filepaths))
        bad_ids = dict((bad_url, True)
                       for _, bad_url in bad_ids_dct[f"{lang}-{set_name}"])
      with tf.io.gfile.GFile(filepath) as f:
        # Line-delimited JSON; skip articles whose URL is blacklisted so
        # `id_` stays contiguous over the kept examples.
        id_ = -1
        for line in f:
          data = json.loads(line)
          if data["url"] in bad_ids:
            continue
          else:
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "text":
                    data["text"],
                "target":
                    data["summary"],
                "references": []
                              if set_name == "train" else [data["summary"]],
                "topic":
                    data["topic"],
                "url":
                    data["url"],
                "title":
                    data["title"],
                "date":
                    data["date"],
            }
    else:
      # Remaining challenge sets: same re-keying pattern as above.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        if not exple:
          continue
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        yield id_, exple
  elif self.builder_config.name == "schema_guided_dialog":
    if "challenge" in set_name:
      # Same challenge-set re-keying as above.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        if not exple:
          continue
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        yield id_, exple
    else:
      # One JSON file keyed by split name; each turn's "da" is a list of
      # (act id, slot, values) triples that we expand into feature dicts.
      examples = json.load(tf.io.gfile.GFile(filepath))[set_name]
      for id_, example in enumerate(examples):
        dialog_acts = []
        for act_id, slot, values in example["da"]:
          dialog_acts.append({
              "act": act_id,
              "slot": slot,
              "values": values,
          })
        yield id_, {
            "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "dialog_acts": dialog_acts,
            "context": example["context"],
            "dialog_id": example["dialog_id"],
            "service": example["service"],
            "turn_id": example["turn_ix"],
            "prompt": example["prompt"],
            "target": example["target"],
            "references": [] if set_name == "train" else [example["target"]],
        }
  elif self.builder_config.name == "totto":
    if "challenge" in set_name:
      # Same challenge-set re-keying as above.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        if not exple:
          continue
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        yield id_, exple
    else:
      with tf.io.gfile.GFile(filepath) as json_file:
        # Line-delimited JSON, one table per line. `i` counts tables,
        # `id_` counts yielded examples.
        json_list = list(json_file)
      id_ = -1
      i = -1
      for json_str in json_list:
        result = json.loads(json_str)
        if set_name == "train":
          # Train: one example per sentence annotation; no references.
          i += 1
          for sentence in result["sentence_annotations"]:
            id_ += 1
            response = {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "totto_id":
                    i,
                "table_page_title":
                    result["table_page_title"],
                "table_webpage_url":
                    result["table_webpage_url"],
                "table_section_title":
                    result["table_section_title"],
                "table_section_text":
                    result["table_section_text"],
                "table":
                    result["table"],
                "highlighted_cells":
                    result["highlighted_cells"],
                "example_id":
                    str(result["example_id"]),
                "overlap_subset":
                    "none",
                "sentence_annotations": [sentence],
                "references": [],
                "target":
                    sentence["final_sentence"],
            }
            yield id_, response
        else:
          # Validation/test: one example per table; references are the
          # final sentences of the annotations (empty for test).
          id_ += 1
          response = {
              "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
              "totto_id": id_,
              "table_page_title": result["table_page_title"],
              "table_webpage_url": result["table_webpage_url"],
              "table_section_title": result["table_section_title"],
              "table_section_text": result["table_section_text"],
              "table": result["table"],
              "highlighted_cells": result["highlighted_cells"],
              "example_id": str(result["example_id"]),
              "overlap_subset": str(result["overlap_subset"]),
          }
          response[
              "sentence_annotations"] = [] if set_name == "test" else result[
                  "sentence_annotations"]
          response["references"] = [
              sentence["final_sentence"]
              for sentence in response["sentence_annotations"]
          ]
          # Target is the first reference, or "" when there are none.
          if response["references"]:
            response["target"] = response["references"][0]
          else:
            response["target"] = ""
          yield id_, response
  elif self.builder_config.name.startswith("web_nlg"):
    if "challenge" in set_name:
      # Same challenge-set re-keying as above.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        if not exple:
          continue
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        yield id_, exple
    else:
      with tf.io.gfile.GFile(filepath) as f:
        # JSON file with the entries under "values"; each entry maps one
        # input to a list of target verbalisations.
        examples = json.load(f)
        id_ = -1
        for example in examples["values"]:
          if set_name == "train":
            # Train: fan out one example per target; no references.
            for target in example["target"]:
              id_ += 1
              yield id_, {
                  "gem_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "gem_parent_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "input":
                      example["input"],
                  "target":
                      target,
                  "references": []
                                if set_name == "train" else example["target"],
                  "category":
                      example["category"],
                  "webnlg_id":
                      example["webnlg-id"],
              }
          else:
            # Validation/test: one example per input; all targets are
            # references and the first one the target (or "" if none).
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "input":
                    example["input"],
                "target":
                    example["target"][0] if example["target"] else "",
                "references":
                    example["target"],
                "category":
                    example["category"],
                "webnlg_id":
                    example["webnlg-id"],
            }
  elif self.builder_config.name == "wiki_auto_asset_turk":
    if set_name in ["train", "validation"]:
      # Tab-separated source/target pairs, one per line.
      keys = [
          "source",
          "target",
      ]
      with tf.io.gfile.GFile(filepath) as f:
        for id_, line in enumerate(f):
          values = line.strip().split("\t")
          assert len(
              values) == 2, f"Not enough fields in ---- {line} --- {values}"
          example = dict([(k, val) for k, val in zip(keys, values)])
          example["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          example[
              "gem_parent_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          example["references"] = [] if set_name == "train" else [
              example["target"]
          ]
          yield id_, example
    elif set_name == "test_turk":
      # Pre-built JSON examples; internal source/target ids are dropped.
      # NOTE(review): unlike the other challenge-style branches, gem_id is
      # kept exactly as loaded from the file — confirm this is intentional.
      examples = json.load(tf.io.gfile.GFile(filepath))
      for id_, example in enumerate(examples):
        example["gem_parent_id"] = example["gem_id"]
        for k in ["source_id", "target_id"]:
          if k in example:
            del example[k]
        yield id_, example
    elif set_name == "test_asset":
      # `filepaths` is a list of parallel line-aligned files: the first is
      # the source, the rest are reference simplifications (the first
      # reference doubles as the target).
      files = [tf.io.gfile.GFile(f_name) for f_name in filepaths]
      for id_, lines in enumerate(zip(*files)):
        yield id_, {
            "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "target": lines[1].strip(),
            "source": lines[0].strip(),
            "references": [line.strip() for line in lines[1:]],
        }
    else:
      # Remaining challenge sets: re-key and drop internal ids.
      exples = json.load(tf.io.gfile.GFile(filepath))
      if isinstance(exples, dict):
        assert len(exples) == 1, "multiple entries found"
        exples = list(exples.values())[0]
      for id_, exple in enumerate(exples):
        exple["gem_parent_id"] = exple["gem_id"]
        exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
        for k in ["source_id", "target_id"]:
          if k in exple:
            del exple[k]
        yield id_, exple
  elif self.builder_config.name.startswith("wiki_lingua"):
    # `filepath` is a directory of four parallel line-aligned files:
    # source/target in `lang` and in English. The English target is used
    # as the example target.
    with tf.io.gfile.GFile(os.path.join(filepath,
                                        f"{set_name}.src.{lang}")) as f_in_ln:
      with tf.io.gfile.GFile(os.path.join(filepath,
                                          f"{set_name}.src.en")) as f_in_en:
        with tf.io.gfile.GFile(
            os.path.join(filepath, f"{set_name}.tgt.{lang}")) as f_out_ln:
          with tf.io.gfile.GFile(
              os.path.join(filepath, f"{set_name}.tgt.en")) as f_out_en:
            for id_, (src_ln, src_en, tgt_ln, tgt_en) in enumerate(
                zip(f_in_ln, f_in_en, f_out_ln, f_out_en)):
              yield id_, {
                  "gem_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "gem_parent_id":
                      f"{self.builder_config.name}-{set_name}-{id_}",
                  "source_aligned": {
                      lang: src_ln.strip(),
                      "en": src_en.strip()
                  },
                  "target_aligned": {
                      lang: tgt_ln.strip(),
                      "en": tgt_en.strip()
                  },
                  "source":
                      src_ln.strip(),
                  "target":
                      tgt_en.strip(),
                  "references": []
                                if set_name == "train" else [tgt_en.strip()],
              }
  elif self.builder_config.name == "xsum":
    if "challenge" in set_name:
      if "covid" in set_name:
        # The covid challenge set is line-delimited JSON rather than a
        # pre-built example file.
        with tf.io.gfile.GFile(filepath) as f:
          id_ = -1
          for line in f:
            data = json.loads(line)
            id_ += 1
            yield id_, {
                "gem_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "gem_parent_id":
                    f"{self.builder_config.name}-{set_name}-{id_}",
                "xsum_id":
                    data["url"],
                "document":
                    data["text"],
                "target":
                    data["summary"],
                "references": []
                              if set_name == "train" else [data["summary"]],
            }
      else:
        # Other challenge sets: same re-keying pattern as above.
        exples = json.load(tf.io.gfile.GFile(filepath))
        if isinstance(exples, dict):
          assert len(exples) == 1, "multiple entries found"
          exples = list(exples.values())[0]
        for id_, exple in enumerate(exples):
          exple["gem_parent_id"] = exple["gem_id"]
          exple["gem_id"] = f"{self.builder_config.name}-{set_name}-{id_}"
          yield id_, exple
    else:
      # `filepath` holds the split->article-id mapping; `filepaths` is the
      # directory of per-article ".summary" files.
      with tf.io.gfile.GFile(filepath) as f:
        split_ids = json.load(f)
      for id_, i in enumerate(split_ids[set_name]):
        with tf.io.gfile.GFile(os.path.join(filepaths, i + ".summary")) as f:
          # Drop boilerplate lines (and blank lines) before segmenting.
          text = "".join([
              line for line in f.readlines()
              if line not in _XSUM_REMOVE_LINES and line.strip()
          ])
        # Articles are segmented by "[SN]" markers; this indexing takes
        # segment 8 as the document body and segment 6 as the one-sentence
        # summary — presumably fixed by the XSum file format; verify.
        segs = text.split("[SN]")
        yield id_, {
            "gem_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "gem_parent_id": f"{self.builder_config.name}-{set_name}-{id_}",
            "xsum_id": i,
            "document": segs[8].strip(),
            "target": segs[6].strip(),
            "references": [] if set_name == "train" else [segs[6].strip()],
        }