in paper/experiments/webnlg/preprocess.py [0:0]
def parse(in_file, classification_data, num_candidates=5, max_per_operation=2):
    """Parse an XML file and build one item per usable lexicalisation.

    Every ``<lex>`` element of every entry under ``<entries>`` is visited.
    A lexicalisation is skipped (with a message printed to stdout) when its
    ``<text>`` or ``<template>`` element is missing or has empty content.
    For each kept lexicalisation, distractors are generated with
    ``get_distractors`` and an item dict is appended to the result.

    Args:
        in_file: Path of the XML file. Its file name is stored as the item's
            ``category`` and its parent directory name as ``num_triples``.
        classification_data: List that is extended in place with the
            classification items returned by ``get_distractors``.
        num_candidates: Forwarded unchanged to ``get_distractors``.
        max_per_operation: Forwarded unchanged to ``get_distractors``.

    Returns:
        A list of dicts with the keys ``raw_modifiedtripleset``,
        ``modifiedtripleset``, ``sortedtripleset``, ``references``,
        ``template``, ``text`` (the distractors followed by the original
        text as the last element), ``num_triples``, ``category`` and
        ``category_type`` (``"seen"`` or ``"unseen"``).
    """
    root = ET.parse(in_file).getroot()
    entries = list(root.find("entries"))
    items = []

    for e, entry in enumerate(entries):
        mtripleset = entry.find("modifiedtripleset")
        tripletsets = [
            process_tripleset(x)
            for x in mtripleset.findall("mtriple") + mtripleset.findall("otriple")
        ]
        modifiedtripleset = sorted(x["text"] for x in tripletsets)
        tripletset_str = " ; ".join(modifiedtripleset)

        # Every direct child of <modifiedtripleset> contributes to the raw
        # string; each piece keeps its trailing " ||| " separator.
        raw_parts = []
        # NOTE(review): modtripleset is never read in this function. It is
        # kept because the unpack and Triple construction fail loudly on a
        # triple that is not "e1 | pred | e2" -- confirm whether that
        # validation is relied on before removing it.
        modtripleset = []
        for mtriple in mtripleset:
            e1, pred, e2 = mtriple.text.split(" | ")
            raw_parts.append(mtriple.text + " ||| ")
            modtripleset.append(Triple(cleanup(e1), pred, cleanup(e2)))
        raw_tripleset = "".join(raw_parts)

        for lex in entry.findall("lex"):
            # Each striple text is followed by ", ", including the last one.
            sortedtripleset = "".join(
                process_tripleset(x)["text"] + ", "
                for sent in lex.find("sortedtripleset").findall("sentence")
                for x in sent.findall("striple")
            )
            references = cleanup(lex.find("references"))

            # Compare against None explicitly: an Element without children
            # is falsy, so a plain truth test would misreport it as missing.
            text_node = lex.find("text")
            if text_node is None:
                print("exception text")
                continue
            text = text_node.text
            if not text:
                print("empty text")
                continue

            template_node = lex.find("template")
            if template_node is None:
                print("exception template")
                continue
            template = template_node.text
            if not template:
                print("empty template")
                continue

            # preprocess distractors
            # The candidate lists are rebuilt for every call so that
            # get_distractors always receives fresh list objects.
            subjects = [x["dict"]["subject"] for x in tripletsets]
            objects = [x["dict"]["object"] for x in tripletsets]
            swapping_candidates = [subjects + objects]
            cutting_candidates = [subjects + objects]
            random_text = get_nearby_text(entries, e)

            distractors, classification_items = get_distractors(
                tripletset_str,
                text,
                swapping_candidates,
                cutting_candidates,
                random_text,
                num_candidates=num_candidates,
                max_per_operation=max_per_operation,
            )
            classification_data.extend(classification_items)

            in_path = Path(in_file)
            category = in_path.name
            items.append(
                {
                    "raw_modifiedtripleset": raw_tripleset,
                    "modifiedtripleset": tripletset_str,
                    "sortedtripleset": sortedtripleset,
                    "references": references,
                    "template": template,
                    "text": distractors + [text],
                    "num_triples": in_path.parent.name,
                    "category": category,
                    "category_type": (
                        "seen" if category in seen_categories else "unseen"
                    ),
                }
            )

    return items