in kilt/datasets/hotpotqa.py [0:0]
def process_chunk(self, chunk, ks, chunk_id=-1):
    """Convert a chunk of HotpotQA datapoints into KILT records.

    For every supporting fact ``(title, sent_id)`` of a datapoint the gold
    sentence is looked up in ``self.hotpotqa_ks``. Depending on
    ``self.get_only_original_evidence`` the provenance entry is either the
    original sentence itself, or the span in the knowledge-source page that
    ``utils.match_answer`` scores highest for that sentence.

    Args:
        chunk: sized sequence of datapoint dicts; the keys read here are
            ``_id``, ``question``, ``level``, ``type``,
            ``supporting_facts`` and ``answer``.
        ks: knowledge source exposing ``get_pages_by_title(title)``; only
            used when ``self.get_only_original_evidence`` is falsy.
        chunk_id: identifier shown in the progress line (not used
            otherwise).

    Returns:
        A tuple ``(kilt_data, metadata)`` where ``kilt_data`` is the list
        of converted records and ``metadata`` is
        ``[missing_pages, exact_match, fuzzy_match]`` (float counters).
        Datapoints for which a supporting page is missing from ``ks`` are
        counted in ``missing_pages`` and are NOT included in ``kilt_data``.
    """
    missing_pages = 0.0
    exact_match = 0.0
    fuzzy_match = 0.0
    n = len(chunk)
    kilt_data = []

    for idx, datapoint in enumerate(chunk):
        # single-line progress report, overwritten in place via "\r"
        print(
            "t: {}, p: {:.2f} %, mp: {:.1f}, exact: {:.1f}, fuzzy: {:.1f}".format(
                chunk_id,
                round(idx * 100 / n, 2),
                missing_pages,
                exact_match,
                fuzzy_match,
            ),
            end="\r",
        )
        sys.stdout.flush()

        kilt_record = {
            # original data point id if available otherwise unique id
            "id": datapoint["_id"],
            # question / claim / sentence
            "input": datapoint["question"],
            # dataset/task specific
            "meta": {"level": datapoint["level"], "type": datapoint["type"]},
        }
        kilt_record_provenance = []

        local_missing_page = False
        local_exact_match = True
        for evidence in datapoint["supporting_facts"]:
            title = evidence[0]
            sent_id = evidence[1]

            # An out-of-range sentence id is reported and leaves the gold
            # text empty, so the evidence is kept but cannot be matched.
            text = ""
            try:
                text = self.hotpotqa_ks[title]["text"][sent_id]
            except IndexError as e:
                print(
                    "\nIndexError: {}\ntitle:{}\nsent_id:{}\n".format(
                        e, title, sent_id
                    )
                )

            if self.get_only_original_evidence:
                kilt_record_provenance.append(
                    {"text": text, "title": title, "sent_id": sent_id}
                )
                continue

            pages = ks.get_pages_by_title(title)
            if not pages:
                # one missing page invalidates the whole datapoint
                local_missing_page = True
                break

            bleu = -1
            paragraph_id = -1
            start_character = -1
            end_character = -1
            best_page = None
            for page in pages:
                # it is unlikely, but there could be multiple pages for a title (e.g., disambiguation)
                if not text:
                    continue
                (
                    local_paragraph_id,
                    local_start_character,
                    local_end_character,
                    local_bleu,
                ) = utils.match_answer(text, page, nlp=self.nlp, debug=False)
                if local_bleu > bleu:
                    # keep the page together with its span so the ids
                    # written below refer to the page the offsets index into
                    best_page = page
                    paragraph_id = local_paragraph_id
                    start_character = local_start_character
                    end_character = local_end_character
                    bleu = local_bleu

            if best_page is None:
                # nothing could be matched (empty text or no score above
                # -1): fall back to the last candidate page
                best_page = page

            if bleu != 1.0:
                local_exact_match = False

            kilt_record_provenance.append(
                # list of relevant WikipediaPages / Spans as provenance for the answer from the ks
                {
                    # *mandatory* - ID Wikipedia Page
                    "wikipedia_id": best_page["wikipedia_id"],
                    # *mandatory* - Title Wikipedia Page
                    "title": best_page["wikipedia_title"],
                    # start paragraph id with relevant info
                    "start_paragraph_id": paragraph_id,
                    "start_character": start_character,
                    # end paragraph id
                    "end_paragraph_id": paragraph_id,
                    "end_character": end_character,
                    # 1.0 when gold data is exactly matched, lower for fuzzy matches
                    "bleu_score": bleu,
                }
            )

        if local_missing_page:
            missing_pages += 1
            continue
        if local_exact_match:
            exact_match += 1
        else:
            fuzzy_match += 1

        kilt_record["output"] = [
            {"answer": datapoint["answer"], "provenance": kilt_record_provenance}
        ]
        kilt_data.append(kilt_record)

        if self.debug:
            # interactive inspection: blocks on stdin after each dump
            pp = pprint.PrettyPrinter(indent=4)
            print("original datapoint:")
            pp.pprint(datapoint)
            input("...")
            print("kilt record:")
            pp.pprint(kilt_record)
            input("...")

    metadata = [missing_pages, exact_match, fuzzy_match]
    return kilt_data, metadata