in kilt/datasets/fact_verification.py [0:0]
def process_chunk(self, chunk, ks, chunk_id=-1):
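    """
    Map one chunk of FEVER evidence annotations to KILT-format records.

    chunk: list of datapoints, each carrying a Wikipedia "page_id", a "sent_id"
        and the evidence sentence "text".
    ks: knowledge source used to resolve Wikipedia pages by URL.
    chunk_id: identifier used only in the progress line.

    Returns (kilt_data, metadata), where metadata is
    [missing_pages, exact_match, fuzzy_match] for this chunk.
    """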
missing_pages = 0.0
exact_match = 0.0
fuzzy_match = 0.0
n = len(chunk)
kilt_data = []
metadata = []
for idx, datapoint in enumerate(chunk):
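        # lightweight progress line: chunk id, % processed, and running match statistics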
        print(
            "t: {}, p: {:.2f} %, mp: {:.1f}, exact: {:.1f}, fuzzy: {:.1f}".format(
                chunk_id,
                idx * 100 / n,
                missing_pages,
                exact_match,
                fuzzy_match,
            ),
            end="\r",
        )
sys.stdout.flush()
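        # each datapoint references a Wikipedia page and a sentence index within that page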
page_id = datapoint["page_id"]
sent_id = datapoint["sent_id"]
text = datapoint["text"]
        # skip datapoints with a missing or empty evidence sentence
        if not text:
continue
url = "https://en.wikipedia.org/wiki/" + self._normalize(
datapoint["page_id"]
)
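        # resolve the page in the knowledge source via its Wikipedia URL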
page = ks.get_page_from_url(url)
if not page:
missing_pages += 1
else:
# get and validate evidence sentence
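            # per-datapoint counters: exact (sem) and fuzzy (sfm) sentence matches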
local_sem = 0.0
local_sfm = 0.0
            kilt_record = {
                # FEVER page id and sentence id identifying the evidence sentence
                "page_id": page_id,
                "sentence_id": sent_id,
                "evidence_text": text,
            }
kilt_record_output = []
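            # locate the evidence sentence within the retrieved page: match_answer returns
            # the paragraph id, the character span, and a BLEU score for the (possibly fuzzy) match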
paragraph_id, start_character, end_character, bleu = utils.match_answer(
text, page, nlp=self.nlp, debug=False
)
            kilt_record_output.append(
                {
                    # answer in textual form
                    "answer": text,
                    "provenance": [
                        # list of relevant WikipediaPages / Spans as provenance for the answer from the ks
                        {
                            "wikipedia_id": page["wikipedia_id"],  # *mandatory* - ID Wikipedia Page
                            "title": page["wikipedia_title"],  # *mandatory* - Title Wikipedia Page
                            "start_paragraph_id": paragraph_id,  # start paragraph id with relevant info
                            "start_character": start_character,
                            "end_paragraph_id": paragraph_id,  # end paragraph id
                            "end_character": end_character,
                            "bleu_score": bleu,  # 1.0 when gold data is exactly matched, lower for fuzzy matches
                            "meta": {  # dataset/task specific
                                "fever_page_id": page_id,
                                "fever_sentence_id": sent_id,
                            },
                        }
                    ],
                }
            )
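            # bleu == 1 means the evidence was matched verbatim; anything in [0, 1) is a fuzzy match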
            if bleu == 1:
                local_sem += 1
            elif 0 <= bleu < 1:
                local_sfm += 1
            else:
                print("ERROR: invalid bleu: {}".format(bleu))
                sys.exit(-1)
# update kilt data
kilt_record["output"] = kilt_record_output
kilt_data.append(kilt_record)
            exact_match += local_sem
            fuzzy_match += local_sfm
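    # per-chunk statistics returned alongside the KILT records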
metadata = [missing_pages, exact_match, fuzzy_match]
return kilt_data, metadata
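
# Hypothetical usage sketch (not part of the original file): process_chunk is an
# instance method, so a driver would call it roughly like
#
#   kilt_data, (missing, exact, fuzzy) = dataset_instance.process_chunk(chunk, ks, chunk_id=0)
#
# where `dataset_instance` is an object of the class defining this method and `ks`
# is a knowledge source exposing get_page_from_url(); both names are assumptions here.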