in kilt/datasets/natural_questions.py [0:0]
def process_chunk(self, chunk, ks, chunk_id=-1):
    """Convert a chunk of Natural Questions datapoints into KILT records.

    For every datapoint the Wikipedia page behind ``document_url`` is looked
    up in the knowledge source. When the page is available, each short answer
    and the long answer of each annotation is cut out of the whitespace-split
    ``document_text`` and aligned to the page with ``utils.match_answer``;
    the alignment is stored as provenance of the corresponding KILT output.

    Args:
        chunk: sized iterable of datapoints (mappings). Datapoints that lack
            ``document_text`` are first converted with
            ``text_utils.simplify_nq_example``.
        ks: knowledge source exposing ``get_page_from_url(url)``; a falsy
            result means the page is missing. A page is indexed by
            ``"wikipedia_id"`` and ``"wikipedia_title"``.
        chunk_id: identifier shown in the progress line only.

    Returns:
        A tuple ``(kilt_data, metadata)``. ``kilt_data`` is the list of KILT
        records (``id``, ``input``, ``output``) for the datapoints whose page
        was found. ``metadata`` is
        ``[missing_pages, short_exact_match, short_fuzzy_match]``; the two
        match counters are never updated here and are always ``0.0``.

    Raises:
        SystemExit: with status ``-1`` when ``utils.match_answer`` returns a
            BLEU score outside ``[0, 1]``.
    """

    def build_answer_output(answer_tokens, annotation, page):
        """Align one answer span to *page* and build its KILT output entry."""
        answer_span = " ".join(answer_tokens).strip()
        paragraph_id, start_character, end_character, bleu = utils.match_answer(
            answer_span, page, nlp=self.nlp, debug=False
        )
        output = {
            # answer in textual form
            "answer": answer_span,
            # list of relevant WikipediaPages / Spans as provenance for the
            # answer from the ks
            "provenance": [
                {
                    # *mandatory* - ID Wikipedia Page
                    "wikipedia_id": page["wikipedia_id"],
                    # *mandatory* - Title Wikipedia Page
                    "title": page["wikipedia_title"],
                    # the span starts and ends in the same matched paragraph
                    "start_paragraph_id": paragraph_id,
                    "start_character": start_character,
                    "end_paragraph_id": paragraph_id,
                    "end_character": end_character,
                    # 1.0 when gold data is exactly matched, lower for fuzzy
                    # matches
                    "bleu_score": bleu,
                    # dataset/task specific; taken from the annotation that
                    # produced this answer, not from the first annotation
                    "meta": {
                        "yes_no_answer": annotation["yes_no_answer"],
                        "annotation_id": annotation["annotation_id"],
                    },
                }
            ],
        }
        # A score outside [0, 1] (NaN included) means the matcher
        # misbehaved: abort the whole run.
        if not 0 <= bleu <= 1:
            print("ERROR: invalid bleu: {}".format(bleu))
            sys.exit(-1)
        return output

    missing_pages = 0.0
    # NOTE: per-question averaging of exact/fuzzy matches is disabled, so
    # these two counters are reported as 0.0.
    short_exact_match = 0.0
    short_fuzzy_match = 0.0
    n = len(chunk)
    kilt_data = []

    for idx, datapoint in enumerate(chunk):
        # from standard to simplified format
        if "document_text" not in datapoint:
            # wget https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/text_utils.py
            from text_utils import simplify_nq_example

            datapoint = simplify_nq_example(datapoint)

        # single-line progress report, overwritten in place through "\r"
        print(
            "t: {}, p: {:.2f} %, mp: {:.1f}, exact: {:.1f}, fuzzy: {:.1f}".format(
                chunk_id,
                round(idx * 100 / n, 2),
                missing_pages,
                short_exact_match,
                short_fuzzy_match,
            ),
            end="\r",
        )
        sys.stdout.flush()

        page = ks.get_page_from_url(datapoint["document_url"])
        if not page:
            print("ERROR, not page!")
            missing_pages += 1
            continue

        # get and validate annotations
        annotations = datapoint["annotations"]
        kilt_record = {
            # original data point id if available otherwise unique id
            "id": datapoint["example_id"],
            # question / claim / sentence
            "input": datapoint["question_text"],
        }
        kilt_record_output = []

        # The document is tokenised at most once per datapoint, and only if
        # some annotation actually carries an answer span.
        doc_tokens = None
        for annotation in annotations:
            # all short answers first, then the long answer
            spans = []
            if "short_answers" in annotation:
                spans.extend(annotation["short_answers"])
            if "long_answer" in annotation:
                spans.append(annotation["long_answer"])

            for span in spans:
                if doc_tokens is None:
                    doc_tokens = datapoint["document_text"].split()
                # NOTE(review): a span with start_token == end_token gives an
                # empty answer string that is still sent to match_answer -
                # confirm this is intended.
                answer_tokens = doc_tokens[span["start_token"]:span["end_token"]]
                kilt_record_output.append(
                    build_answer_output(answer_tokens, annotation, page)
                )

        # update kilt data
        kilt_record["output"] = kilt_record_output
        kilt_data.append(kilt_record)

    metadata = [missing_pages, short_exact_match, short_fuzzy_match]
    return kilt_data, metadata