in kilt/datasets/fact_verification.py [0:0]
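# NOTE: this excerpt omits the module-level imports of the full file.
# The method below relies on:
#   import json
#   import unicodedata
# plus a utils module providing chunk_it (its exact import path is not
# shown in this excerpt).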
def get_chunks(self, num_chunks):
    # Read the claims and build the set of wiki pages in which the
    # evidence sentences have to be looked up.
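    # Each input line is one FEVER claim record. A hedged sketch of the
    # fields this loop relies on (layout per the FEVER data release):
    #   {"verifiable": "VERIFIABLE" | "NOT VERIFIABLE",
    #    "evidence": [[[annotation_id, evidence_id, page_id, sent_id], ...], ...]}
    # i.e. evidence[2] holds the wiki page id and evidence[3] the
    # sentence number within that page.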
    page_to_evidence_sents = {}
    with open(self.claims_input_file, "r") as infile:
        for line in infile:
            claim = json.loads(line)
            if "verifiable" in claim and claim["verifiable"] == "NOT VERIFIABLE":
                continue
            evidence_sets = claim["evidence"]
            for evidence_set in evidence_sets:
                for evidence in evidence_set:
                    if evidence[2]:
                        page_id = unicodedata.normalize("NFKD", evidence[2])
                    else:
                        # Evidence entries without a page id can be ignored;
                        # they are an artefact of merging duplicates where
                        # annotators disagreed over the label.
                        break
                    sent_id = int(evidence[3])
                    if page_id not in page_to_evidence_sents:
                        page_to_evidence_sents[page_id] = {}
                    page_to_evidence_sents[page_id][sent_id] = None
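    # The FEVER evidence dump ships as 109 shards, wiki-001.jsonl through
    # wiki-109.jsonl (hence range(1, 110)); each line is one page object,
    # roughly of the form (a hedged sketch):
    #   {"id": <page id>, "text": <full article text>,
    #    "lines": "0\t<sentence 0>\t<anchor>...\n1\t<sentence 1>\t..."}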
    for idx in range(1, 110):
        filename = self.evidence_directory_path + f"/wiki-{idx:03}.jsonl"
        print(f"processing filename {filename}")
        with open(filename, "r") as fin:
            for line in fin:
                wiki_page = json.loads(line.strip())
                page_id = wiki_page["id"]
                if page_id not in page_to_evidence_sents:
                    continue
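                # "lines" is a newline-separated list of tab-separated
                # records: "<sent_id>\t<sentence text>\t<anchor text>...".
                # A stray newline inside a sentence produces a record whose
                # first tab field is not a sentence id; the loop below
                # repairs the case where that field is empty.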
                lines = wiki_page["lines"].split("\n")
                sentences = []
                for l in lines:
                    line_fields = l.split("\t")
                    # skip empty sentences
                    if len(line_fields) < 2 or line_fields[1] == "":
                        continue
                    # skip lines whose first field is neither a sentence id
                    # nor empty (an empty field marks a continuation of the
                    # previous sentence, handled below)
                    if line_fields[0] != "" and not line_fields[0].isdigit():
                        continue
                    sent_text = line_fields[1]
                    if line_fields[0] == "":
                        # There is no id, so the newline character was likely
                        # a formatting error; ignore it and append the text
                        # to the previous sentence.
                        if sentences:
                            sentences[-1]["text"] += " " + sent_text
                    else:
                        sentences.append(
                            {
                                "id": line_fields[0],
                                "text": sent_text,
                            }
                        )
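                # Worked example (hedged): a "lines" value of
                #   "0\tFirst sentence .\tAnchor\n1\tSecond sentence ."
                # yields sentences == [{"id": "0", "text": "First sentence ."},
                #                      {"id": "1", "text": "Second sentence ."}]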
                for sentence in sentences:
                    sent_id = int(sentence["id"])
                    sent_text = sentence["text"]
                    if sent_id in page_to_evidence_sents[page_id]:
                        page_to_evidence_sents[page_id][sent_id] = sent_text
    data = []
    for page_id, sents in page_to_evidence_sents.items():
        for sent_id, sent_text in sents.items():
            data.append(
                {
                    "page_id": page_id,
                    "sent_id": sent_id,
                    "text": sent_text,
                }
            )
    n = len(data)
    print(f"{n} examples in the dataset")
    return utils.chunk_it(data, num_chunks)
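
# --- Not part of the original file: a minimal sketch of the chunking
# helper this method relies on. utils.chunk_it is defined elsewhere in
# the package; this hypothetical stand-in only illustrates the expected
# contract: split a list into num_chunks roughly equal pieces.
def _chunk_it_sketch(data, num_chunks):
    # Distribute the remainder one item at a time so chunk sizes differ
    # by at most one.
    base, extra = divmod(len(data), num_chunks)
    chunks, start = [], 0
    for i in range(num_chunks):
        end = start + base + (1 if i < extra else 0)
        chunks.append(data[start:end])
        start = end
    return chunks

# Usage sketch (hypothetical names): each returned chunk is a list of
# {"page_id", "sent_id", "text"} dicts that a worker can process
# independently, e.g.
#   for chunk in dataset.get_chunks(num_chunks=32):
#       process(chunk)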