in kilt/datasets/entity_linking.py [0:0]
def process_chunk(self, lines, ks, chunk_id=-1):
    """Turn a chunk of tab-separated token lines into KILT records.

    The input is read line by line. A line containing ``-DOCSTART-`` opens a
    new document; every other line is one token, optionally followed by
    tab-separated annotation columns:

        0: token, 1: "B"/"I" tag, 2: mention, 3: YAGO2 entity (unused),
        4: Wikipedia URL, 5: Wikipedia ID, 6: Freebase id (unused, optional)

    Each "B"-tagged line that carries the columns up to the Wikipedia ID
    starts a new question. A question records the tokens seen before the
    mention in the document ("left_context") and the tokens seen after the
    mention has ended ("right_context"). Lines with fewer columns are
    treated as plain, non-annotated tokens and close the current mention.

    Args:
        lines: iterable of raw text lines.
        ks: not used by this method; ``self.ks`` is what is forwarded to
            ``convert_to_KILT_format``.
        chunk_id: not used by this method.

    Returns:
        A tuple ``(kilt_records, metadata)`` where ``metadata`` is always an
        empty list.

    Note:
        ``doc_id`` is only assigned when a ``-DOCSTART-`` line is seen, so an
        annotated "B" line appearing before the first document header fails
        with an unbound-variable error. An unknown tag in column 1 prints a
        message and terminates the process via ``sys.exit(-1)``.
    """
    kilt_records = []
    # tokens seen so far in the current document
    left_context = []
    # questions (one per annotated "B" line) of the current document
    document_questions = []
    # True while the tokens being read still belong to the latest mention
    open_entity = False
    # index of the next question within the current document
    question_i = 0

    for line in tqdm(lines):
        if "-DOCSTART-" in line:
            # The document id is the text after the last "(" with the final
            # two characters dropped.
            doc_id = line.split("(")[-1][:-2]

            # A new header also ends the previous document: close any open
            # mention and hand the collected questions over for conversion.
            open_entity = False
            kilt_records.extend(
                convert_to_KILT_format(
                    document_questions,
                    self.ks,
                    self.id_filter_positive,
                    self.id_filter_negative,
                )
            )

            # reset the per-document state
            left_context = []
            document_questions = []
            question_i = 0
            continue

        columns = line.split("\t")
        token = columns[0].strip()

        # Column 5 (Wikipedia ID) is read below, so at least six columns are
        # required; shorter lines are handled as non-annotated tokens.
        if len(columns) >= 6:
            bio_tag = columns[1]
            mention = columns[2]
            wikipedia_url = columns[4]
            wikipedia_id = columns[5]

            if bio_tag == "B":
                # first token of a mention: open a new question
                document_questions.append(
                    {
                        "id": "{}:{}".format(doc_id, question_i),
                        "mention": mention,
                        "Wikipedia_URL": wikipedia_url,
                        "Wikipedia_ID": wikipedia_id,
                        # snapshot: the shared list keeps growing afterwards
                        "left_context": left_context.copy(),
                        "right_context": [],
                    }
                )
                open_entity = True
                question_i += 1
            elif bio_tag != "I":
                # "I" continues the current mention and needs no action;
                # anything else is malformed input.
                print("Invalid B_I {}".format(bio_tag))
                sys.exit(-1)
        else:
            # a non-annotated token ends the current mention, if any
            open_entity = False

        left_context.append(token)

        # Every question but the latest has certainly ended, so the token is
        # part of its right context.
        for earlier_question in document_questions[:-1]:
            earlier_question["right_context"].append(token)

        # The latest question only receives the token once its mention has
        # been closed; tokens of the mention itself are skipped.
        if document_questions and not open_entity:
            document_questions[-1]["right_context"].append(token)

    # The last document has no following header, so flush it explicitly.
    kilt_records.extend(
        convert_to_KILT_format(
            document_questions,
            self.ks,
            self.id_filter_positive,
            self.id_filter_negative,
        )
    )
    return kilt_records, []  # no metadata