in scripts/create_BLINK_benchmark_data.py [0:0]
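# NOTE: this excerpt relies on module-level names defined elsewhere in the script:
#   sys, tqdm                        -- stdlib / progress bar
#   BEGIN_ENT_TOKEN, END_ENT_TOKEN   -- mention span markers
#   url2id_cache                     -- dict mapping Wikipedia URL -> page id
#   _get_pageid_from_api(title)      -- resolves a page id via the Wikipedia API
#   pp                               -- pretty-printer used by the debug block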
def extract_questions(filename):
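    """Parse an AIDA-CoNLL-style TSV file into entity-linking datapoints.

    Documents are delimited by "-DOCSTART- (<doc_id>)" lines; token lines are
    tab-separated, and annotated mention lines additionally carry a B/I flag,
    the mention string, and the gold entity's Wikipedia URL and ID. Returns
    one dict per mention, with the mention span marked by BEGIN_ENT_TOKEN /
    END_ENT_TOKEN inside "input" and the left/right token contexts also
    stored separately.
    """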
# all the datapoints
global_questions = []
# left context so far in the document
left_context = []
# working datapoints for the document
document_questions = []
# is the entity open
open_entity = False
# question id in the document
question_i = 0
with open(filename) as fin:
lines = fin.readlines()
for line in tqdm(lines):
if "-DOCSTART-" in line:
# new document is starting
doc_id = line.split("(")[-1][:-2]
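                # e.g. "-DOCSTART- (1 EU)\n" -> "1 EU": keep the text after the
                # last "(" and drop the trailing ")\n"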
                # END DOCUMENT: a -DOCSTART- line also closes the previous document
                # if the previous document ended inside a mention, close the span
if open_entity:
document_questions[-1]["input"].append(END_ENT_TOKEN)
open_entity = False
"""
#DEBUG
for q in document_questions:
pp.pprint(q)
input("...")
"""
                # add the finished document's datapoints to global_questions
global_questions.extend(document_questions)
# reset
left_context = []
document_questions = []
question_i = 0
else:
split = line.split("\t")
token = split[0].strip()
                if len(split) >= 6:  # annotated mention line (split[5] is read below)
B_I = split[1]
mention = split[2]
# YAGO2_entity = split[3]
Wikipedia_URL = split[4]
Wikipedia_ID = split[5]
                    # Freebase_id = split[6]
if B_I == "I":
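                        # continuation token of the current mention: nothing to do,
                        # the full surface form arrived with the "B" line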
pass
elif B_I == "B":
title = Wikipedia_URL.split("/")[-1].replace("_", " ")
if Wikipedia_ID == "000":
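                            # "000" marks a missing page id: recover it from the
                            # Wikipedia API by title, memoizing per URL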
if Wikipedia_URL in url2id_cache:
pageid = url2id_cache[Wikipedia_URL]
else:
pageid = _get_pageid_from_api(title)
url2id_cache[Wikipedia_URL] = pageid
Wikipedia_ID = pageid
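                        # new datapoint: the input so far is the left context plus
                        # the mention-start marker; END_ENT_TOKEN and the right
                        # context are appended as later lines stream in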
q = {
"id": "{}:{}".format(doc_id, question_i),
"input": left_context.copy() + [BEGIN_ENT_TOKEN],
"mention": mention,
"Wikipedia_title": title,
"Wikipedia_URL": Wikipedia_URL,
"Wikipedia_ID": Wikipedia_ID,
"left_context": left_context.copy(),
"right_context": [],
}
document_questions.append(q)
open_entity = True
question_i += 1
                    else:
                        print("Invalid B_I {}".format(B_I))
                        sys.exit(-1)
# print(token,B_I,mention,Wikipedia_URL,Wikipedia_ID)
else:
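                    # plain token line (no mention annotation)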
if open_entity:
document_questions[-1]["input"].append(END_ENT_TOKEN)
open_entity = False
left_context.append(token)
for q in document_questions:
q["input"].append(token)
for q in document_questions[:-1]:
q["right_context"].append(token)
if len(document_questions) > 0 and not open_entity:
document_questions[-1]["right_context"].append(token)
    # FINAL DOCUMENT: the file does not end with -DOCSTART-, so flush the last one
if open_entity:
document_questions[-1]["input"].append(END_ENT_TOKEN)
open_entity = False
    # add the last document's datapoints to global_questions
global_questions.extend(document_questions)
return global_questions
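
# --- usage sketch (illustrative; not part of the original script) ---
# The input path and output file name are assumptions, and the module-level
# dependencies noted above must be in place.
import json

questions = extract_questions("AIDA-YAGO2-dataset.tsv")  # path is an assumption
with open("blink_benchmark_questions.jsonl", "w") as fout:
    for q in questions:
        fout.write(json.dumps(q) + "\n")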