in blink/candidate_retrieval/dataset.py [0:0]
def __init__(self, path, person_path, conll_path, added_params):
    """Load every AIDA / WNED split and run the coref and CoNLL passes on it.

    Args:
        path: Prefix of the directory holding the ``aida_*.csv`` and
            ``wned-*.csv`` files (file names are appended with ``+``).
        person_path: Passed unchanged to ``load_person_names``.
        conll_path: Prefix of the directory holding the ``AIDA/`` and
            ``wned-datasets/`` CoNLL files.
        added_params: Option dict. The keys ``"generate_ments_and_cands"``
            and ``"generate_cands"`` are read here. The dict is mutated in
            place: ``"generate_cands"`` is forced to ``False`` when
            ``"generate_ments_and_cands"`` is truthy, and
            ``"cand_generator"`` is stored when either flag is truthy.
    """
    # The two generation modes are treated as exclusive: the combined
    # mentions+candidates mode switches the candidates-only flag off.
    if added_params["generate_ments_and_cands"]:
        added_params["generate_cands"] = False

    needs_generator = (
        added_params["generate_cands"] or added_params["generate_ments_and_cands"]
    )
    if needs_generator:
        added_params["cand_generator"] = get_candidate_generator(added_params)

    print(added_params)

    # (attribute name, csv file suffix) in the order the splits are loaded.
    csv_sources = (
        ("train", "/aida_train.csv"),
        ("testA", "/aida_testA.csv"),
        ("testB", "/aida_testB.csv"),
        ("ace2004", "/wned-ace2004.csv"),
        ("aquaint", "/wned-aquaint.csv"),
        ("clueweb", "/wned-clueweb.csv"),
        ("msnbc", "/wned-msnbc.csv"),
        ("wikipedia", "/wned-wikipedia.csv"),
    )
    print("load csv")
    for split_name, csv_suffix in csv_sources:
        setattr(self, split_name, read_csv_file(path + csv_suffix, added_params))

    # Drop this one wikipedia entry if it was loaded; the reason is not
    # visible from this method -- NOTE(review): confirm why it is excluded.
    self.wikipedia.pop("Jiří_Třanovský Jiří_Třanovský", None)

    print("process coref")
    person_names = load_person_names(person_path)
    # Return values are discarded, so with_coref presumably updates each
    # split in place -- verify against its definition.
    for split_name, _ in csv_sources:
        with_coref(getattr(self, split_name), person_names)

    # (attribute name, conll file suffix). testA and testB both read the
    # same aggregate file, and msnbc is visited before clueweb here.
    conll_sources = (
        ("train", "/AIDA/aida_train.txt"),
        ("testA", "/AIDA/testa_testb_aggregate_original"),
        ("testB", "/AIDA/testa_testb_aggregate_original"),
        ("ace2004", "/wned-datasets/ace2004/ace2004.conll"),
        ("aquaint", "/wned-datasets/aquaint/aquaint.conll"),
        ("msnbc", "/wned-datasets/msnbc/msnbc.conll"),
        ("clueweb", "/wned-datasets/clueweb/clueweb.conll"),
        ("wikipedia", "/wned-datasets/wikipedia/wikipedia.conll"),
    )
    print("load conll")
    for split_name, conll_suffix in conll_sources:
        read_conll_file(getattr(self, split_name), conll_path + conll_suffix)

    # Lowercase-fallback statistics are only reported in candidates-only mode.
    if not added_params["generate_cands"]:
        return

    print(
        "Number of candidates not present in p_e_m originally, but present when lowercased",
        len(added_params["cand_generator"].lower_org),
    )
    print(
        "Number of candidates not present in p_e_m originally, but present in p_e_m_lower when lowercased ",
        len(added_params["cand_generator"].lower_lower),
    )