in blink/candidate_retrieval/dataset.py [0:0]
def _parse_conll_docs(path):
    """Parse a tab-separated CoNLL-style file into per-document records.

    A line starting with ``-DOCSTART-`` opens a new document; its name is the
    second whitespace-separated field with the first character removed.
    A blank line closes the current sentence. Every other line contributes
    its first tab-separated column as a token. Lines with at least six
    columns also describe a mention: column 2 equal to ``"I"`` extends the
    most recent mention by one token, any other value starts a new mention
    whose ``wikilink`` is column 5.

    Returns:
        dict mapping document name to
        ``{"sentences": [[token, ...], ...], "mentions": [{...}, ...]}``,
        where each mention holds ``sent_id``, ``start``, ``end`` (exclusive)
        and ``wikilink``.
    """
    conll = {}
    cur_doc = None
    cur_sent = None
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("-DOCSTART-"):
                # A sentence not terminated by a blank line would otherwise be
                # lost, leaving its mentions with a dangling ``sent_id``.
                if cur_doc is not None and cur_sent:
                    cur_doc["sentences"].append(cur_sent)
                docname = line.split()[1][1:]
                cur_doc = {"sentences": [], "mentions": []}
                conll[docname] = cur_doc
                cur_sent = []
            elif line == "":
                # Blank separators always close a sentence (even an empty
                # one) so sentence indices stay aligned with the file layout.
                cur_doc["sentences"].append(cur_sent)
                cur_sent = []
            else:
                comps = line.split("\t")
                cur_sent.append(comps[0])
                if len(comps) < 6:
                    continue
                if comps[1] == "I":
                    cur_doc["mentions"][-1]["end"] += 1
                else:
                    cur_doc["mentions"].append(
                        {
                            "sent_id": len(cur_doc["sentences"]),
                            "start": len(cur_sent) - 1,
                            "end": len(cur_sent),
                            "wikilink": comps[4],
                        }
                    )
    # Same flush for a file that ends without a trailing blank line.
    if cur_doc is not None and cur_sent:
        cur_doc["sentences"].append(cur_sent)
    return conll


def read_conll_file(data, path):
    """Attach CoNLL sentence/mention information from ``path`` to ``data``.

    Args:
        data: dict mapping a document name to a list of mention dicts. Only
            the first whitespace-separated token of the name is used to look
            up the parsed document. Each mention dict must provide a
            ``"mention"`` string.
        path: path of the UTF-8 CoNLL-style file to read.

    Returns:
        The same ``data`` object, modified in place: the first mention of
        every non-empty document gains ``"conll_doc"`` (the parsed document)
        and every mention gains ``"conll_m"`` (the matching parsed mention).

    Raises:
        KeyError: if a document in ``data`` is absent from the file.
        IndexError: if a mention in ``data`` cannot be matched against the
            remaining mentions of its document.
    """
    conll = _parse_conll_docs(path)

    # merge with data
    rmpunc = re.compile(r"[\W]+")
    for doc_name, content in data.items():
        conll_doc = conll[doc_name.split()[0]]
        if not content:
            # Nothing to annotate for a document without mentions.
            continue
        content[0]["conll_doc"] = conll_doc

        sentences = conll_doc["sentences"]
        conll_mentions = conll_doc["mentions"]
        # Mentions are matched in order: each search resumes right after the
        # previously matched CoNLL mention.
        cur_conll_m_id = 0
        for m in content:
            mention = m["mention"]
            wanted = rmpunc.sub("", mention.lower())
            for idx in range(cur_conll_m_id, len(conll_mentions)):
                cur_conll_m = conll_mentions[idx]
                cur_conll_mention = " ".join(
                    sentences[cur_conll_m["sent_id"]][
                        cur_conll_m["start"] : cur_conll_m["end"]
                    ]
                )
                # Compare case-insensitively with all non-word characters
                # (including spaces) removed.
                if rmpunc.sub("", cur_conll_mention.lower()) == wanted:
                    m["conll_m"] = cur_conll_m
                    cur_conll_m_id = idx + 1
                    break
            else:
                raise IndexError(
                    "no CoNLL mention matching {!r} found in document {!r}".format(
                        mention, doc_name
                    )
                )

    return data