def read_conll_file()

in blink/candidate_retrieval/dataset.py [0:0]


def _load_conll_documents(path):
    """Parse the CoNLL-style file at *path* into per-document sentences and mentions.

    The file is read line by line (each line is whitespace-stripped first):

    * A line starting with ``-DOCSTART-`` opens a new document. The document
      name is the second whitespace-separated token with its first character
      removed.
    * An empty line closes the current sentence (even if it holds no tokens).
    * Any other line is split on tabs; field 0 is appended to the current
      sentence as a token. Lines with at least six fields also describe a
      mention: field 1 equal to ``"I"`` extends the most recent mention by one
      token, anything else starts a new mention whose ``wikilink`` is field 4.

    Returns:
        dict mapping document name to
        ``{"sentences": list[list[str]], "mentions": list[dict]}`` where each
        mention dict has the keys ``sent_id``, ``start``, ``end`` (exclusive)
        and ``wikilink``.
    """
    conll = {}
    with open(path, "r", encoding="utf8") as f:
        cur_doc = None
        cur_sent = []

        for line in f:
            line = line.strip()

            if line.startswith("-DOCSTART-"):
                # A sentence not terminated by a blank line would otherwise be
                # dropped, leaving its mentions pointing at a missing sent_id.
                if cur_doc is not None and cur_sent:
                    cur_doc["sentences"].append(cur_sent)
                docname = line.split()[1][1:]
                cur_doc = conll[docname] = {"sentences": [], "mentions": []}
                cur_sent = []
                continue

            if cur_doc is None:
                # Nothing can be attributed to a document before the first
                # -DOCSTART- marker, so such lines are ignored.
                continue

            if not line:
                cur_doc["sentences"].append(cur_sent)
                cur_sent = []
                continue

            comps = line.split("\t")
            cur_sent.append(comps[0])

            if len(comps) < 6:
                continue

            if comps[1] == "I":
                # Continuation token: widen the mention opened most recently.
                cur_doc["mentions"][-1]["end"] += 1
            else:
                cur_doc["mentions"].append(
                    {
                        "sent_id": len(cur_doc["sentences"]),
                        "start": len(cur_sent) - 1,
                        "end": len(cur_sent),
                        "wikilink": comps[4],
                    }
                )

        # Flush a trailing sentence when the file does not end with a blank line.
        if cur_doc is not None and cur_sent:
            cur_doc["sentences"].append(cur_sent)

    return conll


def read_conll_file(data, path):
    """Attach CoNLL sentence/mention information from *path* to *data* in place.

    Args:
        data: dict mapping a document name to a list of mention dicts. Only
            the first whitespace-separated token of the document name is used
            to look up the CoNLL document. Each mention dict must provide a
            ``"mention"`` string.
        path: path of the CoNLL-style file to read (opened as UTF-8).

    Returns:
        The same *data* object. For every document with at least one mention,
        the first mention dict gains ``"conll_doc"`` (the parsed document) and
        every mention dict gains ``"conll_m"`` (the matched CoNLL mention).

    Raises:
        KeyError: if a document in *data* is absent from the CoNLL file.
        IndexError: if a mention in *data* cannot be matched against the
            remaining CoNLL mentions of its document.
    """
    conll = _load_conll_documents(path)

    # merge with data
    rmpunc = re.compile(r"[\W]+")
    for doc_name, content in data.items():
        conll_doc = conll[doc_name.split()[0]]
        if not content:
            # No mention to carry the document reference or to align.
            continue
        content[0]["conll_doc"] = conll_doc

        sentences = conll_doc["sentences"]
        # One shared iterator: mentions are matched in file order and a CoNLL
        # mention, once consumed or skipped, is never revisited.
        remaining = iter(conll_doc["mentions"])

        for m in content:
            mention = m["mention"]
            # Compare case-insensitively with all non-word characters removed.
            wanted = rmpunc.sub("", mention.lower())

            for cur_conll_m in remaining:
                cur_conll_mention = " ".join(
                    sentences[cur_conll_m["sent_id"]][
                        cur_conll_m["start"] : cur_conll_m["end"]
                    ]
                )
                if rmpunc.sub("", cur_conll_mention.lower()) == wanted:
                    m["conll_m"] = cur_conll_m
                    break
            else:
                raise IndexError(
                    "no CoNLL mention matching {!r} in document {!r}".format(
                        mention, doc_name
                    )
                )

    return data