# Function: extract_questions()
# Source: scripts/create_BLINK_benchmark_data.py

def extract_questions(filename):
    """Parse an AIDA/CoNLL-style entity-annotated file into question dicts.

    The input file contains one token per line.  A line containing
    "-DOCSTART-" starts a new document (the document id is taken from the
    text after the last opening parenthesis).  Tab-separated token lines
    with enough fields carry an entity annotation:
    token, B/I tag, mention, YAGO2 entity, Wikipedia URL, Wikipedia ID, ...

    For every "B" tag a question dict is opened; each subsequent token is
    appended to the running "input" of every question opened so far (the
    mention span is wrapped in BEGIN_ENT_TOKEN / END_ENT_TOKEN) and to the
    "right_context" of questions whose mention span has already closed.

    Relies on module-level names defined elsewhere in this file:
    tqdm, BEGIN_ENT_TOKEN, END_ENT_TOKEN, url2id_cache,
    _get_pageid_from_api.

    Args:
        filename: path to the annotated input file.

    Returns:
        A list of dicts with keys: "id", "input", "mention",
        "Wikipedia_title", "Wikipedia_URL", "Wikipedia_ID",
        "left_context", "right_context".

    Raises:
        SystemExit: if an annotation tag other than "B" or "I" is found.
    """

    # all datapoints collected across documents
    global_questions = []

    # tokens seen so far in the current document
    left_context = []

    # questions being built for the current document
    document_questions = []

    # True while we are inside an entity mention span
    open_entity = False

    # running question id within the current document
    question_i = 0

    with open(filename) as fin:
        lines = fin.readlines()

        for line in tqdm(lines):

            if "-DOCSTART-" in line:
                # A new document starts: grab its id from "... (<doc_id>)".
                doc_id = line.split("(")[-1][:-2]

                # Close any entity still open at the end of the previous
                # document.
                if open_entity:
                    document_questions[-1]["input"].append(END_ENT_TOKEN)
                    open_entity = False

                # Flush the finished document's questions and reset the
                # per-document state.
                global_questions.extend(document_questions)
                left_context = []
                document_questions = []
                question_i = 0

            else:
                split = line.split("\t")
                token = split[0].strip()

                # NOTE(review): the guard checks >= 5 fields but split[5]
                # is read below, so annotated lines are expected to have
                # at least 6 columns — confirm against the data format.
                if len(split) >= 5:
                    B_I = split[1]
                    mention = split[2]
                    #  YAGO2_entity = split[3]
                    Wikipedia_URL = split[4]
                    Wikipedia_ID = split[5]
                    # Freee_base_id = split[6]

                    if B_I == "I":
                        # Continuation of the current mention: nothing to
                        # do here; the token is appended below.
                        pass

                    elif B_I == "B":
                        # Beginning of a new mention: open a new question.
                        title = Wikipedia_URL.split("/")[-1].replace("_", " ")

                        # "000" marks a missing page id; resolve it via the
                        # Wikipedia API, memoized in url2id_cache.
                        if Wikipedia_ID == "000":
                            if Wikipedia_URL in url2id_cache:
                                pageid = url2id_cache[Wikipedia_URL]
                            else:
                                pageid = _get_pageid_from_api(title)
                                url2id_cache[Wikipedia_URL] = pageid
                            Wikipedia_ID = pageid

                        q = {
                            "id": "{}:{}".format(doc_id, question_i),
                            "input": left_context.copy() + [BEGIN_ENT_TOKEN],
                            "mention": mention,
                            "Wikipedia_title": title,
                            "Wikipedia_URL": Wikipedia_URL,
                            "Wikipedia_ID": Wikipedia_ID,
                            "left_context": left_context.copy(),
                            "right_context": [],
                        }
                        document_questions.append(q)
                        open_entity = True
                        question_i += 1

                    else:
                        # BUGFIX: was print("Invalid B_I {}", format(B_I)),
                        # which printed the literal "{}" followed by the tag
                        # as a second argument; .format() interpolates it.
                        print("Invalid B_I {}".format(B_I))
                        sys.exit(-1)

                else:
                    # Plain token line: if an entity span is open, this
                    # token ends it.
                    if open_entity:
                        document_questions[-1]["input"].append(END_ENT_TOKEN)
                        open_entity = False

                # The token extends the document context and the input of
                # every question opened so far.
                left_context.append(token)
                for q in document_questions:
                    q["input"].append(token)

                # All but the most recent question have their mention span
                # closed, so this token belongs to their right context.
                for q in document_questions[:-1]:
                    q["right_context"].append(token)

                # The latest question also collects right context once its
                # mention span has been closed.
                if len(document_questions) > 0 and not open_entity:
                    document_questions[-1]["right_context"].append(token)

    # End of file: close a dangling entity and flush the last document.
    if open_entity:
        document_questions[-1]["input"].append(END_ENT_TOKEN)
        open_entity = False

    global_questions.extend(document_questions)

    return global_questions