def process_chunk()

in kilt/datasets/entity_linking.py [0:0]


    def process_chunk(self, lines, ks, chunk_id=-1):
        """Turn a chunk of tab-separated entity-linking lines into KILT records.

        Lines containing ``-DOCSTART-`` mark the beginning of a new document.
        Every other line is split on tabs: column 0 is the token and, when the
        row carries a full annotation (at least six columns), column 1 is the
        B/I tag, column 2 the mention, column 4 the Wikipedia URL and column 5
        the Wikipedia id.  Each ``B`` row opens a new question whose
        ``left_context`` is the list of document tokens seen before it and
        whose ``right_context`` collects the tokens that follow the mention.

        Args:
            lines: iterable of raw text lines belonging to this chunk.
            ks: unused here.
                NOTE(review): ``self.ks`` is what gets forwarded to
                ``convert_to_KILT_format``; confirm whether ``ks`` was meant.
            chunk_id: unused here.

        Returns:
            A ``(kilt_records, metadata)`` tuple; ``metadata`` is always an
            empty list.

        Exits the process with status -1 (after printing a message) when an
        annotated row has a tag other than ``B`` or ``I``.
        """
        kilt_records = []

        def flush_document(questions):
            # Convert the questions gathered for one document and store them.
            kilt_records.extend(
                convert_to_KILT_format(
                    questions,
                    self.ks,
                    self.id_filter_positive,
                    self.id_filter_negative,
                )
            )

        # tokens seen so far in the current document
        left_context = []

        # questions (one per "B" row) of the current document
        document_questions = []

        # True while the most recent question's mention is still being read
        open_entity = False

        # index of the next question inside the current document
        question_i = 0

        for line in tqdm(lines):

            if "-DOCSTART-" in line:
                # A new document starts: text after the last "(" minus the two
                # trailing characters (presumably ")" and the newline).
                doc_id = line.split("(")[-1][:-2]

                # Close the previous document and reset the per-document state.
                open_entity = False
                flush_document(document_questions)
                left_context = []
                document_questions = []
                question_i = 0
                continue

            split = line.split("\t")
            token = split[0].strip()

            # The Wikipedia id is read from index 5, so six columns are needed;
            # the former ``>= 5`` guard raised IndexError on five-column rows.
            if len(split) >= 6:
                B_I = split[1]

                if B_I == "B":
                    document_questions.append(
                        {
                            "id": "{}:{}".format(doc_id, question_i),
                            "mention": split[2],
                            "Wikipedia_URL": split[4],
                            "Wikipedia_ID": split[5],
                            "left_context": left_context.copy(),
                            "right_context": [],
                        }
                    )
                    open_entity = True
                    question_i += 1

                elif B_I != "I":
                    # "I" rows continue the open mention; anything else is
                    # malformed input.
                    print("Invalid B_I {}".format(B_I))
                    sys.exit(-1)
            else:
                # Rows without a full annotation end any open mention.
                open_entity = False

            left_context.append(token)

            # Every earlier question sees this token to its right.
            for q in document_questions[:-1]:
                q["right_context"].append(token)

            # The latest question only does once its own mention is over.
            if document_questions and not open_entity:
                document_questions[-1]["right_context"].append(token)

        # Flush whatever is left of the last document in the chunk.
        flush_document(document_questions)

        return kilt_records, []  # no metadata