def process_chunk()

in kilt/datasets/hotpotqa.py [0:0]


    def process_chunk(self, chunk, ks, chunk_id=-1):
        """Convert a chunk of HotpotQA datapoints into KILT records.

        For every supporting fact ``(title, sent_id)`` of a datapoint the
        sentence text is looked up in ``self.hotpotqa_ks``. Depending on
        ``self.get_only_original_evidence`` the provenance is then either the
        original evidence (text, title, sentence id) or a span located in the
        knowledge-source page(s) returned by ``ks.get_pages_by_title``.

        Args:
            chunk: sized sequence of datapoints; each one is indexed with the
                keys ``_id``, ``question``, ``level``, ``type``, ``answer``
                and ``supporting_facts`` (pairs of ``title, sent_id``).
            ks: knowledge source exposing ``get_pages_by_title(title)``; the
                returned pages are indexed with ``wikipedia_id`` and
                ``wikipedia_title``.
            chunk_id: identifier shown in the progress line only.

        Returns:
            A tuple ``(kilt_data, metadata)`` where ``kilt_data`` is the list
            of generated records and ``metadata`` is
            ``[missing_pages, exact_match, fuzzy_match]`` (float counters).
            Datapoints for which a title has no page in ``ks`` are counted as
            missing and are not added to ``kilt_data``.

        Raises:
            KeyError: if a supporting-fact title is absent from
                ``self.hotpotqa_ks`` (only ``IndexError`` on the sentence id
                is handled).
        """
        missing_pages = 0.0
        exact_match = 0.0
        fuzzy_match = 0.0
        n = len(chunk)
        kilt_data = []
        for idx, datapoint in enumerate(chunk):
            # Single-line progress report, overwritten in place via "\r".
            print(
                "t: {}, p: {:.2f} %, mp: {:.1f}, exact: {:.1f}, fuzzy: {:.1f}".format(
                    chunk_id,
                    round(idx * 100 / n, 2),
                    missing_pages,
                    exact_match,
                    fuzzy_match,
                ),
                end="\r",
            )
            sys.stdout.flush()

            kilt_record = {
                # original data point id if available otherwise unique id
                "id": datapoint["_id"],
                # question / claim / sentence
                "input": datapoint["question"],
                # dataset/task specific
                "meta": {"level": datapoint["level"], "type": datapoint["type"]},
            }
            kilt_record_provenance = []

            local_missing_page = False
            # Stays True when only original evidence is requested, so such
            # datapoints are counted as exact matches.
            local_exact_match = True
            for evidence in datapoint["supporting_facts"]:
                title = evidence[0]
                sent_id = evidence[1]
                text = ""
                try:
                    text = self.hotpotqa_ks[title]["text"][sent_id]
                except IndexError as e:
                    # Best effort: report the bad sentence id and carry on
                    # with an empty text.
                    print(
                        "\nIndexError: {}\ntitle:{}\nsent_id:{}\n".format(
                            e, title, sent_id
                        )
                    )

                if self.get_only_original_evidence:
                    kilt_record_provenance.append(
                        {"text": text, "title": title, "sent_id": sent_id}
                    )
                    continue

                pages = ks.get_pages_by_title(title)
                if len(pages) == 0:
                    # The whole datapoint is dropped when any title is missing.
                    local_missing_page = True
                    break

                bleu = -1
                paragraph_id = -1
                start_character = -1
                end_character = -1
                # Page the reported span belongs to. Falls back to the last
                # page when no match is attempted or none beats the initial
                # score, which is what was reported before in that case.
                best_page = pages[-1]
                if text:
                    # it is unlikely, but there could be multiple pages for a
                    # title (e.g., disambiguation): keep the best-scoring one
                    for page in pages:
                        (
                            local_paragraph_id,
                            local_start_character,
                            local_end_character,
                            local_bleu,
                        ) = utils.match_answer(
                            text, page, nlp=self.nlp, debug=False
                        )

                        if local_bleu > bleu:
                            best_page = page
                            paragraph_id = local_paragraph_id
                            start_character = local_start_character
                            end_character = local_end_character
                            bleu = local_bleu

                if bleu != 1.0:
                    local_exact_match = False

                kilt_record_provenance.append(
                    # list of relevant WikipediaPages / Spans as provenance for the answer from the ks
                    {
                        # *mandatory* - ID Wikipedia Page
                        "wikipedia_id": best_page["wikipedia_id"],
                        # *mandatory* - Title Wikipedia Page
                        "title": best_page["wikipedia_title"],
                        "start_paragraph_id": paragraph_id,  # start paragraph id with relevant info
                        "start_character": start_character,
                        "end_paragraph_id": paragraph_id,  # end paragraph id
                        "end_character": end_character,
                        "bleu_score": bleu,  # 1.0 when gold data is exactly matched, lower for fuzzy matches
                    }
                )

            if local_missing_page:
                missing_pages += 1
                continue
            if local_exact_match:
                exact_match += 1
            else:
                fuzzy_match += 1

            kilt_record["output"] = [
                {"answer": datapoint["answer"], "provenance": kilt_record_provenance}
            ]
            kilt_data.append(kilt_record)

            if self.debug:
                # Interactive inspection: pauses on stdin after each dump.
                pp = pprint.PrettyPrinter(indent=4)
                print("original datapoint:")
                pp.pprint(datapoint)
                input("...")
                print("kilt record:")
                pp.pprint(kilt_record)
                input("...")

        metadata = [missing_pages, exact_match, fuzzy_match]
        return kilt_data, metadata