def get_chunks()

in kilt/datasets/fact_verification.py [0:0]


    def get_chunks(self, num_chunks):
        """Collect all FEVER evidence sentences and split them into chunks.

        Reads the claims file (one JSON object per line), gathers the
        (page_id, sent_id) pairs referenced by verifiable claims, resolves
        each pair to its sentence text by scanning the FEVER wiki dump, and
        returns the resulting records split into ``num_chunks`` chunks via
        ``utils.chunk_it``.

        Args:
            num_chunks: number of chunks to split the collected data into.

        Returns:
            A list of ``num_chunks`` lists of dicts, each dict having keys
            ``page_id``, ``sent_id`` and ``text``. ``text`` stays ``None``
            for evidence sentences not found in the wiki dump.
        """
        # Phase 1: read claims and build a map
        # page_id -> {sent_id -> None} of evidence sentences to look up.
        page_to_evidence_sents = {}

        with open(self.claims_input_file, "r") as infile:
            for line in infile:
                claim = json.loads(line)

                # Unverifiable claims carry no usable evidence.
                if "verifiable" in claim and claim["verifiable"] == "NOT VERIFIABLE":
                    continue

                evidence_sets = claim["evidence"]
                for evidence_set in evidence_sets:

                    for evidence in evidence_set:
                        # evidence layout: [annotation_id, evidence_id,
                        # page_title, sentence_index]
                        if evidence[2]:
                            # Normalize the page title so it matches the ids
                            # used in the wiki dump.
                            page_id = unicodedata.normalize("NFKD", evidence[2])
                        else:
                            # Entries without a page title can be filtered
                            # out/ignored. They're an artefact of merging some
                            # of the duplicates where annotators disagreed
                            # over the label. `break` intentionally drops the
                            # rest of this evidence set.
                            break

                        sent_id = int(evidence[3])

                        if page_id not in page_to_evidence_sents:
                            page_to_evidence_sents[page_id] = {}

                        page_to_evidence_sents[page_id][sent_id] = None

        # Phase 2: scan the FEVER wiki dump (files wiki-001.jsonl ..
        # wiki-109.jsonl) and fill in the text of each needed sentence.
        for idx in range(1, 110):
            filename = self.evidence_directory_path + f"/wiki-{idx:03}.jsonl"
            # BUGFIX: the print statement had lost its placeholder; report
            # the file currently being processed.
            print(f"processing {filename}")
            with open(filename, "r") as fin:
                for line in fin:
                    wiki_page = json.loads(line.strip())
                    page_id = wiki_page["id"]
                    # Skip pages no claim refers to.
                    if page_id not in page_to_evidence_sents:
                        continue
                    # "lines" is a single string: one sentence per line,
                    # tab-separated fields "<sent_id>\t<text>[\t...]".
                    lines = wiki_page["lines"].split("\n")
                    sentences = []
                    for l in lines:
                        line_fields = l.split("\t")
                        # skip empty sentences
                        if len(line_fields) < 2 or line_fields[1] == "":
                            continue
                        # skip sentences where first element is not number
                        if not line_fields[0].isdigit():
                            continue

                        sent_text = line_fields[1]

                        # NOTE(review): this branch is unreachable — an empty
                        # id fails the isdigit() check above and is skipped.
                        # Kept to preserve the original intent of appending
                        # continuation text (from stray newlines) to the
                        # previous sentence; confirm before removing.
                        if line_fields[0] == "":
                            sentences[-1]["text"] += " " + sent_text
                        else:
                            sentences.append(
                                {
                                    "id": line_fields[0],
                                    "text": sent_text,
                                }
                            )

                    # Record only the sentences that some claim needs.
                    for sentence in sentences:
                        sent_id = int(sentence["id"])
                        sent_text = sentence["text"]
                        if sent_id in page_to_evidence_sents[page_id]:
                            page_to_evidence_sents[page_id][sent_id] = sent_text

        # Phase 3: flatten the map into a list of records and chunk it.
        data = []
        for page_id in page_to_evidence_sents:
            for sent_id in page_to_evidence_sents[page_id]:
                sent_text = page_to_evidence_sents[page_id][sent_id]
                data.append(
                    {
                        "page_id": page_id,
                        "sent_id": sent_id,
                        "text": sent_text,
                    }
                )

        n = len(data)
        print("{} examples in the dataset".format(n))
        return utils.chunk_it(data, num_chunks)