def read_dataset()

in scripts/adapet/ADAPET/src/data/WSCReader.py
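
This snippet assumes the module-level imports in WSCReader.py (json, warnings, and numpy as np) and that the enclosing reader class provides self._get_file() and self.tokenizer (a HuggingFace-style tokenizer).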


    def read_dataset(self, split=None, is_eval=False):
        '''
        Read the original dataset

        :param split: partition of the dataset to read (e.g. 'train')
        :param is_eval: not used here; kept for a uniform reader interface
        :return: numpy array of {"input": ..., "output": ...} example dicts
        '''

        file = self._get_file(split)
        data = []

        with open(file, 'r') as f_in:
            for line in f_in.readlines():
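                # each line is one JSON-encoded SuperGLUE WSC example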
                json_string = json.loads(line)

                text = json_string["text"]
                # span2 is the pronoun mention, span1 the candidate noun phrase
                pronoun, pronoun_idx = json_string["target"]["span2_text"], \
                                       json_string["target"]["span2_index"]
                noun, noun_idx = json_string["target"]["span1_text"], \
                                 json_string["target"]["span1_index"]
                idx = json_string["idx"]

                # unlabeled (test) examples get -1 as a sentinel label
                lbl = json_string.get("label", -1)

                words_text = text.split()
                words_lower = text.lower().split()
                words_noun = noun.lower().split()
                noun_len = len(words_noun)

                # the dataset's span1_index is occasionally off by one word;
                # try shifting it by +/-1 before giving up
                if words_lower[noun_idx:noun_idx + noun_len] != words_noun:
                    for offset in [-1, +1]:
                        if words_lower[noun_idx + offset:noun_idx + noun_len + offset] == words_noun:
                            noun_idx += offset

                if words_lower[noun_idx:noun_idx + noun_len] != words_noun:
                    warnings.warn(f"Got '{words_lower[noun_idx:noun_idx + noun_len]}' but expected "
                                  f"'{words_noun}' at index {noun_idx} for '{words_text}'")

                # same off-by-one fix for the pronoun; if that still fails but the
                # word merely starts with the pronoun (e.g. "it's"), split it in two
                if words_text[pronoun_idx] != pronoun:
                    for offset in [-1, +1]:
                        if words_text[pronoun_idx + offset] == pronoun:
                            pronoun_idx += offset

                    if words_text[pronoun_idx] != pronoun and words_text[pronoun_idx].startswith(pronoun):
                        words_text = words_text[:pronoun_idx] \
                                     + [words_text[pronoun_idx][:len(pronoun)], words_text[pronoun_idx][len(pronoun):]] \
                                     + words_text[pronoun_idx + 1:]

                assert words_text[pronoun_idx] == pronoun, \
                    f"Got '{words_text[pronoun_idx]}' but expected '{pronoun}' at index {pronoun_idx} for '{words_text}'"

                orig_text = ' '.join(words_text)
                # mark the pronoun with asterisks in the text shown to the model
                words_text[pronoun_idx] = '*' + words_text[pronoun_idx] + '*'
                text = ' '.join(words_text)

                # subword lengths of the noun phrase and the pronoun; tokenize the raw
                # strings directly (the original indexed orig_text by word position,
                # which yields a single character, and took only the noun's first word)
                len_noun = max(len(self.tokenizer(noun, add_special_tokens=False)["input_ids"]), 1)
                len_pronoun = max(len(self.tokenizer(pronoun, add_special_tokens=False)["input_ids"]), 1)

                dict_input = {"text": text, "pronoun": pronoun, "orig_text": orig_text,
                              "idx": idx, "noun": noun, "pronoun_idx_first": pronoun_idx < noun_idx,
                              "len_noun": len_noun, "len_pronoun": len_pronoun}

                dict_output = {"lbl": lbl}
                dict_input_output = {"input": dict_input, "output": dict_output}

                # for training, WSC keeps only the positive (label == true) examples
                if split == 'train' and lbl is not True:
                    continue
                data.append(dict_input_output)

        # a numpy array of example dicts, so downstream batching can fancy-index
        data = np.asarray(data)
        return data
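
For reference, a minimal sketch of the jsonl record shape this reader expects and the entry it produces. The sentence and field values are illustrative only, and the two subword lengths depend on the tokenizer in use.

    record = {
        "text": "The city councilmen refused the demonstrators a permit because they feared violence.",
        "target": {"span1_index": 0, "span1_text": "The city councilmen",
                   "span2_index": 9, "span2_text": "they"},
        "idx": 0,
        "label": True,
    }
    # read_dataset() turns each such line into:
    # {"input": {"text": "... a permit because *they* feared violence.",
    #            "orig_text": "... a permit because they feared violence.",
    #            "pronoun": "they", "noun": "The city councilmen", "idx": 0,
    #            "pronoun_idx_first": False,  # the noun mention precedes the pronoun
    #            "len_noun": ..., "len_pronoun": ...},  # tokenizer-dependent
    #  "output": {"lbl": True}}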