in scripts/adapet/ADAPET/src/data/WSCReader.py [0:0]
def read_dataset(self, split=None, is_eval=False):
    '''
    Read the original WSC (Winograd Schema Challenge) dataset from a jsonl file.

    :param split: partition of the dataset to read (e.g. 'train'/'val'/'test');
                  passed through to self._get_file to locate the file
    :param is_eval: unused here; kept for interface compatibility with other readers
    :return: numpy object array of {"input": dict_input, "output": dict_output} records
    '''
    file = self._get_file(split)
    data = []
    with open(file, 'r') as f_in:
        for line in f_in:
            json_string = json.loads(line)
            text = json_string["text"]
            pronoun, pronoun_idx = json_string["target"]["span2_text"], \
                                   json_string["target"]["span2_index"]
            noun, noun_idx = json_string["target"]["span1_text"], \
                             json_string["target"]["span1_index"]
            idx = json_string["idx"]
            # Unlabeled examples (e.g. the test split) are marked with -1.
            lbl = json_string["label"] if "label" in json_string else -1

            words_text = text.split()
            words_lower = text.lower().split()
            words_noun = noun.lower().split()
            noun_len = len(words_noun)

            # The provided span indices are occasionally off by one word; try
            # to realign the noun span before giving up with a warning.
            if words_lower[noun_idx:noun_idx + noun_len] != words_noun:
                for offset in [-1, +1]:
                    if words_lower[noun_idx + offset:noun_idx + noun_len + offset] == words_noun:
                        noun_idx += offset
                if words_lower[noun_idx:noun_idx + noun_len] != words_noun:
                    warnings.warn(f"Got '{words_lower[noun_idx:noun_idx + noun_len]}' but expected "
                                  f"'{words_noun}' at index {noun_idx} for '{words_text}'")

            # Same off-by-one repair for the pronoun; additionally, if the
            # pronoun is fused with trailing characters (e.g. "him."), split
            # that token in two so the pronoun stands alone.
            if words_text[pronoun_idx] != pronoun:
                for offset in [-1, +1]:
                    if words_text[pronoun_idx + offset] == pronoun:
                        pronoun_idx += offset
                if words_text[pronoun_idx] != pronoun and words_text[pronoun_idx].startswith(pronoun):
                    words_text = words_text[:pronoun_idx] \
                                 + [words_text[pronoun_idx][:len(pronoun)], words_text[pronoun_idx][len(pronoun):]] \
                                 + words_text[pronoun_idx + 1:]
                    # Bug fix: splitting inserted a token, so every word index
                    # after the pronoun shifts right by one; the noun index was
                    # previously left stale here.
                    if noun_idx > pronoun_idx:
                        noun_idx += 1
            assert words_text[pronoun_idx] == pronoun, \
                f"Got '{words_text[pronoun_idx]}' but expected '{pronoun}' at index {pronoun_idx} for '{words_text}'"

            orig_text = ' '.join(words_text)
            # Compute token lengths before the pronoun word is decorated with
            # '*' markers below (len_noun >= 1 / len_pronoun >= 1 floor keeps
            # downstream code safe if the tokenizer returns nothing).
            len_noun = max(len(self.tokenizer(words_text[noun_idx], add_special_tokens=False)["input_ids"]), 1)
            # Bug fix: the original tokenized orig_text[pronoun_idx] — a single
            # *character* of the joined string — instead of the pronoun word.
            len_pronoun = max(len(self.tokenizer(words_text[pronoun_idx], add_special_tokens=False)["input_ids"]), 1)

            # Mark the pronoun in the text, e.g. "... because *it* is ...".
            words_text[pronoun_idx] = '*' + words_text[pronoun_idx] + '*'
            text = ' '.join(words_text)

            dict_input = {"text": text, "pronoun": pronoun, "orig_text": orig_text,
                          "idx": idx, "noun": noun, "pronoun_idx_first": pronoun_idx < noun_idx,
                          "len_noun": len_noun, "len_pronoun": len_pronoun}
            dict_output = {"lbl": lbl}
            # Training keeps only examples labeled exactly True — presumably
            # following the PET recipe of training WSC generatively on correct
            # candidates only. (Intentionally `!=` so an int 1 also passes.)
            if split == 'train' and lbl != True:
                continue
            data.append({"input": dict_input, "output": dict_output})

    data = np.asarray(data)
    return data