in datasets.py
def compute_features(self, max_input_length: int, max_output_length: int, multitask: bool = False):
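"""Convert the dataset's examples into model-ready InputFeatures: format each
example as input/output text, truncate to the token budgets, tokenize with
fixed-length padding, and pair input token ids and attention masks with label ids."""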
input_sentences = [self.input_format.format_input(example, multitask=multitask) for example in self.examples]
output_sentences = [self.output_format.format_output(example) for example in self.examples]
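# Keep only the first max_input_length / max_output_length tokens of each sentence.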
input_sentences = self.truncate_first_n_tokens(examples=input_sentences,
max_seq_length=max_input_length)
output_sentences = self.truncate_first_n_tokens(examples=output_sentences,
max_seq_length=max_output_length)
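# Encode both sides as PyTorch tensors, padded (and, if needed, truncated) to a fixed length.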
input_tok = self.tokenizer.batch_encode_plus(
input_sentences,
max_length=max_input_length,
return_tensors='pt',
padding='max_length',
truncation=True,
)
output_tok = self.tokenizer.batch_encode_plus(
output_sentences,
max_length=max_output_length,
return_tensors='pt',
padding='max_length',
truncation=True,
)
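# Warn if any formatted sentence exceeds the configured maximum sequence length.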
self._warn_max_sequence_length(max_input_length, input_sentences, "input")
self._warn_max_sequence_length(max_output_length, output_sentences, "output")
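# Sanity check: every input sequence must have a matching output sequence.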
assert input_tok.input_ids.size(0) == output_tok.input_ids.size(0), (
    f'Size does not match: len(input_tok.input_ids)={len(input_tok.input_ids)}, '
    f'len(output_tok.input_ids)={len(output_tok.input_ids)}'
)
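# Pack the aligned token ids, attention masks, and label ids into InputFeatures.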
features = []
for sentence_input_ids, att_mask, label_input_ids in zip(
        input_tok.input_ids, input_tok.attention_mask, output_tok.input_ids):
features.append(InputFeatures(
input_ids=sentence_input_ids.tolist(),
attention_mask=att_mask.tolist(),
label_ids=label_input_ids.tolist()
))
return features
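# Minimal usage sketch (the dataset class, length limits, and tensor shapes below
# are assumptions for illustration, not part of this file):
#
#   dataset = SomeDataset(...)  # hypothetical subclass that sets examples, formats, and tokenizer
#   features = dataset.compute_features(max_input_length=256, max_output_length=128)
#   input_ids = torch.tensor([f.input_ids for f in features])            # (num_examples, 256)
#   attention_mask = torch.tensor([f.attention_mask for f in features])  # (num_examples, 256)
#   labels = torch.tensor([f.label_ids for f in features])               # (num_examples, 128)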