in training/flax/run_pseudo_labelling_pt.py [0:0]
def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
# split inputs and labels since they have to be of different lengths and need
# different padding methods
model_input_name = self.processor.model_input_names[0]
# dataloader returns a list of features which we convert to a dict
input_features = {model_input_name: [feature[model_input_name] for feature in features]}
label_features = {"input_ids": [feature["labels"] for feature in features]}
file_ids = {"input_ids": [feature["file_id"] for feature in features]}
# reformat list to dict and set to pytorch format
batch = self.processor.feature_extractor.pad(
input_features,
padding=self.input_padding,
return_tensors="pt",
)
labels_batch = self.processor.tokenizer.pad(
label_features,
max_length=self.max_target_length,
padding=self.target_padding,
return_tensors="pt",
)
file_ids_batch = self.processor.tokenizer.pad(
file_ids,
max_length=self.max_target_length,
padding=self.target_padding,
return_tensors="pt",
)
# replace padding with -100 to ignore correctly when computing the loss
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
# if bos token is appended in previous tokenization step,
# cut bos token here as it's append later anyways
if set(torch.unique(labels[:, 0])).issubset({self.decoder_start_token_id, self.decoder_prev_token_id}):
labels = labels[:, 1:]
# replace initial prompt tokens with -100 to ignore correctly when computing the loss
bos_index = torch.argmax((labels == self.decoder_start_token_id).long(), dim=1)
prompt_mask = torch.arange(labels.shape[1]) < bos_index[:, None]
labels = torch.where(prompt_mask, -100, labels)
batch["labels"] = labels
batch["file_ids"] = file_ids_batch["input_ids"]
return batch