in data/datasets.py
def _prepare_inputs_and_loss_mask(self, messages):
    conv_ids = self.tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
    )
    mask = [0] * len(conv_ids["input_ids"])

    # Locate each assistant turn and flip its span of the loss mask to 1.
    cursor = 0
    for msg in messages:
        segment_ids = self.tokenizer.apply_chat_template(
            [msg], tokenize=True, add_special_tokens=False
        )
        seg_len = len(segment_ids)
        if msg["role"] == "assistant":
            # Skip the role-header tokens at the start of the segment so the
            # loss covers only the assistant's response tokens.
            start = cursor + self.prefix_len
            end = cursor + seg_len
            mask[start:end] = [1] * (end - start)  # compute loss on these tokens
        cursor += seg_len

    return (
        torch.tensor(conv_ids["input_ids"]),
        torch.tensor(mask, dtype=torch.bool),
        torch.tensor(conv_ids["attention_mask"]),
    )
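
Downstream, the boolean loss mask is typically turned into a labels tensor so that only assistant tokens contribute to the cross-entropy loss. The sketch below shows one way to do that, assuming the conventional -100 ignore index used by torch.nn.CrossEntropyLoss and Hugging Face causal-LM models; the build_labels helper is illustrative and not part of data/datasets.py.

import torch

def build_labels(input_ids: torch.Tensor, loss_mask: torch.Tensor) -> torch.Tensor:
    # Copy the token ids and blank out every position that is not an
    # assistant token; -100 is skipped by the loss.
    labels = input_ids.clone()
    labels[~loss_mask] = -100
    return labels

# Illustrative usage inside a dataset's __getitem__:
# input_ids, loss_mask, attention_mask = self._prepare_inputs_and_loss_mask(messages)
# labels = build_labels(input_ids, loss_mask)
# A causal-LM forward pass would then receive input_ids, attention_mask, and labels.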