in step8_pipeline_parallel_1f1b/dataloader.py [0:0]
def tokenizer_group_text(self, examples, tokenizer, sequence_length):
    """Tokenize a batch of texts and cut the joined tokens into fixed windows.

    All texts in ``examples`` are encoded with ``tokenizer.batch_encode_plus``
    and their ``input_ids`` are concatenated into a single token stream. The
    stream is then sliced into windows of ``sequence_length + 1`` tokens taken
    every ``sequence_length`` tokens, so each window starts on the last token
    of the previous one.

    Args:
        examples: Batch of texts handed to ``tokenizer.batch_encode_plus``.
        tokenizer: Object exposing ``batch_encode_plus``; its result must
            provide the per-text token ids under the ``'input_ids'`` key.
        sequence_length: Stride between windows; every window holds
            ``sequence_length + 1`` tokens.

    Returns:
        A dict ``{'input_ids': [...]}`` whose list contains one slice of the
        concatenated stream per window. Trailing tokens that cannot fill a
        complete window are dropped, and the list is empty when the stream
        has fewer than ``sequence_length + 1`` tokens.
    """
    encoded = tokenizer.batch_encode_plus(
        examples,
        return_attention_mask=False,
        return_token_type_ids=False,
        return_tensors='np',
    )

    # Document boundaries are ignored from here on: one flat token stream.
    token_stream = np.concatenate(encoded['input_ids'])
    window = sequence_length + 1

    usable = len(token_stream)
    if usable >= window:
        # Largest length of the form k * sequence_length + 1 that still fits,
        # i.e. exactly k windows that share one token with their neighbour.
        usable = ((usable - 1) // sequence_length) * sequence_length + 1

    # When the stream is shorter than one window the stop value is <= 0 and
    # the range is empty, so no partial window is ever emitted.
    starts = range(0, usable - sequence_length, sequence_length)
    return {'input_ids': [token_stream[s : s + window] for s in starts]}