in step5_data_parallel_naive/dataloader.py
# Module-level imports required by this method:
#   from functools import partial
#   from datasets import Features, Sequence, Value
def tokenize_dataset(self, dataset, text_column_name, sequence_length, num_proc):
    """Tokenize the dataset and group texts in chunks of sequence_length + 1."""
    # Bind the tokenizer and chunk size so dataset.map() only has to pass in the text batches.
    tokenizer_func = partial(
        self.tokenizer_group_text,
        tokenizer=self.tokenizer,
        sequence_length=sequence_length,
    )

    tokenized_dataset = dataset.map(
        tokenizer_func,
        input_columns=text_column_name,
        remove_columns=dataset.column_names,  # drop the raw text columns, keep only token ids
        features=Features({
            # Each example holds sequence_length + 1 tokens: inputs plus the shifted target.
            "input_ids": Sequence(feature=Value(dtype="int64"), length=sequence_length + 1)
        }),
        batched=True,
        num_proc=num_proc,
        load_from_cache_file=True,  # Preprocess the dataset only once and cache it
        desc=f"Grouping texts in chunks of {sequence_length + 1}",
    )
    return tokenized_dataset
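
# For reference, self.tokenizer_group_text (defined elsewhere in this class) is the callback
# that does the actual tokenizing and grouping. The following is a minimal sketch of what such
# a callback could look like, assuming a Hugging Face tokenizer; the parameter names and the
# handling of the trailing remainder are illustrative, not the repository's implementation.
@staticmethod
def tokenizer_group_text(texts, tokenizer, sequence_length):
    """Tokenize a batch of texts, concatenate all ids, and split into chunks of sequence_length + 1."""
    # Tokenize the whole batch; attention masks are not needed for grouping.
    token_ids = tokenizer(texts, return_attention_mask=False)["input_ids"]
    # Concatenate every document's tokens into one long stream.
    concatenated = [tok for ids in token_ids for tok in ids]
    chunk = sequence_length + 1
    usable = (len(concatenated) // chunk) * chunk  # drop the trailing partial chunk
    return {
        "input_ids": [concatenated[i : i + chunk] for i in range(0, usable, chunk)]
    }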