in data_measurements/tokenize.py [0:0]
def do_tokenization(self):
    """
    Tokenizes a Hugging Face dataset in the self.feature field.

    Builds a tokenizer from ``self.cvec`` (a CountVectorizer-like object)
    and maps it over ``self.text_dset`` in batches, lowercasing the text
    first when ``self.lowercase`` is set.

    :return: Hugging Face Dataset with tokenized text in self.tok_feature.
    """
    sent_tokenizer = self.cvec.build_tokenizer()
    # Resolve the casing choice once, outside the per-batch closure, instead
    # of duplicating the whole tokenizing comprehension in two branches.
    lowercase = self.lowercase

    def tokenize_batch(examples):
        texts = examples[self.feature]
        if lowercase:
            texts = (text.lower() for text in texts)
        # tuple() wrapping preserved from the original — presumably so the
        # token sequences are hashable/immutable downstream; confirm.
        return {
            self.tok_feature: [tuple(sent_tokenizer(text)) for text in texts]
        }

    tokenized_dset = self.text_dset.map(
        tokenize_batch,
        batched=True
    )
    logs.info("Tokenized the dataset.")
    return tokenized_dset