in src/tokenizer.py [0:0]
def _generate_model(self):
    """
    Creates a dataset of processed text files and trains a SentencePiece
    model on it, returning the trained processor.
    """
    # Write the processed text files into a dedicated cache directory.
    dir_path = "cache/sp_tokenizer_data"
    os.makedirs(dir_path, exist_ok=True)
    self._process_data_files(dir_path)
    data_files = list_files(dir_path)

    # Train a SentencePiece model on the processed files, pinning the
    # special-token ids so they match BASE_TOKENS.
    model_cache_prefix = f'cache/sp_tokenizer_{self.vocab_limit}'
    spm.SentencePieceTrainer.train(input=data_files,
                                   model_prefix=model_cache_prefix,
                                   vocab_size=self.vocab_limit,
                                   bos_id=BASE_TOKENS["<s>"],
                                   eos_id=BASE_TOKENS["</s>"],
                                   unk_id=BASE_TOKENS["<unk>"],
                                   pad_id=BASE_TOKENS["<pad>"])

    # Load the trained model from the .model file written by the trainer.
    model = spm.SentencePieceProcessor(model_file=f'{model_cache_prefix}.model')
    return model
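
For context, a minimal sketch of how the processor returned by _generate_model might be used downstream. Only the SentencePieceProcessor calls (encode/decode) come from the sentencepiece API; the model filename here assumes a vocab_limit of 8000 and is purely illustrative.

# Hypothetical usage; the model path is an assumed example, not a value
# taken from src/tokenizer.py.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="cache/sp_tokenizer_8000.model")

ids = sp.encode("an example sentence", out_type=int)      # token ids
pieces = sp.encode("an example sentence", out_type=str)   # subword pieces
text = sp.decode(ids)                                      # round-trip back to text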