in access/preprocessors.py [0:0]
def learn_sentencepiece(self):
    """Train a SentencePiece model on ``self.input_filepaths`` unless one exists.

    Returns immediately when ``self.sentencepiece_model_path`` already
    exists. Otherwise the model's parent directory is created and
    ``spm.SentencePieceTrainer.Train`` is called with a space-separated flag
    string made of the comma-joined input files, the model prefix (the model
    path without its final suffix) and ``self.vocab_size``.

    When the inputs hold more than one million lines in total, the trainer
    is additionally asked to shuffle the input and cap it at one million
    sentences.
    """
    model_path = self.sentencepiece_model_path
    if model_path.exists():
        # A model is already present on disk; nothing to train.
        return

    model_dir = model_path.parent
    model_dir.mkdir(parents=True, exist_ok=True)
    # The trainer takes a prefix rather than a full filename
    # (presumably it appends its own extensions -- confirm against spm docs).
    model_prefix = model_dir / model_path.stem

    input_arg = ','.join(str(filepath) for filepath in self.input_filepaths)
    trainer_args = [
        f'--input={input_arg}',
        f'--model_prefix={model_prefix}',
        f'--vocab_size={self.vocab_size}',
    ]

    sentence_cap = 10**6
    total_lines = sum(count_lines(filepath) for filepath in self.input_filepaths)
    if total_lines > sentence_cap:
        # Too many lines: have the trainer work on a shuffled subset instead.
        trainer_args.append(f'--input_sentence_size={sentence_cap} --shuffle_input_sentence=true')

    spm.SentencePieceTrainer.Train(' '.join(trainer_args))