def learn_sentencepiece()

in access/preprocessors.py [0:0]


    def learn_sentencepiece(self):
        """Train a SentencePiece model, unless the model file already exists.

        Reads ``self.sentencepiece_model_path``, ``self.input_filepaths`` and
        ``self.vocab_size``. Creates the model's parent directory if needed and
        hands a single space-separated flag string to
        ``spm.SentencePieceTrainer.Train``. Returns ``None``.
        """
        model_path = self.sentencepiece_model_path
        if model_path.exists():
            # A model file is already present at the target path: skip training.
            return

        model_dir = model_path.parent
        model_dir.mkdir(parents=True, exist_ok=True)
        # Value for --model_prefix: the model's directory joined with the file
        # name stripped of its last suffix.
        model_prefix = model_dir / model_path.stem

        comma_separated_inputs = ','.join(str(path) for path in self.input_filepaths)
        trainer_flags = [
            f'--input={comma_separated_inputs}',
            f'--model_prefix={model_prefix}',
            f'--vocab_size={self.vocab_size}',
        ]

        # When the inputs hold more lines than this cap in total, ask the
        # trainer to work on a shuffled sample of that many sentences.
        line_cap = 10**6
        total_lines = 0
        for filepath in self.input_filepaths:
            total_lines += count_lines(filepath)
        if total_lines > line_cap:
            trainer_flags.append(f'--input_sentence_size={line_cap}')
            trainer_flags.append('--shuffle_input_sentence=true')

        spm.SentencePieceTrainer.Train(' '.join(trainer_flags))