in src/tokenizer.py [0:0]
def _generate_model(self):
    """
    Creates a dataset of processed text files and trains a SentencePiece
    model on it, returning the trained processor.
    """
    # Write the processed text files into a dedicated cache directory.
    dir_path = "cache/sp_tokenizer_data"
    os.makedirs(dir_path, exist_ok=True)
    self._process_data_files(dir_path)
    data_files = list_files(dir_path)

    # Train a SentencePiece model on the processed files, pinning the
    # special-token ids so they match BASE_TOKENS.
    model_cache_prefix = f'cache/sp_tokenizer_{self.vocab_limit}'
    spm.SentencePieceTrainer.train(input=data_files,
                                   model_prefix=model_cache_prefix,
                                   vocab_size=self.vocab_limit,
                                   bos_id=BASE_TOKENS["<s>"],
                                   eos_id=BASE_TOKENS["</s>"],
                                   unk_id=BASE_TOKENS["<unk>"],
                                   pad_id=BASE_TOKENS["<pad>"])

    # Load the trained model from the .model file written by the trainer.
    model = spm.SentencePieceProcessor(model_file=f'{model_cache_prefix}.model')
    return model
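
For context, a minimal sketch of how the processor returned by _generate_model might be used downstream. Only the SentencePieceProcessor calls (encode/decode) come from the sentencepiece API; the model filename here assumes a vocab_limit of 8000 and is purely illustrative.

# Hypothetical usage; the model path is an assumed example, not a value
# taken from src/tokenizer.py.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="cache/sp_tokenizer_8000.model")

ids = sp.encode("an example sentence", out_type=int)      # token ids
pieces = sp.encode("an example sentence", out_type=str)   # subword pieces
text = sp.decode(ids)                                      # round-trip back to text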