muss/kenlm.py
from pathlib import Path

from tokenizers import SentencePieceBPETokenizer

# Project helpers from the MUSS codebase (muss.utils.helpers).
from muss.utils.helpers import get_temp_filepaths, log_action, read_lines, run_command, write_lines


def train_kenlm_language_model(input_data_paths, output_model_dir):
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        # Train a SentencePiece BPE tokenizer on the raw input files.
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths], vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')  # pre-0.8 tokenizers signature: save(directory, name)
    with log_action('Tokenizing'):
        # Write a space-joined token version of each input file to a temporary path.
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(tokenized_data_paths, input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines([' '.join(encoding.tokens) for encoding in encodings], tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input('Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): ')
        # Pipe the concatenated tokenized files into lmplz to train a 3-gram ARPA model.
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} | {kenlm_path} -o 3 > {output_model_path}'
        )
        run_command(command, mute=False)
    # Clean up the temporary tokenized files.
    for path in tokenized_data_paths:
        path.unlink()
    return output_model_dir
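
A minimal usage sketch, not from the repo: the corpus path and output directory below are hypothetical, and it assumes the kenlm Python bindings (pip install from https://github.com/kpu/kenlm) are available for scoring the resulting ARPA file. Since lmplz scores whitespace-separated tokens, score text tokenized the same way as the training data.

from pathlib import Path

import kenlm

from muss.kenlm import train_kenlm_language_model

# Hypothetical corpus file and output directory.
model_dir = train_kenlm_language_model(
    input_data_paths=[Path('data/corpus.txt')],
    output_model_dir='models/kenlm',
)

# kenlm expects whitespace-separated tokens, matching the space-joined
# BPE tokens the model was trained on (illustrative token string below).
model = kenlm.Model(str(model_dir / 'kenlm_model.arpa'))
print(model.score('▁this ▁is ▁a ▁test', bos=True, eos=True))  # log10 probability

Because the model is trained with -o 3, it stores at most trigram statistics; for repeated scoring, converting the ARPA file with kenlm's build_binary tool makes loading noticeably faster.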