recipes/sota/2019/lm/prepare_wp_data.py (26 lines of code) (raw):

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. ---------- Script to prepare word-piece data for lm training Command : python3 prepare.py --data_src [...] --model_src [...] Replace [...] with appropriate paths """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import os import sentencepiece as spm if __name__ == "__main__": parser = argparse.ArgumentParser(description="LM data preparation.") parser.add_argument("--data_src", help="librispeech data") parser.add_argument("--model_src", help="model auxilary files directory") args = parser.parse_args() sp = spm.SentencePieceProcessor() sp.Load( os.path.join(args.model_src, "am", "librispeech-train-all-unigram-10000.model") ) for name, suffix in zip( ["librispeech-lm-norm.txt.lower.shuffle", "dev-clean.txt", "dev-other.txt"], ["train", "dev-clean", "dev-other"], ): with open(os.path.join(args.data_src, "text", name), "r") as fin, open( os.path.join(args.model_src, "decoder/lm_wp_10k." + suffix), "w" ) as fout: for line in fin: result = "" for word in line.strip().split(" "): wps = sp.NBestEncodeAsPieces(word, 1)[0] result += " ".join([w.replace("\u2581", "_") for w in wps]) + " " fout.write("{}\n".format(result.strip()))