recipes/utilities/prepare_librispeech_official_lm.py (51 lines of code) (raw):

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. ---------- Downloading and preparation of official Librispeech 4-gram language model. Please install `kenlm` on your own - https://github.com/kpu/kenlm Command : python3 prepare_librispeech_official_lm.py --dst [...] --kenlm [...]/kenlm/ Replace [...] with appropriate paths """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import os import re if __name__ == "__main__": parser = argparse.ArgumentParser(description="Librispeech official lm creation.") parser.add_argument( "--dst", help="data destination directory", default="./decoder" ) parser.add_argument("--kenlm", help="location to installed kenlm directory") args = parser.parse_args() decoder_path = args.dst os.makedirs(decoder_path, exist_ok=True) # Generating decoder/* lm = "4-gram" assert os.path.isdir(str(args.kenlm)), "kenlm directory not found - '{d}'".format( d=args.kenlm ) print("Downloading Librispeech official LM model...\n", flush=True) arpa_file = os.path.join(decoder_path, lm + ".arpa") if not os.path.exists(arpa_file): os.system( "wget -c -O - http://www.openslr.org/resources/11/{lm}.arpa.gz | " "gunzip -c > {fout}".format(lm=lm, fout=arpa_file) ) else: print("Arpa file {} exist, skip its downloading.".format(arpa_file)) # temporary arpa file in lowercase print("Saving ARPA LM file in binary format ...\n", flush=True) os.system( "cat {arpa} | tr '[:upper:]' '[:lower:]' > {arpa}.tmp".format(arpa=arpa_file) ) binary = os.path.join(args.kenlm, "build", "bin", "build_binary") os.system( "{bin} {farpa}.tmp {fbin}".format( bin=binary, farpa=arpa_file, fbin=arpa_file.replace(".arpa", ".bin") ) ) os.remove(os.path.join(arpa_file + ".tmp")) # prepare lexicon word -> tokens spelling # write words to lexicon.txt file lex_file = os.path.join(decoder_path, "lexicon.txt") print("Writing Lexicon file - {}...".format(lex_file)) with open(lex_file, "w") as f: # get all the words in the arpa file with open(arpa_file, "r") as arpa: for line in arpa: # verify if the line corresponds to unigram if not re.match(r"[-]*[0-9\.]+\t\S+\t*[-]*[0-9\.]*$", line): continue word = line.split("\t")[1] word = word.strip().lower() if word == "<unk>" or word == "<s>" or word == "</s>": continue assert re.match("^[a-z']+$", word), "invalid word - {w}".format(w=word) f.write("{w}\t{s} |\n".format(w=word, s=" ".join(word))) print("Done!", flush=True)