recipes/self_training/librispeech/lm/prepare_seq2seq_dict.py:

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. ---------- Script to prepare dictionary for running experiments with Librispeech datasets in wav2letter++ pipelines Please run prepare_data.py first to generate all the required file lists. Please make sure sentencepiece (https://github.com/google/sentencepiece) is installed. Command : python3 prepare_seq2seq_dict.py --src [...] --dst [...] Replace [...] with appropriate paths """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import os import sys import sentencepiece as spm import utils if __name__ == "__main__": parser = argparse.ArgumentParser(description="Librispeech dictionary creation.") parser.add_argument("--src", help="source directory (where *.lst files are)") parser.add_argument("--dst", help="destination directory", default="./librispeech") args = parser.parse_args() filelists = { "train": [ "train-clean-100", # "train-clean-360", # "train-other-500" ], "dev": ["dev-clean", "dev-other"], } num_wordpieces = 5000 nbest = 10 prefix = "librispeech-train-all-unigram-{}".format(num_wordpieces) prefix = os.path.join(args.dst, prefix) textfile = os.path.join(args.dst, "train-all.text") model = prefix + ".model" vocab = prefix + ".vocab" # prepare data sys.stdout.write("preparing data...\n") sys.stdout.flush() train_text = utils.read_list(args.src, filelists["train"]) dev_text = utils.read_list(args.src, filelists["dev"]) with open(textfile, "w") as f: for line in train_text: f.write(line) f.write("\n") word_dict = set() for line in train_text + dev_text: words = line.split() for w in words: word_dict.add(w) word_dict = sorted(word_dict) # train sys.stdout.write("computing word pieces...\n") sys.stdout.flush() train_cmd = "--input={input} --model_prefix={prefix} --vocab_size={sz} ".format( input=textfile, prefix=prefix, sz=num_wordpieces ) train_cmd = ( train_cmd + "--character_coverage=1.0 --model_type=unigram --split_by_unicode_script=false" ) spm.SentencePieceTrainer.Train(train_cmd) # word piece dictionary sys.stdout.write("creating word piece list...\n") exclude_list = {"<unk>", "<s>", "</s>"} with open(vocab + "-filtered", "w") as o: with open(vocab, "r") as f: for line in f: v, _ = line.strip().split("\t", 1) if v not in exclude_list: o.write(v.replace("\u2581", "_")) o.write("\n") # word -> word piece lexicon for loading targets sys.stdout.write("creating word -> word pieces lexicon...\n") sys.stdout.flush() sp = spm.SentencePieceProcessor() sp.Load(model) outfile = "librispeech-train+dev-unigram-{sz}-nbest{n}.dict".format( sz=num_wordpieces, n=nbest ) with open(os.path.join(args.dst, outfile), "w") as f: for word in word_dict: wps = sp.NBestEncodeAsPieces(word, nbest) for wp in wps: f.write(word) for w in wp: f.write(" " + w.replace("\u2581", "_")) f.write("\n") sys.stdout.write("Done !\n")