recipes/lexicon_free/librispeech/prepare.py:

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. ---------- Script to prepare recipe to train/eval model on Librispeech in wav2letter++ pipelines Command : python3 prepare.py --data_dst [...] --model_dst [...] --kenlm [...]/kenlm/ Replace [...] with appropriate paths """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import os import sys from collections import defaultdict FILE_DIR = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, os.path.join(FILE_DIR, "../utilities")) from utils import convert_words_to_letters_asg_rep2 if __name__ == "__main__": parser = argparse.ArgumentParser(description="Librispeech Dataset creation.") parser.add_argument( "--data_dst", help="data destination directory", default="./librispeech" ) parser.add_argument( "--model_dst", help="model auxilary files destination directory", default="./conv_glu_librispeech_char", ) parser.add_argument("--kenlm", help="location to installed kenlm directory") parser.add_argument( "-p", "--process", help="# of process for Multiprocessing", default=8, type=int ) args = parser.parse_args() os.system( "python3 {}/../../../data/librispeech/prepare.py --dst {} -p {}".format( os.path.dirname(os.path.abspath(__file__)), args.data_dst, args.process ) ) subpaths = { "train": ["train-clean-100", "train-clean-360", "train-other-500"], "dev": ["dev-clean", "dev-other"], "test": ["test-clean", "test-other"], } lists_path = os.path.join(args.data_dst, "lists") am_path = os.path.join(args.model_dst, "am") decoder_path = os.path.join(args.model_dst, "decoder") os.makedirs(am_path, exist_ok=True) os.makedirs(decoder_path, exist_ok=True) # Generating am/* print("Generating tokens.lst for acoustic model training", flush=True) with open(os.path.join(am_path, "tokens.lst"), "w") as fout: fout.write("|\n") fout.write("'\n") for alphabet in range(ord("a"), ord("z") + 1): fout.write(chr(alphabet) + "\n") print( "Generating lexicon.lst (word -> tokens) for acoustic model training", flush=True, ) word_dict = defaultdict(set) for key, names in subpaths.items(): for name in names: with open(os.path.join(lists_path, name + ".lst"), "r") as flist: for line in flist: transcription = line.strip().split(" ")[3:] word_dict[key].update(transcription) lexicon_words = sorted(word_dict["train"] | word_dict["dev"]) with open(os.path.join(am_path, "lexicon_train+dev.lst"), "w") as f: for word in lexicon_words: f.write( "{word}\t{tokens} |\n".format(word=word, tokens=" ".join(list(word))) ) # Prepare data for char lm training/evaluation if os.path.exists(os.path.join(decoder_path, "char_lm_data.train")): print( "Skip generation of {}. 
Please remove the file to regenerate it".format( os.path.join(decoder_path, "char_lm_data.train") ) ) else: convert_words_to_letters_asg_rep2( os.path.join(args.data_dst, "text/librispeech-lm-norm.txt.lower.shuffle"), os.path.join(decoder_path, "char_lm_data.train"), ) convert_words_to_letters_asg_rep2( os.path.join(args.data_dst, "text/dev-clean.txt"), os.path.join(decoder_path, "char_lm_data.dev-clean"), ) convert_words_to_letters_asg_rep2( os.path.join(args.data_dst, "text/dev-other.txt"), os.path.join(decoder_path, "char_lm_data.dev-other"), ) # Download official 4gram model and its lexicon cmd = [ "python3 {}/../../utilities/prepare_librispeech_official_lm.py", "--dst {}", "--kenlm {}", ] os.system( " ".join(cmd).format( os.path.dirname(os.path.abspath(__file__)), decoder_path, args.kenlm ) ) additional_set = { "bennydeck", "fibi", "moling", "balvastro", "hennerberg", "ambrosch", "quilter's", "yokul", "recuperations", "dowle", "buzzer's", "tarrinzeau", "bozzle's", "riverlike", "vendhya", "sprucewood", "macklewain", "macklewain's", "khosala", "derivatively", "gardar", "untrussing", "rathskellers", "telemetering", "drouet's", "sneffels", "glenarvan's", "congal's", "d'avrigny", "rangitata", "wahiti", "presty", "quinci", "troke", "westmere", "saknussemm", "dhourra", "irolg", "bozzle", "boolooroo", "collander", "finnacta", "canyou", "myrdals", "shimerdas", "impara", "synesius's", "brandd", "bennydeck's", "weiser", "noirtier", "verloc", "shimerda", "sudvestr", "frierson's", "bergez", "gwynplaine's", "breadhouse", "mulrady", "shampooer", "ossipon", "shoplets", "delectasti", "herbivore", "lacquey's", "pinkies", "theosophies", "razetta", "magazzino", "yundt", "testbridge", "officinale", "burgoynes", "novatians", "sandyseal", "chaba", "beenie", "congal", "doma", "brau", "mainhall", "verloc's", "zingiber", "vinos", "bush'", "yulka", "bambeday", "darfhulva", "olbinett", "gingle", "nicless", "stupirti", "ossipon's", "skint", "ruggedo's", "tishimingo", "ganny", "delaunay's", "tumble's", "birdikins", "hardwigg", "homoiousios", "docetes", "daguerreotypist", "satisfier", "heuchera", "parrishes", "homoousios", "trampe", "bhunda", "brion's", "fjordungr", "hurstwood", "corncakes", "abalone's", "libano", "scheiler", } with open(os.path.join(decoder_path, "lexicon.txt"), "a") as flex: for word in additional_set: flex.write("{}\t{}\n".format(word, " ".join(list(word)) + " |")) os.rename( os.path.join(decoder_path, "lexicon.txt"), os.path.join(decoder_path, "lexicon.lst"), ) # prepare oov and in vocabulary samples lists decoder_lexicon_words = [] with open(os.path.join(decoder_path, "lexicon.lst"), "r") as flex: for line in flex: decoder_lexicon_words.append(line.strip().split("\t")[0]) decoder_lexicon_words = set(decoder_lexicon_words) for list_name in ["test-clean.lst", "test-other.lst"]: with open(os.path.join(lists_path, list_name), "r") as flist, open( os.path.join(decoder_path, list_name + ".oov"), "w" ) as foov, open(os.path.join(decoder_path, list_name + ".inv"), "w") as finv: for line in flist: sample_words = set(line.strip().split(" ")[3:]) if len(sample_words - decoder_lexicon_words) > 0: foov.write(line) else: finv.write(line) print("Done!", flush=True)
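
Note: every lexicon line this script writes, both in am/lexicon_train+dev.lst and in the entries appended to decoder/lexicon.lst, maps a word to its space-separated letter tokens followed by the word-boundary token "|". A minimal sketch of the format, using a hypothetical word:

    >>> word = "hello"  # illustrative word, not from the recipe
    >>> "{}\t{} |".format(word, " ".join(list(word)))
    'hello\th e l l o |'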