recipes/lexicon_free/wsj/prepare.py

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. ---------- Script to prepare recipe to train/eval model on Librispeech in wav2letter++ pipelines Please install `sph2pipe` on your own - see https://www.ldc.upenn.edu/language-resources/tools/sphere-conversion-tools \ with commands : wget https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/ctools/sph2pipe_v2.5.tar.gz tar -xzf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5 gcc -o sph2pipe *.c -lm Command : python3 prepare_data.py --wsj0 [...]/WSJ0/media \ --wsj1 [...]/WSJ1/media --data_dst [...] --model_dst [...] --sph2pipe [...]/sph2pipe_v2.5/sph2pipe Replace [...] with appropriate paths """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import functools import os import re import sys from collections import defaultdict import numpy FILE_DIR = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, os.path.join(FILE_DIR, "../utilities")) from utils import convert_words_to_letters_asg_rep2 # FILE = __file__ def compare(x, y): # sort by counts, if counts equal then sort in lex order if x[1] > y[1]: return -1 elif x[1] == y[1]: if x[0] < y[0]: return -1 else: return 1 else: return 1 def remap_words_with_same_spelling(data_dst, decoder_dst): words_dict = defaultdict(int) spellings_dict = defaultdict(set) spellings_appearence_dict = defaultdict(int) with open(os.path.join(data_dst, "lists/si284.lst"), "r") as flist: for line in flist: for word in line.strip().split(" ")[3:]: word = re.sub(r"\(\S+\)", "", word) # not pronounced words_dict[word] += 1 spelling = re.sub("[^a-z'.]+", "", word) spellings_dict[spelling].update([word]) spellings_appearence_dict[spelling] += 1 with open(os.path.join(data_dst, "text/lm.txt"), "r") as flm: for line in flm: for word in line.strip().split(" "): word = re.sub(r"\(\S+\)", "", word) # not pronounced spelling = re.sub("[^a-z'.]+", "", word) spellings_dict[spelling].update([word]) spellings_appearence_dict[spelling] += 1 sorted_spellings = sorted( spellings_appearence_dict.items(), key=functools.cmp_to_key(compare) ) special_mapping = {"al": "al-", "st": "st", "nd": "nd", "rd": "rd"} remap_result = dict() with open(os.path.join(decoder_dst, "dict-remap.txt"), "w") as fmap: for spelling, _ in sorted_spellings: words_count = {w: words_dict[w] for w in spellings_dict[spelling]} sorted_words = sorted( words_count.items(), key=functools.cmp_to_key(compare) ) for word, _ in sorted_words: remap_result[word] = ( sorted_words[0][0] if spelling not in special_mapping else special_mapping[spelling] ) fmap.write("{} {}\n".format(word, remap_result[word])) return remap_result def get_spelling(word): spelling = re.sub(r"\(\S+\)", "", word) # not pronounced spelling = re.sub("[^a-z'.]+", "", spelling) return spelling if __name__ == "__main__": parser = argparse.ArgumentParser(description="Librispeech Dataset creation.") parser.add_argument("--wsj0", help="top level directory containing all WSJ0 discs") parser.add_argument("--wsj1", help="top level directory containing all WSJ1 discs") parser.add_argument( "--data_dst", help="data destination directory", default="./wsj" ) parser.add_argument( "--model_dst", help="model auxilary files destination directory", default="./conv_glu_librispeech_char", ) parser.add_argument( "--wsj1_type", help="if you are using larger corpus LDC94S13A, set parameter to `LDC94S13A`", 
default="LDC94S13B", ) parser.add_argument( "--sph2pipe", help="path to sph2pipe executable", default="./sph2pipe_v2.5/sph2pipe", ) parser.add_argument("--kenlm", help="location to installed kenlm directory") parser.add_argument( "-p", "--process", help="# of process for Multiprocessing", default=8, type=int ) args = parser.parse_args() os.system( "python3 {}/../../../data/wsj/prepare.py " "--wsj0 {} --wsj1 {} --sph2pipe {} --wsj1_type {} --dst {} -p {}".format( os.path.dirname(os.path.abspath(__file__)), args.wsj0, args.wsj1, args.sph2pipe, args.wsj1_type, args.data_dst, args.process, ) ) lists_path = os.path.join(args.data_dst, "lists") am_path = os.path.join(args.model_dst, "am") lm_data_path = os.path.join(args.data_dst, "text/lm.txt") decoder_path = os.path.join(args.model_dst, "decoder") os.makedirs(am_path, exist_ok=True) os.makedirs(decoder_path, exist_ok=True) # Generating am/* print("Generating tokens.lst for acoustic model training", flush=True) with open(os.path.join(am_path, "tokens.lst"), "w") as f_tokens: f_tokens.write("|\n") f_tokens.write("'\n") f_tokens.write(".\n") for alphabet in range(ord("a"), ord("z") + 1): f_tokens.write(chr(alphabet) + "\n") print( "Generating lexicon.lst (word -> tokens) for acoustic model training", flush=True, ) # generating remapping for words: # among words with the same spelling take the most frequent word # use only this word in the lexicon # do this substitution for the dev during # acoustic model training for WER computation remap_dict = remap_words_with_same_spelling(args.data_dst, decoder_path) with open(os.path.join(lists_path, "si284.lst"), "r") as fin, open( os.path.join(am_path, "si284.lst.remap"), "w" ) as fout: for line in fin: line = line.strip().split(" ") for index in range(3, len(line)): word = re.sub(r"\(\S+\)", "", line[index]) line[index] = remap_dict[word] fout.write(" ".join(line) + "\n") # words used in training/eval to prepare spelling words_set = set() for name in [ os.path.join(am_path, "si284.lst.remap"), os.path.join(lists_path, "nov93dev.lst"), ]: with open(name, "r") as flist: for line in flist: transcription = line.strip().split(" ")[3:] words_set.update(transcription) print( "Writing lexicon file - {}...".format( os.path.join(am_path, "lexicon_si284+nov93dev.txt") ), flush=True, ) with open(os.path.join(am_path, "lexicon_si284+nov93dev.txt"), "w") as f: for word in words_set: spelling = get_spelling(word) assert re.match( r"[a-z'.]+", spelling ), "invalid spelling for word '{}'".format(word) f.write( "{word}\t{tokens} |\n".format( word=word, tokens=" ".join(list(spelling)) ) ) # Generating decoder/* # prepare lexicon word -> tokens spelling # write words to lexicon.txt file print("Generating lexicon.txt (word -> tokens) for decoding", flush=True) lex_file = os.path.join(decoder_path, "lexicon.lst") print("Writing lexicon file - {}...".format(lex_file), flush=True) with open(lex_file, "w") as f: for word in numpy.unique(list(remap_dict.values())): if len(re.findall(r"\d", word)) > 0: continue spelling = get_spelling(word) if spelling != "": if re.match("^[a-z'.]+$", spelling): f.write("{w}\t{s} |\n".format(w=word, s=" ".join(spelling))) else: print('Ignore word "{}" in lexicon'.format(word)) # Prepare data for char lm training/evaluation if os.path.exists(os.path.join(decoder_path, "char_lm_data.train")): print( "Skip generation of {}. 

    # Prepare data for char lm training/evaluation
    if os.path.exists(os.path.join(decoder_path, "char_lm_data.train")):
        print(
            "Skip generation of {}. Please remove the file to regenerate it".format(
                os.path.join(decoder_path, "char_lm_data.train")
            )
        )
    else:
        convert_words_to_letters_asg_rep2(
            os.path.join(args.data_dst, "text/lm.txt"),
            os.path.join(decoder_path, "char_lm_data.train"),
        )
    convert_words_to_letters_asg_rep2(
        os.path.join(args.data_dst, "text/nov93dev.txt"),
        os.path.join(decoder_path, "char_lm_data.nov93dev"),
    )

    with open(os.path.join(args.data_dst, "text/nov93dev.txt"), "r") as f, \
            open(os.path.join(decoder_path, "word_lm_data.nov93dev"), "w") as fout:
        for line in f:
            result = []
            for word in line.strip().split(" "):
                word = re.sub("[^a-z'.]+", "", word)
                if word != "":
                    result.append(word)
            fout.write(" ".join(result) + "\n")

    print("Done!", flush=True)
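
# For reference, with the paths used above a successful run is expected to
# leave (relative to --model_dst):
#   am/tokens.lst, am/lexicon_si284+nov93dev.txt, am/si284.lst.remap,
#   decoder/dict-remap.txt, decoder/lexicon.lst,
#   decoder/char_lm_data.train, decoder/char_lm_data.nov93dev,
#   decoder/word_lm_data.nov93dev
# in addition to the lists/ and text/ data written into --data_dst by
# data/wsj/prepare.py.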