recipes/conv_glu/wsj/prepare.py
"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.
This source code is licensed under the MIT-style license found in the
LICENSE file in the root directory of this source tree.
----------
Script to prepare the recipe to train/eval a model on WSJ in wav2letter++ pipelines
Please install `sph2pipe` on your own
(see https://www.ldc.upenn.edu/language-resources/tools/sphere-conversion-tools),
e.g. with the commands:
wget https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/ctools/sph2pipe_v2.5.tar.gz
tar -xzf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
gcc -o sph2pipe *.c -lm
Command: python3 prepare.py --wsj0 [...]/WSJ0/media \
    --wsj1 [...]/WSJ1/media --data_dst [...] --model_dst [...] \
    --sph2pipe [...]/sph2pipe_v2.5/sph2pipe --kenlm [...]
Replace [...] with appropriate paths
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import os
import re
from collections import defaultdict
import numpy


def get_spelling(word):
    """Return the character spelling of a transcript word for the lexicon."""
    spelling = re.sub(r"\(\S+\)", "", word)  # drop "(...)" markers: not pronounced
    spelling = re.sub(r'[,\.:\-/&\?\!\(\)";\{\}\_#]+', "", spelling)  # strip punctuation
    if word == "'single-quote":
        # the apostrophe is the word being named, not part of its spelling
        spelling = spelling.replace("'", "")
    return spelling
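
# Illustration (hypothetical inputs): get_spelling("don't(2)") -> "don't",
# get_spelling("them.") -> "them".
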
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="WSJ dataset creation.")
parser.add_argument("--wsj0", help="top level directory containing all WSJ0 discs")
parser.add_argument("--wsj1", help="top level directory containing all WSJ1 discs")
parser.add_argument(
"--data_dst", help="data destination directory", default="./wsj"
)
    parser.add_argument(
        "--model_dst",
        help="model auxiliary files destination directory",
        default="./conv_glu_wsj_char",
    )
parser.add_argument(
"--wsj1_type",
help="if you are using larger corpus LDC94S13A, set parameter to `LDC94S13A`",
default="LDC94S13B",
)
parser.add_argument(
"--sph2pipe",
help="path to sph2pipe executable",
default="./sph2pipe_v2.5/sph2pipe",
)
parser.add_argument("--kenlm", help="location to installed kenlm directory")
parser.add_argument(
"-p", "--process", help="# of process for Multiprocessing", default=8, type=int
)
args = parser.parse_args()
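    # Delegate raw WSJ preparation (sph -> wav conversion, list and text files)
    # to the shared data/wsj/prepare.py script in this repository.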
os.system(
"python3 {}/../../../data/wsj/prepare.py "
"--wsj0 {} --wsj1 {} --sph2pipe {} --wsj1_type {} --dst {} -p {}".format(
os.path.dirname(os.path.abspath(__file__)),
args.wsj0,
args.wsj1,
args.sph2pipe,
args.wsj1_type,
args.data_dst,
args.process,
)
)
lists_path = os.path.join(args.data_dst, "lists")
am_path = os.path.join(args.model_dst, "am")
lm_data_path = os.path.join(args.data_dst, "text/lm.txt")
decoder_path = os.path.join(args.model_dst, "decoder")
os.makedirs(am_path, exist_ok=True)
os.makedirs(decoder_path, exist_ok=True)
# Generating am/*
print("Generating tokens.txt for acoustic model training", flush=True)
with open(os.path.join(am_path, "tokens.txt"), "w") as f_tokens:
f_tokens.write("|\n")
f_tokens.write("'\n")
for alphabet in range(ord("a"), ord("z") + 1):
f_tokens.write(chr(alphabet) + "\n")
print(
"Generating lexicon.txt (word -> tokens) for acoustic model training",
flush=True,
)
# words used in training/eval to prepare spelling
words_set = set()
# words from lm data and train transcription for decoder
lexicon_dict = defaultdict(int)
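    # Each list line is "<sample id> <audio path> <length> <transcription ...>",
    # so the transcription words start at field 3.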
for name in ["si284", "nov93dev"]:
with open(os.path.join(lists_path, name + ".lst"), "r") as flist:
for line in flist:
transcription = line.strip().split(" ")[3:]
words_set.update(transcription)
if name == "si284":
for word in transcription:
lexicon_dict[word] += 1
print(
"Writing lexicon file - {}...".format(
os.path.join(am_path, "lexicon_si284+nov93dev.txt")
),
flush=True,
)
with open(os.path.join(am_path, "lexicon_si284+nov93dev.txt"), "w") as f:
for word in words_set:
spelling = get_spelling(word)
            assert re.match(
                r"^[a-z']+$", spelling
            ), "invalid spelling for word '{}'".format(word)
f.write(
"{word}\t{tokens} |\n".format(
word=word, tokens=" ".join(list(spelling))
)
)
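    # Each lexicon entry maps a word to its space-separated characters followed
    # by the word-boundary token, e.g. (hypothetical) "them\tt h e m |".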
# Generating decoder/*
# prepare lexicon word -> tokens spelling
# write words to lexicon.txt file
print("Generating lexicon.txt (word -> tokens) for decoding", flush=True)
lex_file = os.path.join(decoder_path, "lexicon.txt")
print("Writing lexicon file - {}...".format(lex_file), flush=True)
with open(lex_file, "w") as f, open(lm_data_path, "r") as f_lm:
for line in f_lm:
for word in line.strip().split(" "):
lexicon_dict[word] += 1
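        # sort words by descending corpus frequency so common words come first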
sorted_indices = numpy.argsort(list(lexicon_dict.values()))[::-1]
words = list(lexicon_dict.keys())
for index in sorted_indices:
spelling = get_spelling(words[index])
if re.match("^[a-z']+$", spelling):
f.write("{w}\t{s} |\n".format(w=words[index], s=" ".join(spelling)))
else:
                print('Ignoring word "{}" in lexicon'.format(words[index]))
# Train 4-gram language model
train_data = os.path.join(decoder_path, "lm+si284.txt")
os.system(
"cp {lm_data} {dst} && cat {trans} >> {dst}".format(
lm_data=lm_data_path,
dst=train_data,
trans=os.path.join(args.data_dst, "text/si284.txt"),
)
)
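    # KenLM tools: lmplz estimates an ARPA LM; build_binary converts the ARPA
    # file into KenLM's binary format for faster loading at decode time.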
lmplz = os.path.join(args.kenlm, "build", "bin", "lmplz")
binary = os.path.join(args.kenlm, "build", "bin", "build_binary")
lm_file = os.path.join(decoder_path, "lm-4g")
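    # lmplz flags: -T temp-file dir, -S memory budget, -o 4 for a 4-gram model;
    # --discount_fallback avoids estimation failures on sparse n-gram counts.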
cmd = "{bin} -T /tmp -S 10G --discount_fallback -o 4 --text {file} > {lm_file}.arpa"
os.system(cmd.format(bin=lmplz, lm_file=lm_file, file=train_data))
os.system("{bin} {lm_file}.arpa {lm_file}.bin".format(bin=binary, lm_file=lm_file))
print("Done!", flush=True)