# recipes/local_prior_match/librispeech/prepare_unpaired.py (54 lines of code) (raw):
"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.
This source code is licensed under the MIT-style license found in the
LICENSE file in the root directory of this source tree.
----------
Script to prepare unpaired data for training a model with local prior matching
Command : python3 prepare_unpaired.py --data_dst [...] --model_dst [...]
Replace [...] with appropriate paths
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import os
# Subsets whose "<name>.lst" / "<name>-viterbi.out" files are processed.
UNPAIRED_SUBSETS = ("train-clean-360", "train-other-500")

# Prefix of the reference-length tokens ("reflen1", "reflen2", ... "reflen100").
REFLEN_PREFIX = "reflen"

ORIG_LEXICON = "librispeech-paired-train+dev-unigram-5000-nbest10.lexicon"
LPM_LEXICON = (
    "librispeech-paired-train-unpaired-viterbi+dev-unigram-5000-nbest10.lexicon"
)


def _load_audio_index(list_path):
    """Read a ``.lst`` file and map each file tag to (audio path, audio length).

    Each non-blank line must hold at least three space-separated fields:
    file tag, audio path and audio length. Anything after the third field
    is ignored, so entries with an empty trailing field are accepted.
    If a tag occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    audio_index = {}
    with open(list_path, "r") as flist:
        for line_no, raw_line in enumerate(flist, 1):
            line = raw_line.strip()
            if not line:
                continue
            fields = line.split(" ", 3)
            if len(fields) < 3:
                raise ValueError(
                    "%s:%d: expected '<tag> <audio path> <audio length> ...', "
                    "got %r" % (list_path, line_no, line)
                )
            file_tag, audio_path, audio_length = fields[:3]
            audio_index[file_tag] = (audio_path, audio_length)
    return audio_index


def _write_lpm_list(viterbi_path, lpm_list_path, audio_index):
    """Write the ``-lpm.lst`` file and return the set of reflen tokens seen.

    Every non-blank line of ``viterbi_path`` is "<file tag> <reflen token>".
    The output line is "<file tag> <audio path> <audio length> <reflen token>",
    with path and length looked up in ``audio_index``.

    Raises:
        KeyError: if a file tag is not present in ``audio_index``.
        ValueError: if a non-blank line does not contain two fields.
    """
    reflens = set()
    # The input is opened first so a missing viterbi file creates no output.
    with open(viterbi_path, "r") as fdata, open(lpm_list_path, "w") as fout:
        for raw_line in fdata:
            line = raw_line.strip()
            if not line:
                continue
            file_tag, reflen = line.split(" ", 1)
            audio_path, audio_length = audio_index[file_tag]
            fout.write(
                "%s %s %s %s\n" % (file_tag, audio_path, audio_length, reflen)
            )
            reflens.add(reflen)
    return reflens


def _reflen_count(token):
    """Return the integer N encoded in a "reflenN" token.

    Raises:
        ValueError: if the token lacks the "reflen" prefix or N is not an
            integer.
    """
    if not token.startswith(REFLEN_PREFIX):
        raise ValueError(
            "expected a token of the form '%s<N>', got %r" % (REFLEN_PREFIX, token)
        )
    return int(token[len(REFLEN_PREFIX):])


def _write_lpm_lexicon(orig_lexicon_path, lpm_lexicon_path, reflens):
    """Copy the original lexicon and append one entry per reflen token.

    The entry for "reflenN" is the token, a tab, and N space-separated "a"
    symbols. Entries are written in increasing N so the output does not
    depend on set iteration order.
    """
    # Parse every token before touching the output file, so a malformed token
    # does not leave a truncated lexicon behind.
    entries = sorted((_reflen_count(token), token) for token in reflens)
    with open(orig_lexicon_path, "r") as fin, open(lpm_lexicon_path, "w") as fout:
        last_line = "\n"
        for line in fin:
            fout.write(line)
            last_line = line
        # Keep the first appended entry from being glued to an unterminated
        # last line of the original lexicon.
        if not last_line.endswith("\n"):
            fout.write("\n")
        for count, token in entries:
            fout.write(token + "\t" + " ".join(["a"] * count) + "\n")


def main(argv=None):
    """Prepare the unpaired lists and the extended lexicon.

    Args:
        argv: command-line arguments excluding the program name; ``None``
            makes argparse read them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Librispeech Dataset creation.")
    parser.add_argument(
        "--data_dst", help="data destination directory", default="./librispeech"
    )
    parser.add_argument(
        "--model_dst",
        help="model auxilary files destination directory",
        default="./lpm_librispeech",
    )
    args = parser.parse_args(argv)

    lists_path = os.path.join(args.data_dst, "lists")
    am_path = os.path.join(args.model_dst, "am")
    unpaired_lists_path = os.path.join(args.model_dst, "lpm_data")

    reflens = set()
    for name in UNPAIRED_SUBSETS:
        audio_index = _load_audio_index(os.path.join(lists_path, name + ".lst"))
        reflens |= _write_lpm_list(
            os.path.join(unpaired_lists_path, name + "-viterbi.out"),
            os.path.join(unpaired_lists_path, name + "-lpm.lst"),
            audio_index,
        )

    # append reflen* to the new lexicon
    _write_lpm_lexicon(
        os.path.join(am_path, ORIG_LEXICON),
        os.path.join(am_path, LPM_LEXICON),
        reflens,
    )

    print("Done!", flush=True)


if __name__ == "__main__":
    main()