data/timit/prepare.py (133 lines of code) (raw):
"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.
This source code is licensed under the MIT-style license found in the
LICENSE file in the root directory of this source tree.
----------
Script to package original Timit dataset into a form readable in
wav2letter++ pipelines
Please install `sph2pipe` yourself - see
https://www.ldc.upenn.edu/language-resources/tools/sphere-conversion-tools
It can be built with the following commands:
wget https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/ctools/sph2pipe_v2.5.tar.gz
tar -xzf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
gcc -o sph2pipe *.c -lm
Command : python3 prepare.py --src [...]/timit --dst [...] \
--sph2pipe [...]/sph2pipe_v2.5/sph2pipe
Replace [...] with appropriate paths
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import os
import subprocess
from multiprocessing import Pool

import numpy
import sox
from tqdm import tqdm
def copy_to_flac(info):
    """Convert one TIMIT utterance to FLAC and collect its transcriptions.

    The NIST SPHERE audio is first decoded to a temporary WAV file with
    ``sph2pipe`` and then re-encoded as 16-bit signed-integer FLAC with sox.
    The conversion is skipped when the output FLAC file already exists.

    Args:
        info: tuple ``(src, name, dst, idx, sph2pipe)`` where ``src`` is the
            dataset root, ``name`` the audio path relative to ``src``,
            ``dst`` the output directory, ``idx`` the integer used to name
            the output file (``"%09d.flac"``) and ``sph2pipe`` the path to
            the ``sph2pipe`` executable.

    Returns:
        Tuple ``(name, path, duration, words, phones)``: the input ``name``,
        the FLAC path, its duration in milliseconds, and the space-joined
        tokens of the ``.WRD`` and ``.PHN`` transcription files.

    Raises:
        subprocess.CalledProcessError: if ``sph2pipe`` exits with an error.
        RuntimeError: if the decoded audio has zero duration.
        ValueError: if a transcription line does not have three fields.
    """
    src, name, dst, idx, sph2pipe = info
    original_path = os.path.join(src, name)
    path = os.path.join(dst, "%09d" % idx) + ".flac"

    if not os.path.exists(path):
        # One temp file per worker process, so parallel workers never collide.
        tmp_file = os.path.join(dst, "{pid}_tmp.wav".format(pid=os.getpid()))
        try:
            # Argument list (no shell): paths with spaces or shell
            # metacharacters are passed through verbatim, and a failing
            # sph2pipe raises instead of being silently ignored.
            subprocess.check_call([sph2pipe, "-f", "wav", original_path, tmp_file])
            tmp_duration = sox.file_info.duration(tmp_file)
            if not tmp_duration or tmp_duration <= 0:
                raise RuntimeError(
                    "Audio file {} duration is zero.".format(original_path)
                )
            sox_tfm = sox.Transformer()
            sox_tfm.set_output_format(
                file_type="flac", encoding="signed-integer", bits=16
            )
            sox_tfm.build(tmp_file, path)
        except Exception:
            # Drop a partially written FLAC so that a rerun does not mistake
            # it for a finished conversion and skip it.
            if os.path.exists(path):
                os.remove(path)
            raise
        finally:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    duration = sox.file_info.duration(path) * 1000  # milliseconds

    # Transcriptions sit next to the audio file and share its base name.
    # Prefer the same letter case as the audio extension (".WAV" -> ".PHN",
    # ".wav" -> ".phn"), falling back to the other case if only it exists.
    base, ext = os.path.splitext(original_path)
    transcripts = {}
    for target_type in (".PHN", ".WRD"):
        candidates = [base + target_type, base + target_type.lower()]
        if not ext.isupper():
            candidates.reverse()
        target_file = next(
            (cand for cand in candidates if os.path.exists(cand)), candidates[0]
        )

        targets = []
        with open(target_file, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank (e.g. trailing) lines.
                    continue
                if len(fields) != 3:
                    # Each entry must be "<start> <end> <token>".
                    raise ValueError(
                        "Something wrong with {} file".format(target_file)
                    )
                targets.append(fields[2])
        transcripts[target_type] = " ".join(targets)

    return (name, path, duration, transcripts[".WRD"], transcripts[".PHN"])
if __name__ == "__main__":
    # Script entry point: parse the command line, convert every utterance of
    # the train/valid/test splits to FLAC, and write the list and text files.
    parser = argparse.ArgumentParser(description="Timit Dataset creation.")
    parser.add_argument(
        "--src", help="Source directory with downloaded and unzipped TIMIT data"
    )
    parser.add_argument("--dst", help="destination directory", default="./timit")
    parser.add_argument(
        "-p", "--process", help="# of process for Multiprocessing", default=8, type=int
    )
    parser.add_argument(
        "--sph2pipe",
        help="path to sph2pipe executable",
        default="./sph2pipe_v2.5/sph2pipe",
    )
    args = parser.parse_args()

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if args.src is None or not os.path.isdir(args.src):
        parser.error("Timit directory is not found - '{d}'".format(d=args.src))
    if not os.path.exists(args.sph2pipe):
        parser.error("sph2pipe not found '{d}'".format(d=args.sph2pipe))

    # phones.txt and the <split>.lst files are read from this script's directory.
    current_dir = os.path.dirname(__file__)
    audio_path = os.path.join(args.dst, "audio")
    text_path = os.path.join(args.dst, "text")
    lists_path = os.path.join(args.dst, "lists")
    for out_dir in (audio_path, text_path, lists_path):
        os.makedirs(out_dir, exist_ok=True)

    # read phone tokens (whitespace-separated, any number per line)
    in_phn_path = os.path.join(current_dir, "phones.txt")
    with open(in_phn_path, "r") as f_phones:
        phones = {tkn for line in f_phones for tkn in line.split()}
    if len(phones) != 61:
        raise ValueError(
            "Wrong number of phones, should be 61 instead of {}".format(len(phones))
        )

    # The data is expected under either an upper-case or a lower-case
    # subdirectory; relative paths from the list files are upper-cased to
    # match when the upper-case one is present.
    upper_case = os.path.exists(os.path.join(args.src, "TIMIT"))
    if not upper_case and not os.path.exists(os.path.join(args.src, "timit")):
        raise FileNotFoundError(
            "TIMIT data are corrupted, there is no TIMIT or timit subdirectory"
        )

    def process_path(path, upper_case):
        """Return ``path`` upper-cased when ``upper_case`` is true, else unchanged."""
        return path.upper() if upper_case else path

    # prepare audio, text and lists
    for ds_type in ["train", "valid", "test"]:
        print("Writing TIMIT {} data part".format(ds_type), flush=True)

        data_list = os.path.join(current_dir, ds_type + ".lst")
        with open(data_list, "r") as f_paths:
            src_audio_files = [
                process_path(os.path.join("timit", fname.strip()), upper_case)
                for fname in f_paths
                if fname.strip() != ""
            ]

        ds_dst = os.path.join(audio_path, ds_type)
        os.makedirs(ds_dst, exist_ok=True)

        # One job tuple per utterance; the index becomes the FLAC file name.
        n_samples = len(src_audio_files)
        jobs = [
            (args.src, fname, ds_dst, idx, args.sph2pipe)
            for idx, fname in enumerate(src_audio_files)
        ]
        with Pool(args.process) as p:
            # imap preserves input order, so samples_info lines up with jobs.
            samples_info = list(tqdm(p.imap(copy_to_flac, jobs), total=n_samples))

        # Validate every phone transcription before opening the output files,
        # so a bad sample does not leave truncated lists behind.
        for sample in samples_info:
            if set(sample[4].split(" ")) - phones:
                raise ValueError(
                    "Wrong phones in the transcription for sample {}".format(sample[0])
                )

        with open(
            os.path.join(lists_path, "{}.phn.lst".format(ds_type)), "w"
        ) as flist, open(
            os.path.join(lists_path, "{}.lst".format(ds_type)), "w"
        ) as fwlist, open(
            os.path.join(text_path, "{}.phn.txt".format(ds_type)), "w"
        ) as ftlist, open(
            os.path.join(text_path, "{}.txt".format(ds_type)), "w"
        ) as ftwlist:
            for name, flac_path, duration, words, phn in samples_info:
                # List files: id, audio path, duration (ms), transcription.
                flist.write(
                    "{}\t{}\t{}\t{}\n".format(name, flac_path, duration, phn)
                )
                fwlist.write(
                    "{}\t{}\t{}\t{}\n".format(name, flac_path, duration, words)
                )
                # Text files: transcription only, one utterance per line.
                ftlist.write("{}\n".format(phn))
                ftwlist.write("{}\n".format(words))

    print("Done!", flush=True)