data/timit/prepare.py (133 lines of code) (raw):

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. ---------- Script to package original Timit dataset into a form readable in wav2letter++ pipelines Please install `sph2pipe` on your own - see https://www.ldc.upenn.edu/language-resources/tools/sphere-conversion-tools \ with commands : wget https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/ctools/sph2pipe_v2.5.tar.gz tar -xzf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5 gcc -o sph2pipe *.c -lm Command : python3 prepare.py --src [...]/timit --dst [...] \ --sph2pipe [...]/sph2pipe_v2.5/sph2pipe Replace [...] with appropriate paths """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import os from multiprocessing import Pool import numpy import sox from tqdm import tqdm def copy_to_flac(info): src, name, dst, idx, sph2pipe = info original_path = os.path.join(src, name) path = os.path.join(dst, "%09d" % idx) + ".flac" if not os.path.exists(path): tmp_file = os.path.join(dst, "{pid}_tmp.wav".format(pid=os.getpid())) os.system( "{sph} -f wav {i} {o}".format(sph=sph2pipe, i=original_path, o=tmp_file) ) assert ( sox.file_info.duration(tmp_file) > 0 ), "Audio file {} duration is zero.".format(original_path) sox_tfm = sox.Transformer() sox_tfm.set_output_format(file_type="flac", encoding="signed-integer", bits=16) sox_tfm.build(tmp_file, path) os.remove(tmp_file) duration = sox.file_info.duration(path) * 1000 # miliseconds transcripts = dict() for target_type in [".PHN", ".WRD"]: targets = [] target_file = original_path.replace(".WAV", target_type) with open(target_file, "r") as f: for line in f: start, end, token = line.strip().split() assert start and end and token, "Something wrong with {} file".format( target_file ) targets.append(token) transcripts[target_type] = " ".join(targets) return (name, path, duration, transcripts[".WRD"], transcripts[".PHN"]) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Timit Dataset creation.") parser.add_argument( "--src", help="Source directory with downloaded and unzipped TIMIT data" ) parser.add_argument("--dst", help="destination directory", default="./timit") parser.add_argument( "-p", "--process", help="# of process for Multiprocessing", default=8, type=int ) parser.add_argument( "--sph2pipe", help="path to sph2pipe executable", default="./sph2pipe_v2.5/sph2pipe", ) args = parser.parse_args() assert os.path.isdir(str(args.src)), "Timit directory is not found - '{d}'".format( d=args.src ) assert os.path.exists(args.sph2pipe), "sph2pipe not found '{d}'".format( d=args.sph2pipe ) current_dir = os.path.dirname(__file__) audio_path = os.path.join(args.dst, "audio") text_path = os.path.join(args.dst, "text") lists_path = os.path.join(args.dst, "lists") os.makedirs(audio_path, exist_ok=True) os.makedirs(text_path, exist_ok=True) os.makedirs(lists_path, exist_ok=True) # read phone tokens phones = [] in_phn_path = os.path.join(current_dir, "phones.txt") with open(in_phn_path, "r") as f_phones: phones = [[tkn.strip() for tkn in line.split()] for line in f_phones] phones = set(numpy.concatenate(phones)) assert ( len(phones) == 61 ), "Wrong number of phones, should be 61 instrad of {}".format(len(phones)) assert os.path.exists(os.path.join(args.src, "timit")) or os.path.exists( os.path.join(args.src, "TIMIT") ), "TIMIT data are corrupted, there is no TIMIT or timit subdirectory" upper_case = True if os.path.exists(os.path.join(args.src, "TIMIT")) else False def process_path(path, upper_case): return path.upper() if upper_case else path # prepare audio, text and lists for ds_type in ["train", "valid", "test"]: print("Writing TIMIT {} data part".format(ds_type), flush=True) data_list = os.path.join(current_dir, ds_type + ".lst") with open(data_list, "r") as f_paths: src_audio_files = [ process_path(os.path.join("timit", fname.strip()), upper_case) for fname in f_paths if fname.strip() != "" ] ds_dst = os.path.join(audio_path, ds_type) os.makedirs(ds_dst, exist_ok=True) n_samples = len(src_audio_files) with Pool(args.process) as p: samples_info = list( tqdm( p.imap( copy_to_flac, zip( [args.src] * n_samples, src_audio_files, [ds_dst] * n_samples, numpy.arange(n_samples), [args.sph2pipe] * n_samples, ), ), total=n_samples, ) ) with open( os.path.join(lists_path, "{}.phn.lst".format(ds_type)), "w" ) as flist, open( os.path.join(lists_path, "{}.lst".format(ds_type)), "w" ) as fwlist, open( os.path.join(text_path, "{}.phn.txt".format(ds_type)), "w" ) as ftlist, open( os.path.join(text_path, "{}.txt".format(ds_type)), "w" ) as ftwlist: for sample in samples_info: flist.write( "{}\t{}\t{}\t{}\n".format( sample[0], sample[1], sample[2], sample[4] ) ) fwlist.write( "{}\t{}\t{}\t{}\n".format( sample[0], sample[1], sample[2], sample[3] ) ) assert ( len(set(sample[4].split(" ")) - phones) == 0 ), "Wrong phones in the transcription for sample {}".format(sample[0]) ftlist.write("{}\n".format(sample[4])) ftwlist.write("{}\n".format(sample[3])) print("Done!", flush=True)