data/librispeech/prepare.py

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the MIT-style license found in the LICENSE file in the root directory of this source tree. ---------- Script to package original Librispeech datasets into a form readable in wav2letter++ pipelines Command : python3 prepare.py --dst [...] Replace [...] with appropriate path """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import os from multiprocessing import Pool import numpy from tqdm import tqdm from utils import find_transcript_files, transcript_to_list LOG_STR = " To regenerate this file, please, remove it." if __name__ == "__main__": parser = argparse.ArgumentParser(description="Librispeech Dataset creation.") parser.add_argument( "--dst", help="destination directory where to store data", default="./librispeech", ) parser.add_argument( "-p", "--process", help="number of process for multiprocessing", default=8, type=int, ) args = parser.parse_args() subpaths = { "train": ["train-clean-100", "train-clean-360", "train-other-500"], "dev": ["dev-clean", "dev-other"], "test": ["test-clean", "test-other"], } subpath_names = numpy.concatenate(list(subpaths.values())) audio_path = os.path.join(args.dst, "audio") text_path = os.path.join(args.dst, "text") lists_path = os.path.join(args.dst, "lists") os.makedirs(audio_path, exist_ok=True) os.makedirs(text_path, exist_ok=True) os.makedirs(lists_path, exist_ok=True) audio_http = "http://www.openslr.org/resources/12/" text_http = "http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz" # Download the audio data print("Downloading the Librispeech data.", flush=True) for pname in subpath_names: if not os.path.exists(os.path.join(audio_path, "LibriSpeech", pname)): print("Downloading and unpacking {}...".format(pname)) cmd = """wget -c {http}{name}.tar.gz -P {path}; yes n 2>/dev/null | gunzip {path}/{name}.tar.gz; tar -C {path} -xf {path}/{name}.tar""" os.system(cmd.format(path=audio_path, http=audio_http, name=pname)) else: log_str = "{} part of data exists, skip its downloading and unpacking" print(log_str.format(pname) + LOG_STR, flush=True) # Downloading text data for language model training if not os.path.exists(os.path.join(text_path, "librispeech-lm-norm.txt")): print("Downloading and unpacking text data...") cmd = """wget -c {http} -P {path}; yes n 2>/dev/null | gunzip {path}/librispeech-lm-norm.txt.gz""" os.system(cmd.format(http=text_http, path=text_path)) else: print("Text data exists, skip its downloading." + LOG_STR, flush=True) # Prepare the audio data print("Converting audio data into necessary format.", flush=True) word_dict = {} for subpath_type in subpaths.keys(): word_dict[subpath_type] = set() for subpath in subpaths[subpath_type]: src = os.path.join(audio_path, "LibriSpeech", subpath) assert os.path.exists(src), "Unable to find the directory - '{src}'".format( src=src ) dst_list = os.path.join(lists_path, subpath + ".lst") if os.path.exists(dst_list): print( "Path {} exists, skip its generation.".format(dst_list) + LOG_STR, flush=True, ) continue print("Analyzing {src}...".format(src=src), flush=True) transcript_files = find_transcript_files(src) transcript_files.sort() print("Writing to {dst}...".format(dst=dst_list), flush=True) with Pool(args.process) as p: samples = list( tqdm( p.imap(transcript_to_list, transcript_files), total=len(transcript_files), ) ) with open(dst_list, "w") as fout: for sp in samples: for s in sp: word_dict[subpath_type].update(s[-1].split(" ")) s[0] = subpath + "-" + s[0] fout.write(" ".join(s) + "\n") # Prepare text data current_path = os.path.join(text_path, "librispeech-lm-norm.txt.lower.shuffle") if not os.path.exists(current_path): print("Prepare text data in the necessary format.", flush=True) numpy.random.seed(42) text_data = [] with open(os.path.join(text_path, "librispeech-lm-norm.txt"), "r") as f_text: for line in f_text: line = line.strip().lower() if line != "": text_data.append(line) indices = numpy.random.permutation(numpy.arange(len(text_data))) with open( os.path.join(text_path, "librispeech-lm-norm.txt.lower.shuffle"), "w" ) as f: for index in indices: f.write(text_data[index] + "\n") else: print( "Path {} exists, skip its generation.".format(current_path) + LOG_STR, flush=True, ) for pname in subpath_names: current_path = os.path.join(text_path, pname + ".txt") if not os.path.exists(current_path): with open(os.path.join(lists_path, pname + ".lst"), "r") as flist, open( os.path.join(text_path, pname + ".txt"), "w" ) as fout: for line in flist: fout.write(" ".join(line.strip().split(" ")[3:]) + "\n") else: print( "Path {} exists, skip its generation.".format(current_path) + LOG_STR, flush=True, ) print("Done!", flush=True)

data/librispeech/prepare.py (120 lines of code) (raw):