recipes/data/switchboard/prepare.py (142 lines of code) (raw):

""" Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. ---------- Script to package Switchboard, Hub05 datasets into a form readable in wav2letter++ pipelines Command : python3 prepare.py [-h] [--src SRC] [--dst DST] [--hub5_sdir HUB5_SDIR] [--hub5_tdir HUB5_TDIR] [--sph2pipe SPH2PIPE] [-p PROCESS] Replace [...] with appropriate path """ from __future__ import absolute_import, division, print_function, unicode_literals import argparse import os import sys from multiprocessing import Pool import numpy from tqdm import tqdm from utils import process_hub5_data, process_swbd_data if __name__ == "__main__": parser = argparse.ArgumentParser(description="Switchboard Dataset creation.") parser.add_argument( "--src", help="path to directory containing Switchboard data - /path/to/LDC97S62,", ) parser.add_argument( "--dst", help="destination directory where to store data", default="./swbd" ) parser.add_argument( "--hub5_sdir", default=None, help="path to hub dataset containing speech data - /path/to/LDC2002S09/" "('<hub5_sdir>/english' must exist)", ) parser.add_argument( "--hub5_tdir", default=None, help="path to hub dataset containing transcript data " " - /path/to/LDC2002T43. ('<hub5_tdir>/reference' must exist)", ) parser.add_argument( "--sph2pipe", help="path to sph2pipe executable", default="./sph2pipe_v2.5/sph2pipe", ) parser.add_argument( "-p", "--process", help="number of process for multiprocessing", default=8, type=int, ) args = parser.parse_args() assert os.path.exists(args.sph2pipe), "sph2pipe not found '{d}'".format( d=args.sph2pipe ) audio_path = os.path.join(args.dst, "audio") os.makedirs(audio_path, exist_ok=True) text_path = os.path.join(args.dst, "text") os.makedirs(text_path, exist_ok=True) lists_path = os.path.join(args.dst, "lists") os.makedirs(lists_path, exist_ok=True) misc_path = os.path.join(args.dst, "misc") os.makedirs(misc_path, exist_ok=True) # hub dataset preparation if args.hub5_tdir and args.hub5_sdir: print("Preparing Hub'05 data ...", flush=True) hub5_audio_path = os.path.join(audio_path, "hub05") os.makedirs(hub5_audio_path, exist_ok=True) stm = os.path.join(args.hub5_tdir, "reference", "hub5e00.english.000405.stm") lines = [line.strip() for line in open(stm, "r")] n_samples = len(lines) with Pool(args.process) as p: processed_lines = list( tqdm( p.imap( process_hub5_data, zip( lines, numpy.arange(n_samples), [args.hub5_sdir] * n_samples, [hub5_audio_path] * n_samples, [args.sph2pipe] * n_samples, ), ), total=n_samples, ) ) with open(os.path.join(lists_path, "hub05-switchboard.lst"), "w") as sfile: sfile.write( "\n".join([l for l in processed_lines if l and l.startswith("sw")]) ) with open(os.path.join(lists_path, "hub05-callhome.lst"), "w") as cfile: cfile.write( "\n".join([l for l in processed_lines if l and l.startswith("en")]) ) else: print( "--hub5_tdir and/or --hub5_sdir is empty. Not preparing Hub'05 data.", flush=True, ) print("Preparing Switchboard data ...", flush=True) swbd_audio_path = os.path.join(audio_path, "switchboard") os.makedirs(swbd_audio_path, exist_ok=True) swbd_trans_path = os.path.join(misc_path, "swb_ms98_transcriptions") if not os.path.exists(swbd_trans_path): os.system( "wget -qO- http://www.openslr.org/resources/5/" "switchboard_word_alignments.tar.gz " "| tar xz -C {dir}".format(dir=misc_path) ) # load acronyms acronym_dict = {} with open(os.path.join(sys.path[0], "acronyms_swbd.map"), "r") as f: for line in f: a, b = line.strip().split("\t") acronym_dict[a] = b data = {} for dirpath, _, filenames in os.walk(swbd_trans_path): for filename in filenames: if filename.endswith("-trans.text"): id = filename[2:6] # Guaranteed to be id by swb manual if id not in data: data[id] = [id, None, None, None] channel = filename[6] if channel == "A": data[id][2] = os.path.join(dirpath, filename) if channel == "B": data[id][3] = os.path.join(dirpath, filename) for dirpath, _, filenames in os.walk(args.src): for filename in filenames: if filename.endswith(".sph"): id = filename.replace("sw0", "")[:4] assert id in data data[id][1] = os.path.join(dirpath, filename) n_samples = len(data) with Pool(args.process) as p: processed_lines = list( tqdm( p.imap( process_swbd_data, zip( data.values(), numpy.arange(n_samples), [swbd_audio_path] * n_samples, [args.sph2pipe] * n_samples, [acronym_dict] * n_samples, ), ), total=n_samples, ) ) processed_lines_flat = [item for sublist in processed_lines for item in sublist] with open(os.path.join(lists_path, "switchboard.lst"), "w") as sfile: sfile.write("\n".join([l for l in processed_lines_flat if l]))