recipes/data/fisher/prepare.py (61 lines of code) (raw):
"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.
This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree.
----------
Script to package original Fisher datasets into a form readable in
wav2letter++ pipelines
Command : python3 prepare.py --src [...] --dst [...]
Replace [...] with appropriate paths (--src takes a comma-separated list of
the directories containing the Fisher data)
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import os
from multiprocessing import Pool
import numpy
from tqdm import tqdm
from utils import find_files, process_fisher_data
# Number of entries `find_files` must return for the Fisher data to be
# accepted (the count of .sph and transcript files checked below).
EXPECTED_NUM_FILES = 11699


def main(argv=None):
    """Run the Fisher packaging pipeline.

    Parses the command line, locates the input files with ``find_files``,
    creates the ``audio``, ``text`` and ``lists`` directories under ``--dst``,
    runs ``process_fisher_data`` over every file in a process pool and writes
    the non-empty returned lines to ``<dst>/lists/fisher.lst``.

    Args:
        argv: optional list of command-line arguments. When None, argparse
            falls back to ``sys.argv[1:]``.

    Raises:
        RuntimeError: if the number of files found under ``--src`` differs
            from ``EXPECTED_NUM_FILES``.
    """
    parser = argparse.ArgumentParser(description="Fisher Dataset creation.")
    parser.add_argument(
        "--src",
        # There is no usable default for the source data, so fail early with a
        # usage message instead of handing None to find_files.
        required=True,
        help="comma-separated directories containing Fisher data -"
        "/path/to/LDC2004T19,/path/to/LDC2005T19,"
        "/path/to/LDC2004S13,/path/to/LDC2005S13",
    )
    parser.add_argument(
        "--dst", help="destination directory where to store data", default="./fisher"
    )
    parser.add_argument(
        "--sph2pipe",
        help="path to sph2pipe executable",
        default="./sph2pipe_v2.5/sph2pipe",
    )
    parser.add_argument(
        "-p",
        "--process",
        help="number of process for multiprocessing",
        default=8,
        type=int,
    )
    args = parser.parse_args(argv)

    files = find_files(args.src)
    n_samples = len(files)
    # Explicit check rather than `assert`: asserts are removed under
    # `python -O`, which would let an incomplete dataset through silently.
    if n_samples != EXPECTED_NUM_FILES:
        raise RuntimeError(
            "Expected to find {} .sph and transcript files in the Fisher "
            "data, found {}".format(EXPECTED_NUM_FILES, n_samples)
        )

    audio_path = os.path.join(args.dst, "audio")
    text_path = os.path.join(args.dst, "text")
    lists_path = os.path.join(args.dst, "lists")
    for directory in (audio_path, text_path, lists_path):
        os.makedirs(directory, exist_ok=True)

    # Each worker receives one tuple: (file entry, sample index, audio output
    # directory, sph2pipe path). imap preserves the input order of results.
    work_items = zip(
        files,
        numpy.arange(n_samples),
        [audio_path] * n_samples,
        [args.sph2pipe] * n_samples,
    )
    with Pool(args.process) as pool:
        processed_lines = list(
            tqdm(pool.imap(process_fisher_data, work_items), total=n_samples)
        )

    # Every worker returns a sequence of lines; flatten them and drop the
    # empty ones before writing the list file.
    list_entries = [
        line for file_lines in processed_lines for line in file_lines if line
    ]
    # Explicit encoding keeps the output independent of the platform locale.
    list_file = os.path.join(lists_path, "fisher.lst")
    with open(list_file, "w", encoding="utf-8") as ffile:
        ffile.write("\n".join(list_entries))


if __name__ == "__main__":
    main()