data/utils/kaldi_to_listfile.py (97 lines of code) (raw):
"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.
This source code is licensed under the MIT-style license found in the
LICENSE file in the root directory of this source tree.
----------
Script to package kaldi data directory into a form readable in
wav2letter++ pipelines
Command : python3 kaldi_to_listfile.py --src [...] --dst [...]
Replace [...] with appropriate path
`src` directory is the path to kaldi data directory typically
prepared with `prepare_data.sh` script.
`dst` directory is the path to store (segmented) audio files and the
list file that is used by wav2letter++ pipelines to load data.
"""
from __future__ import (
absolute_import,
division,
print_function,
unicode_literals,
)
import argparse
import os
import re
from multiprocessing import Pool
import sox
from tqdm import tqdm
def run_segment(item):
    """Cut one segment out of a recording and save it as a FLAC file.

    Args:
        item: ``(uid, (infile, start_sec, end_sec, outfile))`` pair, i.e. one
            entry of the ``segments`` mapping built by this script. The
            utterance id is not needed here; only the tuple is used.

    The output is written by sox as 16-bit signed-integer FLAC, trimmed to
    the ``[start_sec, end_sec]`` window of ``infile``.
    """
    _, (source_path, begin, finish, target_path) = item

    transformer = sox.Transformer()
    transformer.set_output_format(
        file_type="flac",
        encoding="signed-integer",
        bits=16,
    )
    # Keep only the requested time window of the source recording.
    transformer.trim(begin, finish)
    transformer.build(source_path, target_path)
def run_command(cmd):
    """Run one shell command and return its exit status.

    Kept at module level so that ``multiprocessing`` can pickle it by
    reference under every start method (``spawn`` re-imports this module
    without executing the ``__main__`` section).

    NOTE: ``cmd`` is taken from ``wav.scp`` and executed through the shell;
    only run this script on Kaldi data directories you trust.
    """
    return os.system(cmd)


def parse_wav_scp(lines, tmp_dir="/tmp"):
    """Parse the lines of a Kaldi ``wav.scp`` file.

    Handles two possible cases:
      Case 1: ID followed by wav file
        Ex: S03_U01.CH1 /path/S03_U01.CH1.wav
      Case 2: ID followed by sox script
        Ex: P09_S03.L sox /path/S03_P09.wav -t wav - remix 1 |

    In case 2 the pipeline is rewritten so that it writes to a wav file in
    ``tmp_dir`` instead of stdout (`` - `` is replaced by the file path and
    the trailing ``|`` is dropped).

    Args:
        lines: iterable of ``wav.scp`` lines; blank lines are skipped.
        tmp_dir: directory that receives the wav files produced by the
            rewritten sox commands.

    Returns:
        ``(wav_files, cmds)``: ``wav_files`` maps each recording id to the
        wav path to read; ``cmds`` lists the shell commands that must be run
        to create the temporary wav files, one per distinct sox pipeline.
    """
    wav_files = {}
    cache = {}
    cmds = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Split on any whitespace so tab- or multi-space-separated files work.
        wid, wav_handle = line.split(None, 1)
        if wav_handle in cache:
            wav_file = cache[wav_handle]
        elif wav_handle.startswith("sox"):
            hsh = re.sub("[^0-9a-zA-Z]+", "", wav_handle)
            wav_file = "{}/{}.wav".format(tmp_dir, hsh)
            cmds.append(
                wav_handle.replace(" - ", " " + wav_file + " ").replace(
                    "|", ""
                )
            )
            # Remember the pipeline so an identical one is not scheduled
            # again (duplicates would race on the same output file).
            cache[wav_handle] = wav_file
        else:
            wav_file = wav_handle
        wav_files[wid] = wav_file
    return wav_files, cmds


def parse_transcripts(lines):
    """Parse the lines of a Kaldi ``text`` file.

    Args:
        lines: iterable of ``<utterance-id> <word> <word> ...`` lines; blank
            lines are skipped.

    Returns:
        Dict mapping utterance id to its transcript, with words joined by a
        single space (empty string when the line has no words).
    """
    transcripts = {}
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        transcripts[fields[0]] = " ".join(fields[1:])
    return transcripts


def parse_segments(lines, wav_files, audio_dir):
    """Parse the lines of a Kaldi ``segments`` file.

    Args:
        lines: iterable of ``<utterance-id> <recording-id> <start> <end>``
            lines; blank lines are skipped.
        wav_files: mapping from recording id to wav path, as returned by
            ``parse_wav_scp``.
        audio_dir: directory in which the segmented ``.flac`` files will be
            placed.

    Returns:
        Dict mapping utterance id to
        ``(wav_path, start_sec, end_sec, outfile)``.

    Raises:
        KeyError: if a segment refers to a recording id not in ``wav_files``.
        ValueError: if a line does not have exactly four fields or the
            times are not numbers.
    """
    segments = {}
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        uid, wid, start_sec, end_sec = fields
        outfile = f"{audio_dir}/{uid}.flac"
        segments[uid] = (
            wav_files[wid],
            float(start_sec),
            float(end_sec),
            outfile,
        )
    return segments


def main():
    """Convert a Kaldi data directory into audio segments plus a list file."""
    parser = argparse.ArgumentParser(
        description="Script to generate list file from Kaldi data dir"
    )
    parser.add_argument(
        "--src",
        required=True,
        help="input kaldi data directory. Must contain "
        "'text', 'segments' and 'wav.scp' files",
    )
    parser.add_argument(
        "--dst",
        required=True,
        help="destination directory where to store data",
    )
    parser.add_argument(
        "--name", help="name of the output list file", default="data.lst"
    )
    parser.add_argument(
        "-p",
        "--process",
        help="number of process for multiprocessing",
        default=8,
        type=int,
    )
    args = parser.parse_args()

    audio_dir = f"{args.dst}/audio"

    # Parse every input file first so that malformed input is reported
    # before any (potentially long-running) audio processing starts.
    with open(f"{args.src}/wav.scp") as f:
        wav_files, cmds = parse_wav_scp(f)
    print("Found {} wav files".format(len(wav_files)))

    with open(f"{args.src}/text") as f:
        transcripts = parse_transcripts(f)
    print("Found {} transcripts".format(len(transcripts)))

    with open(f"{args.src}/segments") as f:
        segments = parse_segments(f, wav_files, audio_dir)
    print("Found {} segments".format(len(segments)))

    # Creates args.dst as well, since it is the parent of audio_dir.
    os.makedirs(audio_dir, exist_ok=True)

    # The context manager guarantees the worker processes are cleaned up.
    with Pool(args.process) as pool:
        print("Running {} wav commands ...".format(len(cmds)))
        statuses = list(tqdm(pool.imap(run_command, cmds), total=len(cmds),))
        failed = sum(1 for status in statuses if status != 0)
        if failed:
            # Best effort, as before: report but do not abort.
            print(
                "WARNING: {} of {} wav commands exited with a non-zero "
                "status".format(failed, len(cmds))
            )

        print("Creating segmented audio files ...")
        list(
            tqdm(
                pool.imap(run_segment, segments.items()),
                total=len(segments),
            )
        )

    print("Writing to list file ...")
    with open(f"{args.dst}/{args.name}", "w") as fo:
        for uid, val in segments.items():
            _, start_sec, end_sec, outfile = val
            # Duration is written in milliseconds.
            duration = "{:.2f}".format((end_sec - start_sec) * 1000)
            fo.write(
                "\t".join([uid, outfile, duration, transcripts[uid]]) + "\n"
            )
    print("Done!")


if __name__ == "__main__":
    main()