data/mls/prepare.py (37 lines of code) (raw):

"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.

This source code is licensed under the MIT-style license found in the
LICENSE file in the root directory of this source tree.

----------

Script to package original MLS dataset into a form readable in
wav2letter++ pipelines

Command : python3 prepare.py --indir [...] --outdir [...]

Replace [...] with appropriate path
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import os


# Dataset splits that are converted; each one is a sub-directory of --indir.
SPLITS = ("train", "dev", "test")


def _read_durations(segments_path):
    """Return a mapping from sample handle to its duration string.

    Each non-blank line of the segments file is split on whitespace. The
    first column is the handle; the third and fourth columns are parsed as
    floats and their difference, multiplied by 1000, is stored formatted
    with two decimals (presumably start/end times in seconds, giving a
    duration in milliseconds -- confirm against the dataset docs). The
    second column is ignored.
    """
    durations = {}
    with open(segments_path, encoding="utf-8") as segments_file:
        for line in segments_file:
            cols = line.split()
            if not cols:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            duration_ms = (float(cols[3]) - float(cols[2])) * 1000
            durations[cols[0]] = "{:.2f}".format(duration_ms)
    return durations


def _write_list(audio_path, transcripts_path, durations, list_out_path):
    """Write one list file from a transcripts file.

    Every output line has four tab-separated fields: handle, path to the
    ``.flac`` file, duration (looked up in ``durations``) and transcript.

    Raises:
        FileNotFoundError: if the audio file of a transcript is missing.
        KeyError: if a transcript handle has no entry in ``durations``.
        ValueError: if a line has no tab or the handle is not made of
            exactly three ``_``-separated parts.
    """
    # Open the input first so that a missing transcripts file does not
    # leave an empty list file behind.
    with open(transcripts_path, encoding="utf-8") as transcripts_file:
        with open(list_out_path, "w", encoding="utf-8") as list_file:
            for line in transcripts_file:
                line = line.rstrip("\n")
                if not line.strip():
                    continue
                # Split on the first tab only: the transcript itself may
                # contain tabs.
                handle, transcript = line.split("\t", 1)
                speaker, book, _ = handle.split("_")
                audio_file = os.path.join(
                    audio_path, speaker, book, f"{handle}.flac"
                )
                # Explicit check instead of ``assert`` so that it is not
                # stripped when running with ``python -O``.
                if not os.path.exists(audio_file):
                    raise FileNotFoundError(
                        f"Audio file not found for '{handle}': {audio_file}"
                    )
                if handle not in durations:
                    raise KeyError(
                        f"No segment duration found for '{handle}'"
                    )
                fields = (handle, audio_file, durations[handle], transcript)
                # Always terminate the entry, even when the last input
                # line had no trailing newline.
                list_file.write("\t".join(fields) + "\n")


def main(argv=None):
    """Build ``<outdir>/lists/<split>.lst`` for the train, dev and test splits.

    Args:
        argv: command-line arguments without the program name; ``None``
            makes argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="MLS Dataset preparation.")
    parser.add_argument(
        "--indir",
        required=True,
        help="input directory of downloaded MLS dataset of a given language",
    )
    parser.add_argument(
        "--outdir",
        required=True,
        help="destination directory where to store data",
    )
    args = parser.parse_args(argv)

    # makedirs creates the intermediate --outdir as well.
    lists_path = os.path.join(args.outdir, "lists")
    os.makedirs(lists_path, exist_ok=True)

    # Preparing the list file
    for split in SPLITS:
        split_path = os.path.join(args.indir, split)
        # read the segments file for audio durations
        durations = _read_durations(os.path.join(split_path, "segments.txt"))
        _write_list(
            os.path.join(split_path, "audio"),
            os.path.join(split_path, "transcripts.txt"),
            durations,
            os.path.join(lists_path, f"{split}.lst"),
        )

    print("Done!", flush=True)


if __name__ == "__main__":
    main()