# data/ami/prepare.py
"""
Copyright (c) Facebook, Inc. and its affiliates.
This source code is licensed under the MIT-style license found in the
LICENSE file in the root directory of this source tree.
----------
Script to package the original AMI dataset into a form readable by
wav2letter++ pipelines.

Command: python3 prepare.py --dst [...]
Replace [...] with the appropriate path.
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import os
from multiprocessing import Pool

from tqdm import tqdm

from utils import split_audio, create_limited_sup
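
# `split_audio` and `create_limited_sup` come from the local utils module of
# this recipe. Judging only from how they are used below, `split_audio` is
# expected to cut one transcript segment out of a headset recording and return
# its metadata, and `create_limited_sup` to derive limited-supervision subsets
# from the generated list files.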

LOG_STR = " To regenerate this file, please remove it."

MIN_DURATION_MSEC = 50  # 50 msec
MAX_DURATION_MSEC = 30000  # 30 sec

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="AMI Dataset creation.")
    parser.add_argument(
        "--dst",
        help="destination directory where the data will be stored",
        default="./ami",
    )
    parser.add_argument(
        "-p",
        "--process",
        help="number of processes for multiprocessing",
        default=8,
        type=int,
    )
    args = parser.parse_args()
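
    # Example invocation (paths are illustrative):
    #   python3 prepare.py --dst /data/ami --process 16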

    splits = {"train": [], "dev": [], "test": []}

    audio_path = os.path.join(args.dst, "audio")
    text_path = os.path.join(args.dst, "text")
    lists_path = os.path.join(args.dst, "lists")
    os.makedirs(audio_path, exist_ok=True)
    os.makedirs(text_path, exist_ok=True)
    os.makedirs(lists_path, exist_ok=True)
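
    # The destination directory now has three subfolders:
    #   <dst>/audio  - raw headset recordings and, later, segmented clips
    #   <dst>/text   - AMI manual annotations and parsed transcripts
    #   <dst>/lists  - final train/dev/test list files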

    audio_http = "http://groups.inf.ed.ac.uk/ami"

    # Download the audio data
    print("Downloading the AMI audio data...", flush=True)
    cmds = []
    for split in splits.keys():
        with open(os.path.join("splits", f"split_{split}.orig")) as f:
            for line in f:
                line = line.strip()
                splits[split].append(line)
                cur_audio_path = os.path.join(audio_path, line)
                os.makedirs(cur_audio_path, exist_ok=True)
                # these three meetings have an extra (fifth) headset channel
                num_headsets = 5 if line in ["EN2001a", "EN2001d", "EN2001e"] else 4
                for headset_id in range(num_headsets):
                    cmds.append(
                        f"wget -nv --continue -o /dev/null -P {cur_audio_path} "
                        f"{audio_http}/AMICorpusMirror/amicorpus/{line}/audio/"
                        f"{line}.Headset-{headset_id}.wav"
                    )
    for cmd in tqdm(cmds):
        os.system(cmd)
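
    # Each meeting's headset recordings now live under
    #   <dst>/audio/<MEETING_ID>/<MEETING_ID>.Headset-<k>.wav
    # (wget --continue lets a re-run resume or skip files already downloaded).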
print("Downloading the text data ...", flush=True)
annotver = "ami_public_manual_1.6.1.zip"
cmd = f"wget -nv --continue -o /dev/null -P {text_path} {audio_http}/AMICorpusAnnotations/{annotver};"
cmd = cmd + f"mkdir -p {text_path}/annotations;"
cmd = cmd + f"unzip -q -o -d {text_path}/annotations {text_path}/{annotver} ;"
os.system(cmd)
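
    # The manual annotation XML files are now unpacked under
    # <dst>/text/annotations (the downloaded zip itself stays in <dst>/text).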
print("Parsing the transcripts ...", flush=True)
cmd = f"sh ami_xml2text.sh {text_path};"
os.system(cmd)
cmd = f"perl ami_split_segments.pl {text_path}/annotations/transcripts1 {text_path}/annotations/transcripts2 2>&1 > {text_path}/annotations/split_segments.log"
os.system(cmd)
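
    # ami_xml2text.sh and ami_split_segments.pl are expected in the current
    # directory alongside this script. Judging from the paths above, the shell
    # script extracts the annotation XML into annotations/transcripts1 and the
    # perl script splits it into the per-segment file annotations/transcripts2
    # consumed below; check split_segments.log if transcripts2 is missing.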

    # Prepare the audio data
    print("Segmenting audio files...", flush=True)
    with open(f"{text_path}/annotations/transcripts2") as f:
        lines = f.readlines()
    lines = [audio_path + " " + line for line in lines]
    os.makedirs(os.path.join(audio_path, "segments"), exist_ok=True)
    with Pool(args.process) as p:
        samples = list(
            tqdm(
                p.imap(split_audio, lines),
                total=len(lines),
            )
        )
    samples = [s for s in samples if s is not None]  # filter None values
    print(f"Wrote {len(samples)} audio segment samples")
print("Writing to list files...", flush=True)
for split, meetings in splits.items():
cur_samples = [s for s in samples if s[0] in meetings]
with open(os.path.join(lists_path, f"{split}.lst"), "w") as fout:
for sample in cur_samples:
if (
float(sample[3]) > MIN_DURATION_MSEC
and float(sample[3]) < MAX_DURATION_MSEC
):
fout.write("\t".join(sample[1:]) + "\n")
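
    # Each line of <dst>/lists/{train,dev,test}.lst is the tab-joined sample
    # fields (sample[1:]), keeping only segments between 50 msec and 30 sec.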
print("Preparing limited supervision subsets", flush=True)
create_limited_sup(lists_path)
print("Done!", flush=True)