data/wsj/utils.py (110 lines of code) (raw):
"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.
This source code is licensed under the MIT-style license found in the
LICENSE file in the root directory of this source tree.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import re
import sox
# Substitutions applied first, in this order: (regex, replacement).
_MARKUP_RULES = (
    (r"^~$", ""),
    (r"^~~$", ""),
    (r"\\", ""),
    (r"^\[<\S+\]$", ""),
    (r"^\[\S+>\]$", ""),
    (r"^\[\S+/\]$", ""),
    (r"^\[/\S+\]$", ""),
    (r"^\[\S+\]$", ""),  # NOISE
)

# Substitutions applied after the <...> and *...* handling, in this order.
_SPELLING_RULES = (
    (r"^%PERCENT$", "PERCENT"),
    (r"^\.POINT$", "POINT"),
    (r"`", "'"),  # typo
    (r"^\(IN\-PARENTHESIS$", "(IN-PARENTHESES"),  # misspelling
    (r"^Corp;$", "Corp"),  # misspelling
    (r"^\-\-DASH$", "-DASH"),  # misspelling
)


def preprocess_word(word):
    """Normalize a single transcript token and return it lowercased.

    The steps run in a fixed order, because later rules see the output of
    earlier ones:

    1. tokens that are exactly ``~`` / ``~~`` or entirely wrapped in
       ``[...]`` become empty, and every backslash is removed;
    2. a token wrapped in ``<...>`` loses the angle brackets, except the
       literal ``<NOISE>`` which is kept as is;
    3. a token starting with ``*...*`` loses all of its asterisks;
    4. a handful of known spellings are rewritten (``%PERCENT``,
       ``.POINT``, backtick to apostrophe, ``(IN-PARENTHESIS``, ``Corp;``,
       ``--DASH``);
    5. ``:`` and ``!`` are stripped unless the token is exactly ``:COLON``
       or ``!EXCLAMATION-POINT`` respectively;
    6. a token that is exactly ``.`` becomes empty.

    Args:
        word: a single token of a transcript.

    Returns:
        The normalized, lowercased token (possibly an empty string).
    """
    for pattern, replacement in _MARKUP_RULES:
        word = re.sub(pattern, replacement, word)

    if word != "<NOISE>" and re.match(r"^<\S+>$", word):
        word = word[1:-1]
    if re.match(r"^\*\S+\*", word):
        word = word.replace("*", "")

    for pattern, replacement in _SPELLING_RULES:
        word = re.sub(pattern, replacement, word)

    # Some emphasis stuff: drop the mark unless the token spells it out.
    for spelled_out, mark in ((":COLON", ":"), ("!EXCLAMATION-POINT", "!")):
        if word != spelled_out:
            word = word.replace(mark, "")

    return re.sub(r"^\.$", "", word).lower()
def find_transcripts(dst_paths):
    """Collect transcriptions from every ``.dot`` file under the given roots.

    Each directory in ``dst_paths`` is walked recursively. For every file
    whose name ends with ``.dot`` the subset name is taken from the third
    path component from the end (``<subset>/<dir>/<file>.dot``). Every line
    is split at its last space into the transcript text and a file id; the
    id has surrounding parentheses stripped.

    Lines that yield an empty transcript or an empty id are skipped. That
    covers blank lines and lines without a space separator.

    Args:
        dst_paths: iterable of directory paths to search.

    Returns:
        A dict mapping subset name to a dict of ``{file_id: transcript}``.
        A subset key is created as soon as one of its ``.dot`` files is
        found, even if the file contributes no entries.

    Raises:
        AssertionError: if the subset name is empty, or if the same file id
            appears in a subset with two different transcripts.
    """
    transcripts = dict()
    for ds_path in dst_paths:
        for dirpath, _, filenames in os.walk(ds_path):
            for filename in filenames:
                if not filename.endswith(".dot"):
                    continue
                full_path = os.path.join(dirpath, filename)
                subset = full_path.split(os.sep)[-3]
                assert subset, "Subset is empty"
                subset_transcripts = transcripts.setdefault(subset, dict())
                with open(full_path, "r") as f:
                    for line in f:
                        # rpartition never fails to unpack: with no space in
                        # the line the transcript part comes back empty and
                        # the guard below skips the line.
                        transcript, _sep, file_id = line.strip().rpartition(" ")
                        file_id = file_id.strip("()")
                        if not transcript or not file_id:
                            continue
                        if file_id in subset_transcripts:
                            assert (
                                subset_transcripts[file_id] == transcript
                            ), "different transcriptions available for {i}".format(
                                i=file_id
                            )
                        subset_transcripts[file_id] = transcript
    return transcripts
def ndx_to_samples(prefix, filename, transcripts, transform=None, sep="-"):
    """Turn an index file into a list of sample descriptions sorted by id.

    Every non-empty line of ``prefix/filename`` that does not start with
    ``;`` has the form ``<p1>_<p2>_<p3>:<relative path>``. The relative path
    has leading spaces and slashes stripped. With ``.wv1`` removed, its last
    three ``/``-separated components supply the subset name (third from the
    end) and the sample id (last). The audio file is expected at
    ``prefix/<p1><sep><p2>.<p3>/<relative path>``.

    Args:
        prefix: directory holding both the index file and the audio tree.
        filename: name of the index file, relative to ``prefix``.
        transcripts: mapping ``{subset: {sample_id: transcript}}``.
        transform: optional callable applied to each retained line; the
            line is skipped when the result is ``None``.
        sep: separator placed between ``p1`` and ``p2`` in the directory
            name.

    Returns:
        A list of dicts with the keys ``id``, ``filename``, ``subset``,
        ``transcript`` and ``basename``, ordered by ``id``.

    Raises:
        AssertionError: if the audio file does not exist, or if the subset
            or the sample id is missing from ``transcripts``.
    """
    collected = []
    with open(os.path.join(prefix, filename), "r") as index_file:
        for entry in index_file:
            entry = entry.strip()
            if entry == "" or entry.startswith(";"):
                continue
            if transform is not None:
                entry = transform(entry)
            if entry is None:
                continue

            disk_tag, rel_path = entry.split(":")
            p1, p2, p3 = disk_tag.split("_")
            rel_path = rel_path.lstrip(" /")
            _, subset, _, sample_id = rel_path.replace(".wv1", "").rsplit("/", 3)

            disk_dir = "{}{}{}.{}".format(p1, sep, p2, p3)
            fname = os.path.join(prefix, disk_dir, rel_path)

            assert os.path.exists(fname), "Audio file {} doesn't exist".format(fname)
            assert (
                subset in transcripts
            ), "Subset {} is absent in the transcription".format(subset)
            subset_transcripts = transcripts[subset]
            assert (
                sample_id in subset_transcripts
            ), "Id {} is absent in the subset {} of transcription for file {}".format(
                sample_id, subset, fname
            )

            collected.append(
                {
                    "id": sample_id,
                    "filename": fname,
                    "subset": subset,
                    "transcript": subset_transcripts[sample_id],
                    "basename": os.path.join(disk_dir, rel_path),
                }
            )

    return sorted(collected, key=lambda item: item["id"])
def convert_to_flac(sample_data):
    """Convert one sample's audio to FLAC and build its list entry.

    The source audio is first converted to a temporary WAV file by running
    ``<sph2pipe> -f wav <input> <tmp>`` and then re-encoded with sox as
    16-bit signed-integer FLAC at ``<dst>/<idx as %09d>.flac``. The
    conversion is skipped when that FLAC file already exists. The temporary
    WAV file is removed even when the conversion fails.

    Args:
        sample_data: a 4-item sequence ``(sample, idx, dst, sph2pipe)``:
            ``sample`` is a dict providing the keys ``"filename"``,
            ``"transcript"`` and ``"basename"``; ``idx`` is the integer used
            to name the output file; ``dst`` is the output directory;
            ``sph2pipe`` is the command used to invoke the converter.

    Returns:
        ``[basename, flac_path, duration, transcript]`` where ``duration``
        is the FLAC duration in milliseconds as a string and ``transcript``
        is the sample transcript with each word passed through
        ``preprocess_word`` and runs of spaces collapsed.

    Raises:
        AssertionError: if the intermediate WAV file has zero duration.
    """
    sample, idx, dst, sph2pipe = sample_data
    filename = sample["filename"]
    flac_path = os.path.join(dst, "%09d" % idx) + ".flac"

    # flac
    if not os.path.exists(flac_path):
        # The PID in the name keeps the scratch files of different processes
        # sharing `dst` apart.
        tmp_file = os.path.join(dst, "{pid}_tmp.wav".format(pid=os.getpid()))
        try:
            # NOTE(review): the paths are interpolated into a shell command
            # unquoted, so paths containing spaces or shell metacharacters
            # will not be handled correctly.
            os.system(
                "{sph} -f wav {i} {o}".format(sph=sph2pipe, i=filename, o=tmp_file)
            )
            assert (
                sox.file_info.duration(tmp_file) > 0
            ), "Audio file {} duration is zero.".format(filename)
            sox_tfm = sox.Transformer()
            sox_tfm.set_output_format(
                file_type="flac", encoding="signed-integer", bits=16
            )
            sox_tfm.build(tmp_file, flac_path)
        finally:
            # Clean up the scratch file on failure as well. The existence
            # check keeps a failed sph2pipe run from masking the original
            # error with FileNotFoundError.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    duration = sox.file_info.duration(flac_path) * 1000  # milliseconds
    transcript = " ".join(
        [preprocess_word(word) for word in sample["transcript"].split()]
    )
    transcript = re.sub(" +", " ", transcript).strip()
    return [sample["basename"], flac_path, str(duration), transcript]