in recipes/data/switchboard/utils.py [0:0]
def process_hub5_data(sample_data):
    """Extract one Hub5 transcript segment to FLAC and build its list entry.

    Args:
        sample_data: 5-tuple ``(line, idx, hub5_sdir, hub5_audio_path, sph2pipe)``.
            ``line`` is one whitespace-separated transcript line whose
            field 0 is the recording id, field 1 the channel (``"A"`` or
            other), fields 3 and 4 the segment start/end times and fields
            6+ the transcript words. ``idx`` is an integer used to name the
            output file. ``hub5_sdir`` is the directory containing
            ``english/<recording id>.sph``. ``hub5_audio_path`` is the
            directory receiving the FLAC output (and a per-process temporary
            WAV). ``sph2pipe`` is the path to the sph2pipe executable.

    Returns:
        ``None`` for empty/blank lines, lines starting with ``;;`` and lines
        containing ``IGNORE_TIME_SEGMENT_``. Otherwise a tab-separated string
        with the utterance id, the output FLAC path, the segment duration
        (``(end - start) * 1000`` with two decimals) and the lower-cased
        transcript.

    Raises:
        subprocess.CalledProcessError: sph2pipe exited with a non-zero status.
        OSError: the sph2pipe executable could not be started.
        AssertionError: the converted audio has a non-positive duration.
    """
    # Local import so the block does not depend on the file's import header.
    import subprocess

    line, idx, hub5_sdir, hub5_audio_path, sph2pipe = sample_data
    if (not line) or line.startswith(";;") or ("IGNORE_TIME_SEGMENT_" in line):
        return None

    parts = line.strip().split()
    if not parts:
        # Whitespace-only line: nothing to process (would otherwise IndexError).
        return None

    transcript = " ".join(parts[6:])
    for old, new in (("((", "("), ("<B_ASIDE>", ""), ("<A_ASIDE>", "")):
        transcript = transcript.replace(old, new)

    spk = "{}-{}".format(parts[0], parts[1])
    start = float(parts[3])
    end = float(parts[4])
    # Times are encoded in the id as zero-padded hundredths (truncated).
    utt = "{u}_{s:06d}-{e:06d}".format(
        u=spk, s=int(start * 100), e=int(end * 100)
    )

    in_file = os.path.join(hub5_sdir, "english", parts[0] + ".sph")
    out_file = os.path.join(hub5_audio_path, "{:09d}.flac".format(idx))
    # The pid keeps concurrent worker processes from sharing a temp file.
    tmp_file = os.path.join(hub5_audio_path, "{pid}_tmp.wav".format(pid=os.getpid()))
    channel = "1" if parts[1] == "A" else "2"

    try:
        # Argument list (no shell): paths containing spaces or shell
        # metacharacters are passed through verbatim, and a failing
        # conversion is reported instead of being silently ignored.
        subprocess.check_call(
            [sph2pipe, "-f", "wav", "-c", channel, in_file, tmp_file]
        )
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # the exception type is kept for backward compatibility.
        if not sox.file_info.duration(tmp_file) > 0:
            raise AssertionError(
                "Audio file {} duration is zero.".format(in_file)
            )
        sox_tfm = sox.Transformer()
        sox_tfm.set_output_format(
            file_type="flac", encoding="signed-integer", bits=16
        )
        sox_tfm.trim(start, end)
        sox_tfm.build(tmp_file, out_file)
    finally:
        # Always drop the temp WAV so a later call in this process can never
        # pick up stale audio left behind by a failed one.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    duration = (end - start) * 1000.0
    return "\t".join([utt, out_file, "{0:.2f}".format(duration), transcript.lower()])