def process_swbd

def process_swbd_data()

in recipes/data/switchboard/utils.py [0:0]
49 lines of code
6 McCabe index (conditional complexity)

def process_swbd_data(sample_data):
    """Cut one Switchboard conversation into per-utterance FLAC files.

    Each of the two channels of the SPHERE file is converted to a temporary
    WAV with ``sph2pipe``. Every transcript line of that channel whose
    sanitized transcript is non-empty is then trimmed out of the WAV and
    written to ``<swbd_audio_path>/<id>/<running index>.flac``.

    Args:
        sample_data: 5-item sequence
            ``(data, _, swbd_audio_path, sph2pipe, acronym_dict)``:

            * ``data`` -- ``(id, sphfile, chA, chB)``: conversation id (used
              as the output sub-directory name), path of the SPHERE audio
              file, and paths of the channel A / channel B transcript files.
            * second item -- ignored.
            * ``swbd_audio_path`` -- root directory for the output audio and
              for the temporary WAV.
            * ``sph2pipe`` -- the ``sph2pipe`` executable. It is passed as a
              single program name/path, not as a shell command line.
            * ``acronym_dict`` -- forwarded unchanged to ``sanitize``.

    Returns:
        list[str]: one tab-separated record per written utterance:
        utterance id, FLAC path, duration formatted with two decimals
        (``(end - start) * 1000``, i.e. milliseconds if the transcript times
        are in seconds), and the lower-cased transcript.

    Raises:
        AssertionError: if the converted WAV does not have a positive
            duration.
    """
    # Local import so the block does not depend on the module's import list.
    import subprocess

    data, _, swbd_audio_path, sph2pipe, acronym_dict = sample_data
    conv_id, sphfile, chA, chB = data
    # The PID in the name keeps concurrent worker processes from clobbering
    # each other's temporary file.
    tmp_file = os.path.join(swbd_audio_path, "{pid}_tmp.wav".format(pid=os.getpid()))
    cur_audio_path = os.path.join(swbd_audio_path, conv_id)
    os.makedirs(cur_audio_path, exist_ok=True)
    idx = 0
    lines = []
    # sph2pipe's ``-c`` selects the channel: 1 for side A, 2 for side B.
    for sph_channel, transcript_path in ((1, chA), (2, chB)):
        try:
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact. As before, the exit status is not
            # enforced here; the duration check below rejects bad output.
            subprocess.run(
                [sph2pipe, "-f", "wav", "-c", str(sph_channel), sphfile, tmp_file],
                check=False,
            )
            # Raised explicitly rather than via ``assert`` so the check is not
            # stripped under ``python -O``; exception type and message are
            # unchanged.
            if not sox.file_info.duration(tmp_file) > 0:
                raise AssertionError(
                    "Audio file {} duration is zero.".format(sphfile)
                )
            with open(transcript_path, "r") as f:
                for line in f:
                    if not line.strip():
                        # A blank line carries no utterance.
                        continue
                    # Characters 0-5 hold the conversation name and
                    # character 6 the channel letter as written in the
                    # transcript itself.
                    name = line[0:6].replace("sw", "sw0")
                    line_channel = line[6]
                    # Fields: <utterance tag> <start> <end> <transcript...>
                    splits = line.strip().split(" ", 3)
                    start = float(splits[1])
                    end = float(splits[2])
                    transcript = sanitize(splits[3], acronym_dict)
                    if not transcript:
                        continue
                    # Times are encoded as zero-padded, rounded hundredths.
                    utt = "{n}-{c}_{s}-{e}".format(
                        n=name,
                        c=line_channel,
                        s="{:06d}".format(int(start * 100 + 0.5)),
                        e="{:06d}".format(int(end * 100 + 0.5)),
                    )
                    out_file = os.path.join(
                        cur_audio_path, "{:09d}.flac".format(idx)
                    )
                    sox_tfm = sox.Transformer()
                    sox_tfm.set_output_format(
                        file_type="flac", encoding="signed-integer", bits=16
                    )
                    sox_tfm.trim(start, end)
                    sox_tfm.build(tmp_file, out_file)
                    duration = (end - start) * 1000.0
                    idx = idx + 1
                    lines.append(
                        "\t".join(
                            [
                                utt,
                                out_file,
                                "{0:.2f}".format(duration),
                                transcript.lower(),
                            ]
                        )
                    )
        finally:
            # Always drop the temporary WAV, including when conversion or
            # segmentation failed part-way through.
            try:
                os.remove(tmp_file)
            except FileNotFoundError:
                pass
    return lines