def process_fisher_data()

in recipes/data/fisher/utils.py [0:0]


# Ordered (old, new) substitutions applied to each lower-cased transcript.
_FISHER_TEXT_REPLACEMENTS = (
    # remove punctuation
    ("?", ""),
    (",", ""),
    # drop markers that carry no acoustic content
    ("[[skip]]", ""),
    ("[pause]", ""),
    # unify laughter spelling
    ("[laugh]", "[laughter]"),
    # collapse every other non-speech tag into a single noise token
    ("[sigh]", "[noise]"),
    ("[cough]", "[noise]"),
    ("[mn]", "[noise]"),
    ("[breath]", "[noise]"),
    ("[lipsmack]", "[noise]"),
    ("[sneeze]", "[noise]"),
)


def _normalize_fisher_text(text):
    """Lower-case *text*, apply the replacement table and squeeze whitespace."""
    text = text.lower()
    for old, new in _FISHER_TEXT_REPLACEMENTS:
        text = text.replace(old, new)
    return " ".join(text.split())


def process_fisher_data(sample_data):
    """Split one two-channel recording into per-utterance FLAC files.

    Each channel of the input audio is first written to a temporary WAV file
    by running ``<sph2pipe> -f wav -c <n> <input> <output>`` (``n`` is 1 for
    channel "A" and 2 for channel "B").  The transcript is then read line by
    line; every kept utterance is cut out of the matching channel file with
    ``sox`` and stored as ``<audio_path>/<audiofileid>/<index>.flac``.

    Args:
        sample_data: 4-tuple ``(files, _, audio_path, sph2pipe)`` where
            ``files`` is ``(sphfile, tfile)`` (input audio path, transcript
            path), the second element is ignored, ``audio_path`` is the
            output directory and ``sph2pipe`` is the path of the converter
            executable (passed as a single argv entry, not shell-expanded).

    Returns:
        A list of tab-separated strings, one per kept utterance:
        ``utterance_id``, output FLAC path, duration in milliseconds
        (two decimals) and the normalized transcript text.

    Raises:
        ValueError: if the first transcript line is not of the form
            ``# <name>.sph`` or a transcript line cannot be parsed.
        subprocess.CalledProcessError: if the converter exits with a
            non-zero status.
    """
    # Imported locally so the block does not depend on the file's import list.
    import subprocess

    files, _, audio_path, sph2pipe = sample_data
    sphfile, tfile = files

    # The process id keeps temp names distinct between concurrent workers
    # that share the same output directory.
    tmp_files = {
        channel: os.path.join(
            audio_path, "{pid}_tmp_{ch}.wav".format(pid=os.getpid(), ch=channel)
        )
        for channel in ("A", "B")
    }

    lines = []
    try:
        for channel_number, channel in enumerate(("A", "B"), start=1):
            # An argument list (no shell) keeps paths containing spaces or
            # shell metacharacters intact, and a failing conversion is
            # reported instead of being silently ignored.
            subprocess.check_call(
                [
                    sph2pipe,
                    "-f",
                    "wav",
                    "-c",
                    str(channel_number),
                    sphfile,
                    tmp_files[channel],
                ]
            )

        idx = 0
        with open(tfile, "r") as f:
            first_line = f.readline().strip()
            # Explicit raise: an ``assert`` would vanish under ``python -O``.
            if not (first_line.startswith("#") and first_line.endswith(".sph")):
                raise ValueError(
                    "unexpected transcript header in {}: {!r}".format(
                        tfile, first_line
                    )
                )
            audiofileid = first_line.replace("#", "").replace(".sph", "").strip()
            cur_audio_path = os.path.join(audio_path, audiofileid)
            os.makedirs(cur_audio_path, exist_ok=True)

            for line in f:
                # skip comment lines and blank lines
                if line.startswith("#") or not line.strip():
                    continue

                # "<start> <end> <channel>: <text>"
                tag, text = line.strip().split(":", 1)
                start, end, channel = tag.split()
                start = float(start)
                end = float(end)

                # ignore uncertain annotations
                if "((" in text:
                    continue

                # Times are encoded in the id as rounded hundredths of a
                # second, zero-padded to six digits.
                utt = "{a}-{c}-{s}-{e}".format(
                    a=audiofileid,
                    c=channel,
                    s="{:06d}".format(int(start * 100 + 0.5)),
                    e="{:06d}".format(int(end * 100 + 0.5)),
                )
                text = _normalize_fisher_text(text)

                # idx only advances for kept utterances, so output files are
                # numbered consecutively.
                out_file = os.path.join(cur_audio_path, "{:09d}.flac".format(idx))
                sox_tfm = sox.Transformer()
                sox_tfm.set_output_format(
                    file_type="flac", encoding="signed-integer", bits=16
                )
                sox_tfm.trim(start, end)
                sox_tfm.build(tmp_files[channel], out_file)
                duration = (end - start) * 1000.0
                idx += 1
                lines.append(
                    "\t".join([utt, out_file, "{0:.2f}".format(duration), text])
                )
    finally:
        # cleanup: always remove the per-channel temp files, even on error;
        # a file may be missing if its conversion never ran or failed.
        for tmp in tmp_files.values():
            try:
                os.remove(tmp)
            except FileNotFoundError:
                pass

    return lines