def ndx_to_samples()

in data/wsj/utils.py [0:0]


def ndx_to_samples(prefix, filename, transcripts, transform=None, sep="-"):
    """Build a sorted list of sample records from an index (``.ndx``) file.

    Each non-empty line that does not start with ``;`` is expected to look
    like ``<p1>_<p2>_<p3>:<relative/path/to/audio.wv1>``. The part before the
    colon is turned into the directory name ``<p1><sep><p2>.<p3>``; the part
    after the colon (with leading spaces and slashes removed) is the path of
    the audio file below that directory. The last path component without
    ``.wv1`` is used as the sample id, and the component two levels above it
    as the subset name.

    Args:
        prefix: Root directory containing both the index file and the audio
            directories.
        filename: Path of the index file, relative to ``prefix``.
        transcripts: Nested mapping ``subset -> sample id -> transcript``.
        transform: Optional callable applied to every kept line before it is
            parsed. Returning ``None`` drops the line.
        sep: Separator placed between the first two ``_``-separated fields
            when building the directory name.

    Returns:
        A list of dicts with keys ``id``, ``filename``, ``subset``,
        ``transcript`` and ``basename`` (the audio path relative to
        ``prefix``), sorted by ``id``.

    Raises:
        AssertionError: If the audio file does not exist, or the subset or
            sample id is missing from ``transcripts``.
        ValueError: If a line does not split into the expected number of
            fields.
    """
    samples_list = []
    with open(os.path.join(prefix, filename), "r") as f:
        for line in f:
            line = line.strip()
            # Blank lines and lines starting with ";" carry no sample.
            if not line or line.startswith(";"):
                continue
            if transform is not None:
                line = transform(line)
            if line is None:
                continue

            pre, suf = line.split(":")
            p1, p2, p3 = pre.split("_")
            suf = suf.lstrip(" /")
            # Only the subset and the sample id are needed from the path; the
            # leading components and the directory in between are ignored.
            _, subset, _, sample_id = suf.replace(".wv1", "").rsplit("/", 3)

            basename = os.path.join("{}{}{}.{}".format(p1, sep, p2, p3), suf)
            fname = os.path.join(prefix, basename)

            # Explicit raises instead of ``assert`` statements: these checks
            # validate input data and must not vanish under ``python -O``.
            # AssertionError is kept so existing callers see the same type.
            if not os.path.exists(fname):
                raise AssertionError("Audio file {} doesn't exist".format(fname))
            if subset not in transcripts:
                raise AssertionError(
                    "Subset {} is absent in the transcription".format(subset)
                )
            if sample_id not in transcripts[subset]:
                raise AssertionError(
                    "Id {} is absent in the subset {} of transcription "
                    "for file {}".format(sample_id, subset, fname)
                )

            samples_list.append(
                {
                    "id": sample_id,
                    "filename": fname,
                    "subset": subset,
                    "transcript": transcripts[subset][sample_id],
                    "basename": basename,
                }
            )
    samples_list.sort(key=lambda x: x["id"])
    return samples_list