in data/wsj/utils.py [0:0]
def ndx_to_samples(prefix, filename, transcripts, transform=None, sep="-"):
    """Turn an ``.ndx`` index file into a list of sample records sorted by id.

    Each non-blank line that does not start with ``;`` is expected to look
    like ``<p1>_<p2>_<p3>:<path>``, where ``<path>`` (after stripping leading
    spaces and slashes) ends in ``<subset>/<dir>/<sample_id>.wv1``. The audio
    file is looked up at ``<prefix>/<p1><sep><p2>.<p3>/<path>``.

    Args:
        prefix: Root directory containing both the index file and the
            per-disc audio directories.
        filename: Path of the index file, relative to ``prefix``.
        transcripts: Nested mapping ``{subset: {sample_id: transcript}}``.
        transform: Optional callable applied to every stripped, non-comment
            line before parsing. Returning ``None`` drops the line.
        sep: Separator placed between ``p1`` and ``p2`` in the disc
            directory name.

    Returns:
        A list of dicts with the keys ``"id"``, ``"filename"``, ``"subset"``,
        ``"transcript"`` and ``"basename"``, sorted by ``"id"``.

    Raises:
        AssertionError: If a referenced audio file does not exist, or the
            subset / sample id is missing from ``transcripts``.
        ValueError: If a line does not split into the expected number of
            fields.
    """
    samples_list = []
    with open(os.path.join(prefix, filename), "r") as f:
        for line in f:
            line = line.strip()
            # Blank lines and ";" comment lines carry no sample.
            if not line or line.startswith(";"):
                continue
            if transform is not None:
                line = transform(line)
            # The transform may veto a line by returning None.
            if line is None:
                continue

            pre, suf = line.split(":")
            p1, p2, p3 = pre.split("_")
            # Some index entries are written as ": /path"; normalise them so
            # the path stays relative when joined below.
            suf = suf.lstrip(" /")
            # Only the subset and the sample id are needed from the path.
            _, subset, _, sample_id = suf.replace(".wv1", "").rsplit("/", 3)

            # Path relative to ``prefix``; computed once and reused for both
            # the absolute filename and the "basename" field.
            basename = os.path.join("{}{}{}.{}".format(p1, sep, p2, p3), suf)
            fname = os.path.join(prefix, basename)

            # Raised explicitly (rather than via ``assert``) so the checks are
            # not stripped when Python runs with -O; the exception type and
            # messages are the same as before.
            if not os.path.exists(fname):
                raise AssertionError("Audio file {} doesn't exist".format(fname))
            if subset not in transcripts:
                raise AssertionError(
                    "Subset {} is absent in the transcription".format(subset)
                )
            if sample_id not in transcripts[subset]:
                raise AssertionError(
                    "Id {} is absent in the subset {} of transcription for file {}".format(
                        sample_id, subset, fname
                    )
                )

            samples_list.append(
                {
                    "id": sample_id,
                    "filename": fname,
                    "subset": subset,
                    "transcript": transcripts[subset][sample_id],
                    "basename": basename,
                }
            )
    samples_list.sort(key=lambda x: x["id"])
    return samples_list