in data/wsj/utils.py [0:0]
def find_transcripts(dst_paths):
    """Collect transcripts from every ``.dot`` file under the given directories.

    Each directory in ``dst_paths`` is walked recursively. Every file whose
    name ends in ``.dot`` is read line by line; a line is split at its last
    space into the transcript text (left part) and a file id (right part),
    and leading/trailing parentheses are stripped from the file id. Lines
    that are blank, contain no space separator, or yield an empty transcript
    or file id are skipped.

    The subset name is the third-from-last component of the file's full
    path, i.e. the directory two levels above the file
    (``.../<subset>/<dir>/<file>.dot``).

    Args:
        dst_paths: Iterable of directory paths to search.

    Returns:
        A dict mapping subset name -> dict mapping file id -> transcript.

    Raises:
        AssertionError: If a subset name is empty, or if the same file id
            occurs within one subset with two different transcripts.
        IndexError: If a ``.dot`` file's path has fewer than three
            components, so no subset can be derived from it.
    """
    transcripts = {}
    for ds_path in dst_paths:
        for dirpath, _, filenames in os.walk(ds_path):
            for filename in filenames:
                if not filename.endswith(".dot"):
                    continue
                full_path = os.path.join(dirpath, filename)
                subset = full_path.split(os.sep)[-3]
                # Raised explicitly (rather than via `assert`) so the check
                # is not stripped when Python runs with -O.
                if not subset:
                    raise AssertionError("Subset is empty")
                subset_transcripts = transcripts.setdefault(subset, {})
                with open(full_path, "r") as f:
                    for line in f:
                        # rpartition always returns three parts, so blank
                        # lines and lines without a space produce an empty
                        # transcript and are skipped below instead of
                        # failing to unpack.
                        transcript, _sep, file_id = line.strip().rpartition(" ")
                        file_id = file_id.strip("()")
                        if not transcript or not file_id:
                            continue
                        # Store on first sight; otherwise get back the
                        # previously stored transcript for comparison.
                        previous = subset_transcripts.setdefault(
                            file_id, transcript
                        )
                        if previous != transcript:
                            raise AssertionError(
                                "different transcriptions available for {i}".format(
                                    i=file_id
                                )
                            )
    return transcripts