def find_transcripts()

in data/wsj/utils.py [0:0]


def find_transcripts(dst_paths):
    """Collect transcripts from every ``.dot`` file found under ``dst_paths``.

    Each directory in ``dst_paths`` is walked recursively.  Every line of a
    ``.dot`` file is expected to look like ``<transcript> (<file_id>)``: the
    text after the last space is the file id (surrounding parentheses are
    stripped) and everything before it is the transcript.  Lines that do not
    provide both parts (blank lines, lines holding a single token) are
    skipped.

    Transcripts are grouped by "subset", which is the third-from-last
    component of the file's path, i.e. the name of the directory two levels
    above the ``.dot`` file.

    Args:
        dst_paths: Iterable of directory paths to search.

    Returns:
        A dict mapping subset name to a dict of ``{file_id: transcript}``.
        A subset whose ``.dot`` files contain no usable line maps to an
        empty dict.

    Raises:
        AssertionError: If the subset component of a path is empty, or if the
            same file id appears with two different transcripts within one
            subset.
        IndexError: If a ``.dot`` file's path has fewer than three components.
    """
    transcripts = dict()
    for ds_path in dst_paths:
        for dirpath, _, filenames in os.walk(ds_path):
            for filename in filenames:
                if not filename.endswith(".dot"):
                    continue
                full_path = os.path.join(dirpath, filename)
                # [-1] is the file name, [-2] its directory, [-3] the subset.
                subset = full_path.split(os.sep)[-3]
                # Raised explicitly (rather than ``assert``) so the check is
                # not stripped when Python runs with -O.
                if not subset:
                    raise AssertionError("Subset is empty")

                subset_transcripts = transcripts.setdefault(subset, dict())
                with open(full_path, "r") as f:
                    for line in f:
                        # rpartition always yields three parts, so blank or
                        # single-token lines end up with an empty transcript
                        # and are skipped below instead of failing to unpack.
                        transcript, _sep, file_id = line.strip().rpartition(" ")
                        file_id = file_id.strip("()")
                        if not transcript or not file_id:
                            continue

                        # Stores the transcript on first sight; otherwise
                        # returns the one recorded earlier for comparison.
                        previous = subset_transcripts.setdefault(
                            file_id, transcript
                        )
                        if previous != transcript:
                            raise AssertionError(
                                "different transcriptions available for {i}".format(
                                    i=file_id
                                )
                            )
    return transcripts