def create_transcript_set()

in recipes/self_training/pseudo_labeling/generate_synthetic_data.py [0:0]


def _text_after_marker(line, marker):
    """Return the part of ``line`` that follows the first ``marker``.

    When ``marker`` does not occur, ``line`` is returned unchanged. (Slicing
    at ``line.find(marker) + len(marker)`` instead would silently drop the
    leading characters of the line, because ``find`` returns -1 on a miss.)
    """
    _, found, remainder = line.partition(marker)
    return remainder if found else line


def create_transcript_set(inpath, viterbi=False, distributed_decoding=False):
    """Parse a transcript log file into a list of ``TranscriptPrediction``.

    The file is read as a sequence of five-line records:

    1. actual transcript, prefixed with ``|T|: ``
    2. predicted transcript, prefixed with ``|P|: ``
    3. actual word pieces (ignored)
    4. predicted word pieces (ignored)
    5. sample info line; the sample id is its second space-separated
       token with the final character dropped (presumably a trailing
       delimiter such as a comma - confirm against the log producer)

    A record may be preceded by one extra line containing ``[WARNING]``;
    that line is consumed and the record is flagged as having a warning.
    Records whose sample info line is blank or missing are skipped.

    Lines are not stripped, so the transcript and prediction texts keep
    their trailing newline.

    Args:
        inpath: Path of the log file to read.
        viterbi: If true, remove every space from both texts and then turn
            each underscore into a space.
        distributed_decoding: If false, the first line of the file is
            treated as a job header and discarded before parsing records.

    Returns:
        A list with one ``TranscriptPrediction(sid, predicted, transcript,
        warning)`` per parsed record, in file order.

    Raises:
        IndexError: If a non-blank sample info line contains no space.
    """
    predictions = []
    with open(inpath, "r") as f:
        if not distributed_decoding:
            # The first line is the chronos job header, not part of a record.
            f.readline()

        while True:
            transcript = f.readline()
            if not transcript:  # readline() returns "" only at EOF
                break

            # Records are fixed-size unless a warning line precedes them;
            # read one extra line to get back in step.
            warning = "[WARNING]" in transcript
            if warning:
                transcript = f.readline()

            transcript = _text_after_marker(transcript, "|T|: ")
            predicted = _text_after_marker(f.readline(), "|P|: ")

            if viterbi:
                predicted = predicted.replace(" ", "").replace("_", " ")
                transcript = transcript.replace(" ", "").replace("_", " ")

            # Skip the two word-piece lines (actual, then predicted).
            f.readline()
            f.readline()

            sample_info = f.readline()
            if not sample_info.strip():
                continue

            sid = sample_info.split(" ")[1][:-1]
            predictions.append(
                TranscriptPrediction(sid, predicted, transcript, warning)
            )

    return predictions