def filter_transcripts()

in recipes/self_training/pseudo_labeling/generate_synthetic_data.py [0:0]


def filter_transcripts(transcript_list, args):
    """Return the transcripts that survive the filters enabled on ``args``.

    When ``args.filter`` is falsy the input list is returned as-is (the same
    object, not a copy). Otherwise a new list is built containing every
    transcript that is not rejected by an enabled check:

    * ``args.warnings`` -- reject transcripts whose ``warning`` is truthy.
    * ``args.ngram`` -- reject transcripts in which any n-gram of size
      ``args.ngram_size`` occurs more than
      ``args.ngram_appearance_threshold`` times in ``prediction``.

    If ``args.print_filtered_results`` is set, each rejection is reported
    through ``eprint`` together with the transcript's ``sid`` and
    ``prediction``.
    """
    # Filtering disabled: hand the caller back its own list.
    if not args.filter:
        return transcript_list

    kept = []
    for item in transcript_list:
        # Check 1: transcripts flagged with a warning are dropped outright.
        if args.warnings and item.warning:
            if args.print_filtered_results:
                eprint(
                    "".join(
                        (
                            "Filtering predicted transcript (warning) ",
                            item.sid,
                            ": ",
                            item.prediction,
                        )
                    )
                )
            continue

        # Check 2: transcripts with an over-repeated n-gram are dropped.
        if args.ngram:
            words = item.prediction.split(" ")
            grams = [
                " ".join(chunk) for chunk in compute_ngrams(words, args.ngram_size)
            ]
            # NOTE: str.count counts non-overlapping *substring* matches in
            # the raw prediction text, not word-aligned n-gram occurrences.
            too_repetitive = any(
                item.prediction.count(gram) > args.ngram_appearance_threshold
                for gram in grams
            )
            if too_repetitive:
                if args.print_filtered_results:
                    eprint(
                        "".join(
                            (
                                "Filtering predicted transcript (ngram fail) ",
                                item.sid,
                                ": ",
                                item.prediction,
                            )
                        )
                    )
                continue

        # Survived every enabled check.
        kept.append(item)

    return kept