in recipes/self_training/pseudo_labeling/generate_synthetic_data.py [0:0]
def _has_overrepeated_ngram(prediction, args):
    """Return True if some n-gram of ``prediction`` appears too many times.

    The prediction is split on single spaces, n-grams of size
    ``args.ngram_size`` are obtained from ``compute_ngrams`` and re-joined
    with spaces, and each one is counted inside the full prediction string.
    An n-gram "appears too many times" when its count is strictly greater
    than ``args.ngram_appearance_threshold``.
    """
    tokens = prediction.split(" ")
    # dict.fromkeys drops duplicate n-grams while keeping first-seen order.
    # A repeated n-gram always yields the same count, so re-scanning the
    # prediction for it is pure wasted work -- and repetitive predictions
    # (the ones this check targets) produce the most duplicates.
    unique_grams = dict.fromkeys(
        " ".join(gram) for gram in compute_ngrams(tokens, args.ngram_size)
    )
    # NOTE(review): str.count counts non-overlapping raw substring matches,
    # so a gram may also match across word boundaries (e.g. "he is" inside
    # "the island"). Kept as-is so existing filtering results do not change.
    return any(
        prediction.count(gram) > args.ngram_appearance_threshold
        for gram in unique_grams
    )


def filter_transcripts(transcript_list, args):
    """Drop predicted transcripts that fail the enabled quality checks.

    Args:
        transcript_list: iterable of transcript objects exposing the
            attributes ``warning``, ``sid`` and ``prediction`` (``sid`` and
            ``prediction`` are concatenated with strings when reporting).
        args: options object. Attributes read here:
            ``filter`` -- master switch; when falsy nothing is filtered.
            ``warnings`` -- drop transcripts whose ``warning`` is truthy.
            ``ngram`` -- drop transcripts containing an n-gram (of size
            ``ngram_size``) that occurs more than
            ``ngram_appearance_threshold`` times in the prediction.
            ``print_filtered_results`` -- report each dropped transcript
            through ``eprint``.

    Returns:
        ``transcript_list`` itself when ``args.filter`` is falsy; otherwise a
        new list holding, in their original order, the transcripts that
        passed every enabled check.
    """
    # Fast path: filtering is disabled, hand the input back untouched.
    if not args.filter:
        return transcript_list

    filtered_transcripts = []
    for transcript in transcript_list:
        # The warning check runs first; a transcript rejected by it is never
        # submitted to the (more expensive) n-gram check.
        if args.warnings and transcript.warning:
            message_prefix = "Filtering predicted transcript (warning) "
        elif args.ngram and _has_overrepeated_ngram(transcript.prediction, args):
            message_prefix = "Filtering predicted transcript (ngram fail) "
        else:
            # Passed every enabled check.
            filtered_transcripts.append(transcript)
            continue

        if args.print_filtered_results:
            eprint(message_prefix + transcript.sid + ": " + transcript.prediction)
    return filtered_transcripts