in recipes/self_training/pseudo_labeling/generate_synthetic_data.py [0:0]
def create_transcript_set(inpath, viterbi=False, distributed_decoding=False):
    """Parse a decoding log into a list of ``TranscriptPrediction`` records.

    After an optional header line, the log is consumed in groups of lines:

    1. actual transcript, carrying a ``|T|: `` marker
    2. predicted transcript, carrying a ``|P|: `` marker
    3. actual word pieces (read and discarded)
    4. predicted word pieces (read and discarded)
    5. sample info line; its second space-separated token, minus its last
       character, is used as the sample ID

    A line containing ``[WARNING]`` where a group is expected to start is
    consumed, and the group that follows is recorded with ``warning=True``.

    Args:
        inpath: Path of the log file to parse.
        viterbi: If True, every space is removed from both transcripts and
            every ``_`` is then turned into a space.
        distributed_decoding: If False, the first line of the file is
            skipped as a job header; if True, the first line already
            belongs to the first group.

    Returns:
        A list of ``TranscriptPrediction(sid, predicted, transcript,
        warning)`` in file order. The transcript strings keep whatever
        trailing newline ``readline`` returned. Groups whose sample info
        line is blank are dropped.

    Raises:
        IndexError: If a non-blank sample info line contains no space.
    """
    predictions = []
    with open(inpath, "r") as f:
        if not distributed_decoding:
            # first line is the chronos job header, not part of any group
            f.readline()

        while True:
            transcript = f.readline()
            # readline() returns "" only at EOF (a blank line is "\n")
            if not transcript:
                break

            # a warning line pushes the group down by one line, so read one
            # more line to land on the actual transcript
            warning = "[WARNING]" in transcript
            if warning:
                transcript = f.readline()
            transcript = _text_after_marker(transcript, "|T|: ")

            predicted = _text_after_marker(f.readline(), "|P|: ")

            if viterbi:
                # order matters: drop the spaces first, then turn the "_"
                # separators into spaces
                predicted = predicted.replace(" ", "").replace("_", " ")
                transcript = transcript.replace(" ", "").replace("_", " ")

            # skip the two word-piece lines; they are never used here
            f.readline()
            f.readline()

            sample_info = f.readline()
            if not sample_info.strip():
                continue
            # second token is the sample ID plus one trailing character
            # (presumably a separator such as "," -- confirm against the
            # log format)
            sid = sample_info.split(" ")[1][:-1]
            predictions.append(
                TranscriptPrediction(sid, predicted, transcript, warning)
            )
    return predictions


def _text_after_marker(line, marker):
    """Return what follows the first ``marker`` in ``line``.

    ``line`` is returned unchanged when ``marker`` does not occur. Slicing at
    ``line.find(marker) + len(marker)`` would instead start at
    ``len(marker) - 1`` on a miss, because ``str.find`` returns -1, and
    silently cut off the beginning of the line.
    """
    _, found, remainder = line.partition(marker)
    return remainder if found else line