in recipes/self_training/pseudo_labeling/generate_synthetic_data.py [0:0]
def _str_to_bool(value):
    """Convert a command-line token into a bool.

    ``argparse`` with ``type=bool`` treats every non-empty string
    (including ``"False"``) as ``True``; this converter interprets the
    usual textual spellings instead.

    Args:
        value: Raw string taken from the command line.

    Returns:
        The boolean the string spells out. An empty string maps to
        ``False``, matching what ``bool("")`` used to yield.

    Raises:
        argparse.ArgumentTypeError: If the string is not a recognised
            boolean spelling.
    """
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got '{}'".format(value)
    )


def _build_arg_parser():
    """Build the command-line parser used by ``run``."""
    parser = argparse.ArgumentParser(
        description="Converts decoder output into train-ready list-style"
        " dataset formats"
    )
    parser.add_argument(
        "-i",
        "--input",
        type=str,
        required=True,
        help="Path to decoder output containing transcripts",
    )
    parser.add_argument(
        "-p",
        "--listpath",
        type=str,
        required=True,
        help="Path of existing list file dataset or which to replace transcripts",
    )
    parser.add_argument(
        "-w",
        "--warnings",
        action="store_true",
        help="Remove transcripts with EOS warnings by default",
    )
    parser.add_argument(
        "-g",
        "--ngram",
        action="store_true",
        help="Remove transcripts with ngram issues",
    )
    parser.add_argument(
        "-n",
        "--ngram_appearance_threshold",
        type=int,
        required=False,
        default=4,
        help="The number of identical n-grams that must appear in a "
        "prediction for it to be thrown out",
    )
    parser.add_argument(
        "-s",
        "--ngram_size",
        type=int,
        required=False,
        default=2,
        help="The size of n-gram which will be used when searching for duplicates",
    )
    parser.add_argument(
        "-f", "--filter", action="store_true", help="Run some filtering criteria"
    )
    parser.add_argument(
        "-o", "--output", type=str, required=True, help="Output filepath"
    )
    parser.add_argument(
        "-d",
        "--distributed_decoding",
        action="store_true",
        help="Processing a combined transcript with distributed decoding",
    )
    parser.add_argument(
        "-v",
        "--print_filtered_results",
        # type=bool would turn "False" into True, so parse the text
        # explicitly. nargs="?" + const=True keeps "-v True" / "-v False"
        # working while also accepting a bare "-v".
        type=_str_to_bool,
        nargs="?",
        const=True,
        required=False,
        default=False,
        help="Print transcripts that are filtered based on filter criteria to stderr",
    )
    parser.add_argument(
        "-q",
        "--viterbi",
        action="store_true",
        help="Expects a transcript format that is consistent with a Viterbi run",
    )
    return parser


def run():
    """Command-line entry point: turn decoder output into a list-style file.

    Parses ``sys.argv``, checks that the ``--input`` and ``--listpath``
    files exist, then chains the module's helpers:
    ``create_transcript_set`` -> ``filter_transcripts`` ->
    ``pair_transcripts_with_existing_list`` ->
    ``write_transcript_list_to_file`` (writing to ``--output``).

    Raises:
        FileNotFoundError: If the ``--input`` or ``--listpath`` path is not
            an existing file. (A subclass of ``Exception``, so handlers
            written against the previous bare ``Exception`` still match.)
    """
    args = _build_arg_parser().parse_args()

    if not os.path.isfile(args.input):
        raise FileNotFoundError("'" + args.input + "' - input file doesn't exist")
    if not os.path.isfile(args.listpath):
        # Report the list path itself; the message previously named the
        # input path, pointing users at the wrong file.
        raise FileNotFoundError(
            "'" + args.listpath + "' - listpath file doesn't exist"
        )

    transcripts_predictions = create_transcript_set(
        args.input, args.viterbi, args.distributed_decoding
    )
    # The whole namespace is handed over; filter_transcripts picks the
    # options it needs from it.
    filtered_transcripts = filter_transcripts(transcripts_predictions, args)
    final_transcript_dict = pair_transcripts_with_existing_list(
        filtered_transcripts, args.listpath
    )
    write_transcript_list_to_file(final_transcript_dict, args.output)