def run()

in recipes/self_training/pseudo_labeling/generate_synthetic_data.py [0:0]


def _str_to_bool(value):
    """Convert a command-line string into a real boolean for argparse.

    ``type=bool`` is a trap with argparse: it calls ``bool()`` on the raw
    string, so any non-empty value (including ``"False"``) becomes ``True``.
    """
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string keeps mapping to False, as it did under ``type=bool``.
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got '" + value + "'"
    )


def run():
    """Command-line entry point.

    Parses the command-line options, checks that both the decoder output
    (``--input``) and the existing list file (``--listpath``) exist, then
    chains the processing steps: build the transcript set from the decoder
    output, filter it according to the parsed options, pair the surviving
    transcripts with the existing list file, and write the result to
    ``--output``.

    Raises:
        Exception: if the ``--input`` or ``--listpath`` path is not an
            existing file.
    """
    parser = argparse.ArgumentParser(
        description="Converts decoder output into train-ready list-style"
        " dataset formats"
    )

    parser.add_argument(
        "-i",
        "--input",
        type=str,
        required=True,
        help="Path to decoder output containing transcripts",
    )
    parser.add_argument(
        "-p",
        "--listpath",
        type=str,
        required=True,
        help="Path of existing list file dataset for which to replace transcripts",
    )
    parser.add_argument(
        "-w",
        "--warnings",
        action="store_true",
        help="Remove transcripts with EOS warnings by default",
    )
    parser.add_argument(
        "-g",
        "--ngram",
        action="store_true",
        help="Remove transcripts with ngram issues",
    )
    parser.add_argument(
        "-n",
        "--ngram_appearance_threshold",
        type=int,
        required=False,
        default=4,
        help="The number of identical n-grams that must appear in a "
        "prediction for it to be thrown out",
    )
    parser.add_argument(
        "-s",
        "--ngram_size",
        type=int,
        required=False,
        default=2,
        help="The size of n-gram which will be used when searching for duplicates",
    )
    parser.add_argument(
        "-f", "--filter", action="store_true", help="Run some filtering criteria"
    )
    parser.add_argument(
        "-o", "--output", type=str, required=True, help="Output filepath"
    )
    parser.add_argument(
        "-d",
        "--distributed_decoding",
        action="store_true",
        help="Processing a combined transcript with distributed decoding",
    )
    parser.add_argument(
        "-v",
        "--print_filtered_results",
        # An explicit converter is needed: ``type=bool`` would turn the
        # string "False" into True. ``nargs="?"`` with ``const=True`` keeps
        # the existing "-v True" form working and also accepts a bare "-v".
        type=_str_to_bool,
        nargs="?",
        const=True,
        required=False,
        default=False,
        help="Print transcripts that are filtered based on filter criteria to stderr",
    )
    parser.add_argument(
        "-q",
        "--viterbi",
        action="store_true",
        help="Expects a transcript format that is consistent with a Viterbi run",
    )

    args = parser.parse_args()

    # Fail early, before any processing, when either input file is missing.
    if not os.path.isfile(args.input):
        raise Exception("'" + args.input + "' - input file doesn't exist")
    if not os.path.isfile(args.listpath):
        # Report the list path itself (the original message printed
        # args.input here, pointing at the wrong file).
        raise Exception("'" + args.listpath + "' - listpath file doesn't exist")

    transcripts_predictions = create_transcript_set(
        args.input, args.viterbi, args.distributed_decoding
    )
    filtered_transcripts = filter_transcripts(transcripts_predictions, args)
    final_transcript_dict = pair_transcripts_with_existing_list(
        filtered_transcripts, args.listpath
    )
    write_transcript_list_to_file(final_transcript_dict, args.output)