def main()

in bindings/python/scripts/spm_parity_check.py [0:0]


def main():
    """Run the SentencePiece parity check from the command line.

    Parses the command-line options and, when ``--model-file`` is not given,
    trains a model with ``spm.SentencePieceTrainer.Train`` using
    ``--input-file``, ``--model-prefix`` and ``--vocab-size``. It then runs
    ``check_train`` (when ``--train`` is set) or ``check_encode`` (otherwise)
    with the parsed arguments.

    The ``<model_prefix>.model`` and ``<model_prefix>.vocab`` files are deleted
    afterwards only if this function trained them itself; a model supplied via
    ``--model-file`` is never removed.
    """
    # Pass the banner as `description`: the first positional parameter of
    # ArgumentParser is `prog`, which would replace the program name in the
    # usage line instead of describing the tool.
    parser = ArgumentParser(description="SentencePiece parity checker")
    parser.add_argument(
        "--input-file",
        "-i",
        type=str,
        required=True,
        help="Which files do you want to train from",
    )
    parser.add_argument(
        "--model-file",
        "-m",
        type=str,
        required=False,
        default=None,
        help="Use a pretrained token file",
    )
    parser.add_argument(
        "--model-prefix",
        type=str,
        default="spm_parity",
        help="Model prefix for spm_train",
    )
    parser.add_argument(
        "--vocab-size",
        "-v",
        type=int,
        default=8000,
        help="Vocab size for spm_train",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Verbosity",
    )
    parser.add_argument(
        "--train",
        action="store_true",
        help="Instead of checking the encoder part, we check the trainer part",
    )
    parser.add_argument(
        "--from-spm",
        action="store_true",
        help="Directly load the spm file with it's own normalizer",
    )

    args = parser.parse_args()

    # Tracks whether the model files were produced by this run, so that only
    # those files are cleaned up at the end.
    trained = False
    if args.model_file is None:
        spm.SentencePieceTrainer.Train(
            f"--input={args.input_file} --model_prefix={args.model_prefix}"
            f" --character_coverage=1.0"
            f" --max_sentence_length=40000"
            f" --num_threads=1"
            f" --vocab_size={args.vocab_size}"
        )
        trained = True
        # Downstream checks read the model path from args.model_file.
        args.model_file = f"{args.model_prefix}.model"

    try:
        if args.train:
            check_train(args)
        else:
            check_encode(args)
    finally:
        if trained:
            # Attempt every removal independently: an already-missing file
            # must neither leave the other artifact behind nor replace an
            # exception raised by the check itself.
            for suffix in (".model", ".vocab"):
                try:
                    os.remove(f"{args.model_prefix}{suffix}")
                except FileNotFoundError:
                    pass