in bindings/python/scripts/spm_parity_check.py [0:0]
def main():
parser = ArgumentParser("SentencePiece parity checker")
parser.add_argument(
"--input-file",
"-i",
type=str,
required=True,
help="Which files do you want to train from",
)
parser.add_argument(
"--model-file",
"-m",
type=str,
required=False,
default=None,
help="Use a pretrained token file",
)
parser.add_argument(
"--model-prefix",
type=str,
default="spm_parity",
help="Model prefix for spm_train",
)
parser.add_argument(
"--vocab-size",
"-v",
type=int,
default=8000,
help="Vocab size for spm_train",
)
parser.add_argument(
"--verbose",
action="store_true",
help="Verbosity",
)
parser.add_argument(
"--train",
action="store_true",
help="Instead of checking the encoder part, we check the trainer part",
)
parser.add_argument(
"--from-spm",
action="store_true",
help="Directly load the spm file with it's own normalizer",
)
args = parser.parse_args()
trained = False
if args.model_file is None:
spm.SentencePieceTrainer.Train(
f"--input={args.input_file} --model_prefix={args.model_prefix}"
f" --character_coverage=1.0"
f" --max_sentence_length=40000"
f" --num_threads=1"
f" --vocab_size={args.vocab_size}"
)
trained = True
args.model_file = f"{args.model_prefix}.model"
try:
if args.train:
check_train(args)
else:
check_encode(args)
finally:
if trained:
os.remove(f"{args.model_prefix}.model")
os.remove(f"{args.model_prefix}.vocab")