# check_encode — excerpt from bindings/python/scripts/spm_parity_check.py

def check_encode(args):
    """Compare SentencePiece and tokenizers encodings over an input corpus.

    Every line of ``args.input_file`` is encoded with both the reference
    SentencePiece model (``args.model_file``) and the tokenizers
    re-implementation.  Identical id sequences count as ``perfect``;
    mismatches that ``check_details`` can explain count as ``imperfect``;
    any other mismatch aborts immediately via ``AssertionError`` so the
    offending line can be inspected.  Aggregate counts, accuracy and the
    relative slowdown of the tokenizers encoder are printed at the end.

    Args:
        args: Namespace with ``model_file``, ``input_file``, ``from_spm``
            and ``verbose`` attributes (see the script's CLI parser).

    Raises:
        AssertionError: on the first line whose mismatch cannot be
            explained by ``check_details``.
    """
    sp = spm.SentencePieceProcessor()
    sp.Load(args.model_file)

    if args.from_spm:
        tok = tokenizers.SentencePieceUnigramTokenizer.from_spm(args.model_file)
    else:
        # Rebuild the (piece, score) vocab directly from the loaded model.
        vocab = [(sp.id_to_piece(i), sp.get_score(i)) for i in range(sp.piece_size())]
        unk_id = sp.unk_id()
        tok = tokenizers.SentencePieceUnigramTokenizer(vocab, unk_id)

    perfect = 0
    imperfect = 0
    wrong = 0
    now = datetime.datetime.now  # hoisted: called twice per input line
    spm_total_time = datetime.timedelta(seconds=0)
    tok_total_time = datetime.timedelta(seconds=0)
    # "utf-8-sig" transparently strips a leading BOM if the file has one.
    with open(args.input_file, "r", encoding="utf-8-sig") as f:
        for i, line in enumerate(f):
            line = line.strip()

            start = now()
            ids = sp.EncodeAsIds(line)
            spm_time = now()

            encoded = tok.encode(line)
            tok_time = now()

            spm_total_time += spm_time - start
            tok_total_time += tok_time - spm_time

            # Periodic progress report: (perfect / imperfect / wrong --- total).
            if args.verbose and i % 10000 == 0:
                print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
                print(f"SPM: {spm_total_time} - TOK: {tok_total_time}")

            if ids != encoded.ids:
                if check_details(line, ids, encoded.ids, sp, tok):
                    imperfect += 1
                    continue
                else:
                    wrong += 1
            else:
                perfect += 1

            # Hard stop on the first unexplained mismatch; `wrong` was
            # incremented above so the failure shows up in any report.
            assert (
                ids == encoded.ids
            ), f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}"

    print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
    total = perfect + imperfect + wrong
    # Guard: an empty input file leaves total == 0 and spm_total_time at
    # zero, and either would raise ZeroDivisionError below.
    if total and spm_total_time:
        print(f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}")
    else:
        print("No lines processed; accuracy/slowdown not available.")