in bindings/python/scripts/spm_parity_check.py [0:0]
def check_encode(args):
    """Compare SentencePiece and ``tokenizers`` Unigram encodings line by line.

    Loads the SentencePiece model at ``args.model_file`` and builds a
    ``SentencePieceUnigramTokenizer`` either directly from that model file
    (when ``args.from_spm`` is truthy) or from the pieces, scores and unk id
    read back from the loaded processor. Each stripped line of
    ``args.input_file`` is then encoded with both and classified as:

    * perfect   -- both encoders return the same ids;
    * imperfect -- ids differ but ``check_details`` returns a truthy value;
    * wrong     -- ids differ and ``check_details`` returns a falsy value.

    Cumulative encode time is tracked for both encoders. A final summary with
    the counters, the percentage of perfect lines and the time ratio
    (tokenizers / SentencePiece) is printed.

    Args:
        args: Object exposing ``model_file``, ``input_file``, ``from_spm``
            and ``verbose`` attributes.

    Raises:
        AssertionError: On the first "wrong" line, when assertions are
            enabled (they are stripped under ``python -O``).
    """
    sp = spm.SentencePieceProcessor()
    sp.Load(args.model_file)

    if args.from_spm:
        tok = tokenizers.SentencePieceUnigramTokenizer.from_spm(args.model_file)
    else:
        vocab = [(sp.id_to_piece(i), sp.get_score(i)) for i in range(sp.piece_size())]
        unk_id = sp.unk_id()
        tok = tokenizers.SentencePieceUnigramTokenizer(vocab, unk_id)

    perfect = 0
    imperfect = 0
    wrong = 0
    now = datetime.datetime.now
    no_time = datetime.timedelta(seconds=0)
    spm_total_time = no_time
    tok_total_time = no_time

    # "utf-8-sig" transparently drops a leading BOM if the file has one.
    with open(args.input_file, "r", encoding="utf-8-sig") as f:
        for i, line in enumerate(f):
            line = line.strip()

            # Time each encoder back to back on the same input.
            start = now()
            ids = sp.EncodeAsIds(line)
            spm_time = now()
            encoded = tok.encode(line)
            tok_time = now()

            spm_total_time += spm_time - start
            tok_total_time += tok_time - spm_time

            # Periodic progress report; counters reflect the lines before this one.
            if args.verbose and i % 10000 == 0:
                print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
                print(f"SPM: {spm_total_time} - TOK: {tok_total_time}")

            if ids == encoded.ids:
                perfect += 1
                continue

            if check_details(line, ids, encoded.ids, sp, tok):
                imperfect += 1
                continue

            wrong += 1
            # Only reached on a mismatch, so this aborts the run with the
            # offending line unless assertions are disabled.
            assert (
                ids == encoded.ids
            ), f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}"

    print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
    total = perfect + imperfect + wrong
    if total == 0:
        # Empty input: neither ratio is defined, so report that instead of
        # failing with ZeroDivisionError.
        print(f"No lines read from {args.input_file}, nothing to compare")
        return

    # The SentencePiece time can be zero on a tiny input with a coarse clock;
    # the ratio is undefined then, so report NaN rather than dividing by zero.
    if spm_total_time != no_time:
        slowdown = tok_total_time / spm_total_time
    else:
        slowdown = float("nan")
    print(f"Accuracy {perfect * 100 / total:.2f} Slowdown : {slowdown:.2f}")