in learn_bpe.py [0:0]
def adapt_num_symbols(num_symbols, vocab, total_symbols):
    """Adjust the merge-operation count for the --total_symbols option.

    When ``total_symbols`` is truthy, the distinct symbols already present
    in ``vocab`` are counted and subtracted from ``num_symbols``. Symbols
    are counted separately by position: every element of a word except the
    last is "word-internal", and the last element is "word-final". The same
    symbol appearing in both positions is therefore counted twice.

    Args:
        num_symbols: Requested number of merge operations.
        vocab: Iterable of 3-item entries whose middle item is the word, an
            indexable/sliceable sequence of symbols. The first and third
            items are ignored here.
        total_symbols: Flag; when falsy, ``num_symbols`` is returned as is.

    Returns:
        ``num_symbols`` unchanged when ``total_symbols`` is falsy, otherwise
        ``num_symbols`` minus the number of distinct word-internal symbols
        and the number of distinct word-final symbols.

    Raises:
        IndexError: If ``total_symbols`` is truthy and a word is empty
            (there is no last symbol to read).
    """
    # Nothing to adapt unless the option was requested.
    if not total_symbols:
        return num_symbols

    internal_chars = set()
    final_chars = set()
    for _, symbols, _ in vocab:
        # Everything before the last position is word-internal.
        internal_chars.update(symbols[:-1])
        final_chars.add(symbols[-1])

    internal_count = len(internal_chars)
    final_count = len(final_chars)
    reduction = internal_count + final_count

    # Diagnostics go to stderr.
    sys.stderr.write('Number of word-internal characters: {0}\n'.format(internal_count))
    sys.stderr.write('Number of word-final characters: {0}\n'.format(final_count))
    sys.stderr.write('Reducing number of merge operations by {0}\n'.format(reduction))

    return num_symbols - reduction