in learn_bpe.py [0:0]
def learn_bpe(infiles,
              outfile,
              num_symbols_ori,
              probabilistic=False,
              frac_stopping=None,
              frac_stopping_average_n=100,
              min_frequency=2,
              is_dict=False,
              total_symbols=False,
              verbose=False):
    """Learn BPE merge operations from a vocabulary and write them to outfile.

    A '#version: 0.2' header line is written first, followed by one line
    '<left> <right>' per merge operation, in the order the pairs are popped
    from the pair statistics.

    Args:
        infiles: Input passed through to ``VocabCounter``.
        outfile: Writable text stream receiving the header and the merges.
        num_symbols_ori: Requested number of symbols; the effective number of
            iterations is ``adapt_num_symbols(num_symbols_ori, vocab,
            total_symbols)``.
        probabilistic: Passed to ``PairStats``. When true, the
            ``min_frequency`` cut-off is not applied and frac-stopping may be
            used instead.
        frac_stopping: Fraction used by the frac-stopping criterion. Only
            consulted when ``probabilistic`` is true; ``None`` or a
            non-positive value disables the criterion.
        frac_stopping_average_n: Window size (in merges) over which the first
            element of each popped score is averaged. The first full window
            sets the reference average; each later full window is compared
            against ``frac_stopping`` times that reference.
        min_frequency: In non-probabilistic mode, stop as soon as the popped
            pair has a frequency below this value.
        is_dict: Passed through to ``VocabCounter``.
        total_symbols: Passed through to ``adapt_num_symbols``.
        verbose: When true, log every merge to stderr.

    Returns:
        The number of merge operations actually written to ``outfile``.
    """
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility.
    # We should be compatible with the original 0.2 version.
    outfile.write('#version: 0.2\n')

    vocab = VocabCounter(infiles, is_dict)
    num_symbols = adapt_num_symbols(num_symbols_ori, vocab, total_symbols)
    pair_stats = PairStats(vocab, probabilistic)

    # frac_stopping defaults to None; comparing None with a float raises
    # TypeError on Python 3, so the criterion is only enabled for real,
    # positive values.
    use_frac_stopping = bool(
        probabilistic and frac_stopping is not None and frac_stopping > 0.0)

    ini_score = None            # average score of the first window (reference)
    frac_stopping_accum = 0.0   # running sum of score[0] in the current window

    # Counted explicitly (rather than reusing the loop variable) so that the
    # value is the true number of written pairs both after an early `break`
    # and after the loop runs to completion. Inside the loop, before the
    # increment, it equals the 0-based index of the current merge.
    num_written = 0
    for _ in range(num_symbols):
        if not pair_stats:
            sys.stderr.write('No more pairs after creating {} symbols. Stopping\n'.format(num_written))
            break

        (freq, pair), score = pair_stats.pop_max()

        if use_frac_stopping:
            frac_stopping_accum += score[0]
            if num_written + 1 == frac_stopping_average_n:
                # First full window: remember its average as the reference.
                ini_score = frac_stopping_accum / frac_stopping_average_n
                frac_stopping_accum = 0.0
            elif (num_written + 1) % frac_stopping_average_n == 0:
                # Later full window: stop once the average has dropped below
                # the requested fraction of the reference average.
                avg = frac_stopping_accum / frac_stopping_average_n
                if avg < frac_stopping * ini_score:
                    sys.stderr.write('Stopping due to frac-stopping after %d symbols (%f < %f * %f)\n' %
                                     (num_written, avg, frac_stopping, ini_score))
                    break
                frac_stopping_accum = 0.0

        # The frequency cut-off only applies to the non-probabilistic mode.
        if not probabilistic and freq < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(num_written, pair[0], pair[1], freq))

        outfile.write('{0} {1}\n'.format(*pair))
        num_written += 1

    sys.stderr.write(f"{num_written} pairs written to file.\n")
    return num_written