def learn_bpe()

in learn_bpe.py [0:0]


def learn_bpe(infiles,
              outfile,
              num_symbols_ori,
              probabilistic=False,
              frac_stopping=None,
              frac_stopping_average_n=100,
              min_frequency=2,
              is_dict=False,
              total_symbols=False,
              verbose=False):
    """Learn up to num_symbols BPE merge operations and write them to outfile.

    The output starts with a ``#version: 0.2`` header line, followed by one
    line per merge operation consisting of the two merged symbols separated
    by a space.

    Args:
        infiles: Input source(s), forwarded to ``VocabCounter``.
        outfile: Writable text stream receiving the header and the merges.
        num_symbols_ori: Requested number of merges; adjusted through
            ``adapt_num_symbols`` before use.
        probabilistic: Forwarded to ``PairStats``. When true, the
            ``min_frequency`` cut-off is skipped and frac-stopping may apply.
        frac_stopping: Fraction used for early stopping in probabilistic
            mode. Learning stops once the mean of ``score[0]`` over a window
            of ``frac_stopping_average_n`` merges falls below
            ``frac_stopping`` times the mean of the first window.
            ``None`` or a value <= 0 disables the check.
        frac_stopping_average_n: Window size (in merges) for frac-stopping.
        min_frequency: In non-probabilistic mode, stop as soon as the best
            pair has a frequency below this value.
        is_dict: Forwarded to ``VocabCounter``.
        total_symbols: Forwarded to ``adapt_num_symbols``.
        verbose: If true, log every merge to stderr.

    Returns:
        The number of merge operations actually written to ``outfile``.
    """

    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    # We should be compatible with the original 0.2 version
    outfile.write('#version: 0.2\n')
    vocab = VocabCounter(infiles, is_dict)
    num_symbols = adapt_num_symbols(num_symbols_ori, vocab, total_symbols)
    pair_stats = PairStats(vocab, probabilistic)

    # frac_stopping defaults to None; comparing None with a float raises
    # TypeError on Python 3, so None is treated as "disabled".
    use_frac_stopping = bool(
        probabilistic and frac_stopping is not None and frac_stopping > 0.0
    )
    ini_score = None
    frac_stopping_accum = 0.0

    # num_written counts merges already written. It is advanced only after a
    # successful write, so it is exact both on early exit and when the loop
    # runs to completion (a ``for`` loop variable would end one short).
    num_written = 0
    while num_written < num_symbols:
        if not pair_stats:
            sys.stderr.write('No more pairs after creating {} symbols. Stopping\n'.format(num_written))
            break
        (freq, pair), score = pair_stats.pop_max()

        if use_frac_stopping:
            frac_stopping_accum += score[0]
            if num_written + 1 == frac_stopping_average_n:
                # The first full window establishes the reference average.
                ini_score = frac_stopping_accum / frac_stopping_average_n
                frac_stopping_accum = 0.0
            elif (num_written + 1) % frac_stopping_average_n == 0:
                avg = frac_stopping_accum / frac_stopping_average_n
                if avg < frac_stopping * ini_score:
                    sys.stderr.write('Stopping due to frac-stopping after %d symbols (%f < %f * %f)\n' %
                                     (num_written, avg, frac_stopping, ini_score))
                    break
                frac_stopping_accum = 0.0

        if not probabilistic and freq < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break
        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(num_written, pair[0], pair[1], freq))
        outfile.write('{0} {1}\n'.format(*pair))
        num_written += 1

    sys.stderr.write(f"{num_written} pairs written to file.\n")
    return num_written