in learn_bpe.py [0:0]
def __init__(self, vocab, probabilistic):
    """Build the pair statistics and the pair heap from ``vocab``.

    Args:
        vocab: Iterable yielding ``(pos, word, freq)`` triples. It is
            stored as ``self.vocab`` and iterated once here, plus a
            second time when ``probabilistic`` is truthy.
        probabilistic: When truthy, also record per-unit counts in
            ``self.produced_count`` and their total in
            ``self.n_running_symbols``.
    """
    # NOTE(review): block nesting below was reconstructed from a listing
    # without indentation -- confirm against the original file.
    self.vocab = vocab
    self.probabilistic = probabilistic

    # pair -> occurrences of the pair, weighted by the word frequency
    weighted_pair_counts = defaultdict(int)
    # pair -> set of vocab positions whose word contains the pair
    self.vocab_entries_for_pair = defaultdict(set)
    for entry_pos, entry_word, entry_freq in self.vocab:
        pairs_in_word = self.get_pair_stats_from_word(entry_word)
        for pair, occurrences in pairs_in_word.items():
            weighted_pair_counts[pair] += occurrences * entry_freq
            self.vocab_entries_for_pair[pair].add(entry_pos)

    # Probabilistic BPE additionally needs the counts of the produced
    # items, seeded here with the initial units of every word.
    if self.probabilistic:
        unit_counts = self.produced_count = defaultdict(int)
        self.n_running_symbols = 0
        symbol_total = 0
        for _, entry_word, entry_freq in self.vocab:
            for unit in entry_word:
                unit_counts[unit] += entry_freq
                # Presumably one increment per unit, so the total matches
                # the sum of produced_count -- TODO confirm.
                symbol_total += entry_freq
        self.n_running_symbols = symbol_total

    # Heap entries are (weighted count, pair) tuples; key_function picks
    # the pair (element 1) out of each entry.
    self.stats_heap = HeapWithInverseIndex(
        value_score_function=self._get_scoring_function(),
        use_score_caching=True,
        key_function=lambda entry: entry[1],
    )
    for pair, weighted_count in weighted_pair_counts.items():
        self.stats_heap.insert((weighted_count, pair))