in learn_bpe.py [0:0]
def _update_stats(self, new_pair, old_word, new_word, freq, w_index, stats_changes):
    """
    Update the statistics after merging the pair new_pair in a single word.

    In this implementation we take the easy way: compute the stats for the
    old and the new word, compare them and adapt accordingly. This allows
    for an easy implementation and accommodates the slight difference
    between probabilistic and non-probabilistic counts without effort (see
    self.get_pair_stats_from_word for the mismatch).

    Args:
        new_pair: The pair of symbols that was merged. Its own entry is
            skipped here and never written to stats_changes.
        old_word: The word before the merge, as accepted by
            self.get_pair_stats_from_word.
        new_word: The word after the merge, in the same representation.
        freq: Multiplier applied to every per-word count difference
            (presumably the frequency of the word -- confirm with caller).
        w_index: Value added to self.vocab_entries_for_pair[pair] for every
            pair that occurs in new_word but not in old_word (presumably
            the index of the word in the vocabulary -- confirm with caller).
        stats_changes: Dict mapping pair -> accumulated frequency change.
            It is updated in place; nothing is returned.
    """
    # Both symbols of the merged pair plus their concatenation are handed to
    # get_pair_stats_from_word (presumably so only pairs that can be affected
    # by this merge are counted -- see that method).
    filter_set = {new_pair[0], new_pair[1], new_pair[0] + new_pair[1]}
    old_pair_stats = self.get_pair_stats_from_word(old_word, filter_set)
    new_pair_stats = self.get_pair_stats_from_word(new_word, filter_set)

    # Sorted so that pairs are always processed in the same order.
    for pair in sorted(set(old_pair_stats) | set(new_pair_stats)):
        if pair == new_pair:
            continue

        in_old = pair in old_pair_stats
        in_new = pair in new_pair_stats
        if in_new and not in_old:
            # New pair for this word.
            freq_change = new_pair_stats[pair] * freq
            self.vocab_entries_for_pair[pair].add(w_index)
        elif in_old and not in_new:
            # The pair does not exist any more in this word.
            # w_index is deliberately left in
            # self.vocab_entries_for_pair[pair]: removing it would be
            # conceptually correct, but triggers an error that the set
            # changed during iteration.
            freq_change = -old_pair_stats[pair] * freq
        else:
            # The pair is in both the new and the old word; only a change
            # in its count matters.
            freq_change = (new_pair_stats[pair] - old_pair_stats[pair]) * freq

        # A zero change is not recorded at all.
        if freq_change:
            stats_changes[pair] = stats_changes.get(pair, 0) + freq_change