in learn_bpe.py [0:0]
def get_pair_stats_from_word(self, word, filter_elems=None):
    """
    Count the adjacent symbol pairs occurring in ``word``.

    ``word`` is an indexable sequence of symbols. When ``filter_elems`` is
    given, a pair is only counted if its left or its right symbol is
    contained in ``filter_elems``.

    Standard and probabilistic BPE count runs of identical symbols
    differently. Take the word '10002': standard BPE reports 2 occurrences
    of the pair '00', yet the split that is actually produced is
    '1@@ 00@@ 0@@ 2', where '00' shows up only once. In standard mode the
    overlapping count is kept on purpose, to stay compatible with the
    original implementation. Probabilistic BPE needs the counts of the
    elements that are really produced, so in that mode the overlapping
    occurrence is skipped in order to avoid negative probabilities.

    Returns a ``defaultdict(int)`` mapping ``(left, right)`` tuples to the
    number of times that pair was counted.
    """
    counts = defaultdict(int)
    last_pos = len(word) - 1
    pos = 1
    while pos <= last_pos:
        left, right = word[pos - 1], word[pos]
        if filter_elems is None or left in filter_elems or right in filter_elems:
            counts[(left, right)] += 1
        step = 1
        if (
            self.probabilistic
            and left == right
            and pos < last_pos
            and right == word[pos + 1]
        ):
            # Three equal symbols in a row: the pair that would start at the
            # middle symbol overlaps the one just handled, so jump over it.
            step = 2
        pos += step
    return counts