def get_pair_stats_from_word()

in learn_bpe.py [0:0]


    def get_pair_stats_from_word(self, word, filter_elems=None):
        """
        Count the adjacent element pairs occurring in a word.

        Args:
            word: Indexable sequence of elements (supports ``len`` and
                integer indexing).
            filter_elems: Optional container of elements. When given, a pair
                is only counted if at least one of its two elements is
                contained in it. When ``None``, every pair is counted.

        Returns:
            A ``defaultdict(int)`` mapping ``(left, right)`` tuples to the
            number of times that pair was counted.

        Standard and probabilistic BPE deliberately count differently.
        Take the word '10002': standard BPE reports 2 occurrences of the
        pair '00', although the resulting split would be '1@@ 00@@ 0@@ 2',
        where '00' shows up only once. In standard mode this is left as is
        in order to stay compatible with the original implementation.

        Probabilistic BPE needs the counts of the elements that are actually
        produced, so there the overlap is corrected; otherwise negative
        probabilities could result.
        """
        counts = defaultdict(int)
        last = len(word) - 1
        pos = 1
        while pos <= last:
            left, right = word[pos - 1], word[pos]
            if filter_elems is None or left in filter_elems or right in filter_elems:
                counts[(left, right)] += 1
            step = 1
            if self.probabilistic:
                # Three equal elements in a row: the pair starting at the
                # middle element overlaps the one just visited, so jump
                # over it instead of visiting it.
                if left == right and pos < last and right == word[pos + 1]:
                    step = 2
            pos += step
        return counts