pytorch-transformers/pytorch_transformers/tokenization_openai.py [111:156]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @property def vocab_size(self): return len(self.encoder) def bpe(self, token): word = tuple(token[:-1]) + (token[-1] + '',) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: return token+'' while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) new_word.extend(word[i:j]) i = j except: new_word.extend(word[i:]) break if word[i] == first and i < len(word)-1 and word[i+1] == second: new_word.append(first+second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = ' '.join(word) if word == '\n ': word = '\n' self.cache[token] = word return word - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - pytorch-transformers/pytorch_transformers/tokenization_xlm.py [611:656]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @property def vocab_size(self): return len(self.encoder) def bpe(self, token): word = tuple(token[:-1]) + (token[-1] + '',) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: return token+'' while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) new_word.extend(word[i:j]) i = j except: new_word.extend(word[i:]) break if word[i] == first and i < len(word)-1 and word[i+1] == second: new_word.append(first+second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = ' '.join(word) if word == '\n ': word = '\n' self.cache[token] = word return word - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -