pytorch_transformers/tokenization_openai.py [108:184]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

    @property
    def vocab_size(self):
        return len(self.encoder)

    def bpe(self, token):
        """ Applies the learned Byte-Pair Encoding merges to a single token and
            returns its sub-word units as a space-separated string. """
        # Mark the end of the word on its last character, e.g. 'low' -> ('l', 'o', 'w</w>').
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            # Greedily merge the adjacent pair with the lowest merge rank; stop
            # once no remaining pair appears in the learned merge table.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # `first` does not occur again; keep the rest of the word unchanged.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        if word == '\n  </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend(self.bpe(token).split(' '))
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend(self.bpe(token.text.lower()).split(' '))
        return split_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an id in a token (BPE) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        return out_string
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
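
Both copies implement the same greedy merge: append '</w>' to the last character of the token, then repeatedly merge the adjacent symbol pair with the lowest rank in bpe_ranks until no known pair remains. The following is a minimal standalone sketch of that loop with a toy merge table; toy_bpe, the local get_pairs and the example ranks are illustrative only, not the library's API.

def get_pairs(word):
    # Set of adjacent symbol pairs in `word` (a tuple of symbols).
    return {(word[i], word[i + 1]) for i in range(len(word) - 1)}

def toy_bpe(token, bpe_ranks):
    # Mark the end of the word on the last character, as the tokenizers above do.
    word = tuple(token[:-1]) + (token[-1] + '</w>',)
    pairs = get_pairs(word)
    if not pairs:
        return token + '</w>'
    while True:
        # Always merge the lowest-ranked (earliest learned) pair first.
        bigram = min(pairs, key=lambda pair: bpe_ranks.get(pair, float('inf')))
        if bigram not in bpe_ranks:
            break
        first, second = bigram
        new_word, i = [], 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                new_word.extend(word[i:])
                break
            new_word.extend(word[i:j])
            i = j
            if i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = tuple(new_word)
        if len(word) == 1:
            break
        pairs = get_pairs(word)
    return ' '.join(word)

ranks = {('e', 'r</w>'): 0, ('l', 'o'): 1, ('lo', 'w'): 2, ('low', 'er</w>'): 3}
print(toy_bpe('lower', ranks))  # -> 'lower</w>' (all learned merges apply)
print(toy_bpe('low', ranks))    # -> 'lo w</w>'  (no merge covers 'w</w>')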



pytorch_transformers/tokenization_xlm.py [144:220]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

    @property
    def vocab_size(self):
        return len(self.encoder)

    def bpe(self, token):
        """ Applies the learned Byte-Pair Encoding merges to a single token and
            returns its sub-word units as a space-separated string. """
        # Mark the end of the word on its last character, e.g. 'low' -> ('l', 'o', 'w</w>').
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            # Greedily merge the adjacent pair with the lowest merge rank; stop
            # once no remaining pair appears in the learned merge table.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # `first` does not occur again; keep the rest of the word unchanged.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        if word == '\n  </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend(self.bpe(token).split(' '))
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend(self.bpe(token.text.lower()).split(' '))
        return split_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        return out_string
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
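
The two ranges above are identical line for line, so one way to remove the duplication would be a shared base class that both tokenizers inherit. A rough sketch only: SpacyFtfyBPEMixin is a hypothetical name, not part of pytorch_transformers, and it assumes each subclass still sets encoder, decoder, bpe_ranks, cache, nlp, fix_text and unk_token in its own __init__.

class SpacyFtfyBPEMixin(object):
    """ Hypothetical shared base holding only the methods that appear verbatim
        in both tokenization_openai.py and tokenization_xlm.py above. """

    @property
    def vocab_size(self):
        return len(self.encoder)

    def bpe(self, token):
        # The identical bpe() body from either file would move here unchanged.
        ...

    def _tokenize(self, text):
        # The identical _tokenize() body from either file would move here unchanged.
        ...

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) to an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """ Converts an index (integer) to a token (string/unicode) using the vocab. """
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (strings) to a single string. """
        return ''.join(tokens).replace('</w>', ' ').strip()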



