in lama/modules/bert_connector.py [0:0]
def tokenize(self, text):
    """Tokenizes a piece of text."""
    text = self._clean_text(text)
    # This was added on November 1st, 2018 for the multilingual and Chinese
    # models. This is also applied to the English models now, but it doesn't
    # matter since the English models were not trained on any Chinese data
    # and generally don't have any Chinese data in them (there are Chinese
    # characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia).
    text = self._tokenize_chinese_chars(text)
    orig_tokens = btok.whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
        # Pass MASK forward unchanged so that the lower-casing, accent
        # stripping, and punctuation splitting below cannot break it apart.
        if MASK in token:
            split_tokens.append(MASK)
            if token != MASK:
                # Keep any characters glued to the mask (e.g. a trailing
                # period) as a separate token.
                remaining_chars = token.replace(MASK, "").strip()
                if remaining_chars:
                    split_tokens.append(remaining_chars)
            continue
        if self.do_lower_case:
            token = token.lower()
            token = self._run_strip_accents(token)
        split_tokens.extend(self._run_split_on_punc(token))
    output_tokens = btok.whitespace_tokenize(" ".join(split_tokens))
    return output_tokens
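
For context, a minimal usage sketch of how this method behaves on a masked sentence; the class name CustomBaseTokenizer, the MASK constant value "[MASK]", and the btok alias for pytorch_pretrained_bert.tokenization are assumptions about how the surrounding module is wired up, not shown in this excerpt:

# Hypothetical usage sketch (assumes bert_connector.py defines MASK = "[MASK]",
# imports pytorch_pretrained_bert.tokenization as btok, and exposes the method
# above on a BasicTokenizer subclass such as CustomBaseTokenizer).
tokenizer = CustomBaseTokenizer(do_lower_case=True)
tokens = tokenizer.tokenize("The capital of France is [MASK].")
# The MASK branch keeps the mask as a single token and splits off the trailing
# period, so the expected output is roughly:
# ['the', 'capital', 'of', 'france', 'is', '[MASK]', '.']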