in esm/data.py [0:0]
def tokenize(self, text, **kwargs) -> List[str]:
"""
Inspired by https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py
Converts a string in a sequence of tokens, using the tokenizer.
Args:
text (:obj:`str`):
The sequence to be encoded.
Returns:
:obj:`List[str]`: The list of tokens.
"""
def split_on_token(tok, text):
result = []
split_text = text.split(tok)
for i, sub_text in enumerate(split_text):
# AddedToken can control whitespace stripping around them.
# We use them for GPT2 and Roberta to have different behavior depending on the special token
# Cf. https://github.com/huggingface/transformers/pull/2778
# and https://github.com/huggingface/transformers/issues/3788
# We strip left and right by default
if i < len(split_text) - 1:
sub_text = sub_text.rstrip()
if i > 0:
sub_text = sub_text.lstrip()
if i == 0 and not sub_text:
result.append(tok)
elif i == len(split_text) - 1:
if sub_text:
result.append(sub_text)
else:
pass
else:
if sub_text:
result.append(sub_text)
result.append(tok)
return result
def split_on_tokens(tok_list, text):
if not text.strip():
return []
tokenized_text = []
text_list = [text]
for tok in tok_list:
tokenized_text = []
for sub_text in text_list:
if sub_text not in self.unique_no_split_tokens:
tokenized_text.extend(split_on_token(tok, sub_text))
else:
tokenized_text.append(sub_text)
text_list = tokenized_text
return list(
itertools.chain.from_iterable(
(
self._tokenize(token)
if token not in self.unique_no_split_tokens
else [token]
for token in tokenized_text
)
)
)
no_split_token = self.unique_no_split_tokens
tokenized_text = split_on_tokens(no_split_token, text)
return tokenized_text