in python/dpu_utils/mlutils/bpevocabulary.py [0:0]
def tokenize(self, text: str) -> List[str]:
    """Tokenize *text* into BPE pieces, splitting digit-comma subtokens.

    The SentencePiece model may emit pieces that fuse a trailing comma
    onto digits (e.g. '00,').  Such pieces are re-encoded so that the
    digits and the comma become separate tokens, e.g. for the input
    'for i in range(100, 2):' the default encoding
    ['▁for', '▁i', '▁in', '▁range', '(1', '00,', '▁2', '):']
    becomes
    ['▁for', '▁i', '▁in', '▁range', '(1', '0', '0', ',', '▁2', '):'].
    """
    result: List[str] = []
    for piece in self.__sp_model.EncodeAsPieces(text):
        fuses_digit_comma = (
            len(piece) > 1 and piece.endswith(',') and piece[-2].isdigit())
        if not fuses_digit_comma:
            result.append(piece)
            continue
        # Re-encode the digits alone (comma and any underline marker
        # stripped), then re-attach the comma as its own piece.
        sub_pieces = self.__sp_model.EncodeAsPieces(
            piece[:-1].replace(SPIECE_UNDERLINE, ''))
        if piece[0] != SPIECE_UNDERLINE and sub_pieces[0][0] == SPIECE_UNDERLINE:
            # Drop the spurious word-start marker introduced by re-encoding
            # a fragment that did not start a word in the original text.
            if len(sub_pieces[0]) == 1:
                sub_pieces = sub_pieces[1:]
            else:
                sub_pieces[0] = sub_pieces[0][1:]
        sub_pieces.append(piece[-1])
        result.extend(sub_pieces)
    return result