def tokenize()

in python/dpu_utils/mlutils/bpevocabulary.py [0:0]


    def tokenize(self, text: str) -> List[str]:
        """Tokenize *text* into BPE pieces, splitting fused digit-comma subtokens.

        SentencePiece can emit pieces that glue digits to a trailing comma,
        e.g. for text = 'for i in range(100, 2):' the raw output may be
            ['▁for', '▁i', '▁in', '▁range', '(1', '00,', '▁2', '):']
        which this method rewrites to
            ['▁for', '▁i', '▁in', '▁range', '(1', '0', '0', ',', '▁2', '):']
        so the comma always stands alone as its own piece.
        """
        result = []  # type: List[str]
        for piece in self.__sp_model.EncodeAsPieces(text):
            fused_digit_comma = (
                len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit())
            if not fused_digit_comma:
                result.append(piece)
                continue
            # Re-encode the numeric part on its own (word-start markers
            # stripped), then emit the comma as a separate piece.
            sub_pieces = self.__sp_model.EncodeAsPieces(
                piece[:-1].replace(SPIECE_UNDERLINE, ''))
            if piece[0] != SPIECE_UNDERLINE and sub_pieces[0][0] == SPIECE_UNDERLINE:
                # Re-encoding introduced a spurious word-start marker that the
                # original piece did not carry; drop it.
                if len(sub_pieces[0]) > 1:
                    sub_pieces[0] = sub_pieces[0][1:]
                else:
                    sub_pieces = sub_pieces[1:]
            result.extend(sub_pieces)
            result.append(piece[-1])

        return result