def tokenize()

in src/datatuner/lm/custom_tokenizer.py


def tokenize(self, text, **kwargs):
    """ Converts a string in a sequence of tokens (string), using the tokenizer.
            Split in words for word-based vocabulary or sub-words for sub-word-based
            vocabularies (BPE/SentencePieces/WordPieces).

            Take care of added tokens.
        """

    def split_on_token(tok, text):
        # Split `text` on a single added token `tok`, keeping `tok` itself as an
        # item in the result and stripping whitespace from the plain-text pieces.
        result = []
        split_text = text.split(tok)
        for i, sub_text in enumerate(split_text):
            sub_text = sub_text.strip()
            if i == 0 and not sub_text:
                # first piece is empty (text starts with `tok`): emit the token only
                result += [tok]
            elif i == len(split_text) - 1:
                # last piece: no trailing `tok`, keep the piece if non-empty
                if sub_text:
                    result += [sub_text]
            else:
                # middle pieces: keep the piece (if non-empty), then re-insert `tok`
                if sub_text:
                    result += [sub_text]
                result += [tok]
        return result
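
    # Worked example (the token name "<name>" is illustrative only):
    #   split_on_token("<name>", "the <name> is cheap")
    #   -> ["the", "<name>", "is cheap"]
    # The added token survives as a single item and surrounding whitespace is
    # stripped from the plain-text pieces.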

    def split_on_tokens(tok_list, text):
        # Iteratively split the text on every added/special token, then run the
        # underlying tokenizer on the remaining plain-text pieces only.
        if not text:
            return []
        if not tok_list:
            return self._tokenize(text, **kwargs)

        tokenized_text = []
        text_list = [text]
        for tok in tok_list:
            tokenized_text = []
            for sub_text in text_list:
                # never split inside a piece that is itself an added/special token
                if sub_text not in self.added_tokens_encoder and sub_text not in self.all_special_tokens:
                    tokenized_text += split_on_token(tok, sub_text)
                else:
                    tokenized_text += [sub_text]
            text_list = tokenized_text

        # Added/special tokens are kept as single tokens; everything else is
        # passed through the model's sub-word tokenizer.
        return sum(
            (
                self._tokenize(token, **kwargs)
                if token not in self.added_tokens_encoder and token not in self.all_special_tokens
                else [token]
                for token in tokenized_text
            ),
            [],
        )
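
    # Worked example (assuming "<name>" is in self.added_tokens_encoder and
    # self._tokenize is the model's sub-word tokenizer):
    #   split_on_tokens(["<name>"], "the <name> is cheap")
    #   -> self._tokenize("the") + ["<name>"] + self._tokenize("is cheap")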

    def get_special_tokens(s):
        # Extract tag-like candidates from the text and keep only those that are
        # actually registered as added tokens.
        candidates = get_custom_tags(s)
        return [cand for cand in candidates if cand in self.added_tokens_encoder]
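
    # A minimal sketch of what get_custom_tags might do (the real helper is
    # imported from elsewhere in datatuner; this regex is an assumption):
    #
    #   import re
    #   def get_custom_tags(s):
    #       # hypothetical: treat "<...>"-style spans as candidate tags
    #       return re.findall(r"<[^<>\s]+>", s)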

    # The approach below becomes very slow once we scale to thousands of special tokens (e.g. many node types/predicates):
    # self.added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens

    all_added = list(self.added_tokens_encoder.keys())

    TOO_LARGE_NUM_TOKENS_THRESHOLD = 10
    # If we have a large number of special tokens, our current hack is to use task-specific regexes to extract
    # candidates from the sentence first, and then match those candidates against the special tokens in the encoder.
    # That way we reduce the number of special tokens per iteration to a handful instead of thousands.
    if len(all_added) > TOO_LARGE_NUM_TOKENS_THRESHOLD:
        current_added_tokens = get_special_tokens(text)
    else:
        # Otherwise, we simply take all the added tokens, as is the original case in the library
        current_added_tokens = all_added

    added_tokens = current_added_tokens + self.all_special_tokens

    tokenized_text = split_on_tokens(added_tokens, text)
    return tokenized_text
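

# Usage sketch (hypothetical; assumes this tokenize is patched onto a Hugging Face
# tokenizer whose custom tags have been registered via add_tokens):
#
#   tokenizer.add_tokens(["<name>", "<eat_type>"])
#   tokenizer.tokenize("<name> serves <eat_type> food")
#   # -> ["<name>"] + sub-words for "serves" + ["<eat_type>"] + sub-words for "food"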