def preprocess_encoder_tokens()

in utils_nlp/models/bert/common.py


    def preprocess_encoder_tokens(self, tokens, max_len=BERT_MAX_LEN):
        """Preprocessing of input tokens:
            - add BERT sentence markers ([CLS] and [SEP])
            - map tokens to token indices in the BERT vocabulary
            - pad and truncate sequences
            - create an input_mask
            - create token type ids, aka. segment ids

        Args:
            tokens (list): List of token lists to preprocess.
            max_len (int, optional): Maximum number of tokens
                            (documents will be truncated or padded).
                            Defaults to 512.
        Returns:
            tuple: A tuple containing the following four lists
                list of preprocesssed token lists
                list of input id lists
                list of input mask lists
                list of token type id lists
        """
        if max_len > BERT_MAX_LEN:
            print("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
            max_len = BERT_MAX_LEN

        if isinstance(tokens[0][0], str):
            # single sentences: each example is a flat list of string tokens;
            # truncate to max_len - 2 to leave room for [CLS] and [SEP]
            tokens = [x[0 : max_len - 2] + ["[SEP]"] for x in tokens]
            token_type_ids = None
        else:
            # sentence pairs: each example is a pair of token lists
            # [[t00, t01, ...], [t10, t11, ...]]; truncate to max_len - 3
            # to leave room for [CLS] and the two [SEP] markers
            tokens = [
                self._truncate_seq_pair(sentence[0], sentence[1], max_len - 3)
                for sentence in tokens
            ]

            # construct token_type_ids: 0 for tokens of the first sentence,
            # 1 for tokens of the second, e.g.
            # [[0, 0, ..., 0, 1, 1, ..., 1], [0, 0, ..., 1, 1], ...]
            token_type_ids = [
                [[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens
            ]
            # merge sentences
            tokens = [[token for sentence in example for token in sentence] for example in tokens]
            # prefix with [0] for [CLS]
            token_type_ids = [
                [0] + [i for sentence in example for i in sentence] for example in token_type_ids
            ]
            # pad sequence
            token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]
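            # illustrative example (assumes _truncate_seq_pair appends [SEP]
            # to each sentence): sentence lengths 3 and 2 with max_len=8 give
            # [0] + [0, 0, 0] + [1, 1] -> [0, 0, 0, 0, 1, 1]
            # -> padded with 0s to [0, 0, 0, 0, 1, 1, 0, 0]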

        tokens = [["[CLS]"] + x for x in tokens]
        # convert tokens to indices
        input_ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokens]
        # pad sequence
        input_ids = [x + [0] * (max_len - len(x)) for x in input_ids]
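        # e.g. ["[CLS]", "paris", "[SEP]"] with max_len=5 maps to
        # [101, <id("paris")>, 102, 0, 0] in the standard BERT vocabulary,
        # where 101/102 are [CLS]/[SEP] and 0 is [PAD]
        # (<id("paris")> is an illustrative placeholder)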
        # create input mask: 1 for real tokens, 0 for padding
        # (the [PAD] token has id 0, so min(1, id) zeroes out padding only)
        input_mask = [[min(1, x) for x in y] for y in input_ids]
        return tokens, input_ids, input_mask, token_type_ids
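
Usage, as a minimal sketch: the method references self.tokenizer and
self._truncate_seq_pair, so it lives on a tokenizer wrapper class, presumably
this file's Tokenizer. The Tokenizer() construction, the tokenize call
signature, and max_len=64 below are illustrative assumptions, not confirmed
API.

    from utils_nlp.models.bert.common import Tokenizer

    tokenizer = Tokenizer()  # assumed: default English BERT vocabulary

    docs = ["The quick brown fox.", "It jumps over the lazy dog."]
    tokens = tokenizer.tokenize(docs)  # assumed: one token list per document

    tokens, input_ids, input_mask, token_type_ids = tokenizer.preprocess_encoder_tokens(
        tokens, max_len=64
    )

    # input_ids and input_mask are padded to max_len; token_type_ids is None
    # for single sentences, since there is no second segment
    assert all(len(x) == 64 for x in input_ids)
    assert all(len(x) == 64 for x in input_mask)
    assert token_type_ids is None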