utils_nlp/models/bert/common.py [122:150]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if max_len > BERT_MAX_LEN:
            print("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
            max_len = BERT_MAX_LEN

        if isinstance(tokens[0][0], str):
            # single sentences: truncate to leave room for [CLS] and [SEP]
            tokens = [x[0 : max_len - 2] + ["[SEP]"] for x in tokens]
            token_type_ids = None
        else:
            # truncate each sentence pair to the max_len - 3 budget and
            # append [SEP]: [[t00, t01, ...], [t10, t11, ...]]
            tokens = [
                self._truncate_seq_pair(sentence[0], sentence[1], max_len - 3)
                for sentence in tokens
            ]

            # construct token_type_ids, one segment id per token:
            # [[0, 0, 0, ..., 0, 1, 1, ..., 1], [0, 0, ..., 1, 1]]
            token_type_ids = [
                [[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens
            ]
            # merge sentences
            tokens = [[token for sentence in example for token in sentence] for example in tokens]
            # prefix with [0] for [CLS]
            token_type_ids = [
                [0] + [i for sentence in example for i in sentence] for example in token_type_ids
            ]
            # pad each sequence with 0 up to max_len
            token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]

        tokens = [["[CLS]"] + x for x in tokens]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
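The excerpt calls self._truncate_seq_pair, which is defined elsewhere in the file. Judging from the max_len - 3 budget (one [CLS] plus two [SEP] markers) and the fact that no [SEP] is appended to the pair afterwards, the helper presumably trims the longer sentence first and then appends [SEP] to each sentence. A minimal standalone sketch under that assumption (in the file it is a method on the class):

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    # Trim tokens from the longer sentence until the pair fits the budget.
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
    # Mark the end of each sentence, as the caller expects.
    tokens_a.append("[SEP]")
    tokens_b.append("[SEP]")
    return [tokens_a, tokens_b]

print(_truncate_seq_pair(["t1", "t2", "t3"], ["u1", "u2"], 4))
# [['t1', 't2', '[SEP]'], ['u1', 'u2', '[SEP]']]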



utils_nlp/models/bert/common.py [179:207]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if max_len > BERT_MAX_LEN:
            print("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
            max_len = BERT_MAX_LEN

        if isinstance(tokens[0][0], str):
            # single sentences: truncate to leave room for [CLS] and [SEP]
            tokens = [x[0 : max_len - 2] + ["[SEP]"] for x in tokens]
            token_type_ids = None
        else:
            # truncate each sentence pair to the max_len - 3 budget and
            # append [SEP]: [[t00, t01, ...], [t10, t11, ...]]
            tokens = [
                self._truncate_seq_pair(sentence[0], sentence[1], max_len - 3)
                for sentence in tokens
            ]

            # construct token_type_ids, one segment id per token:
            # [[0, 0, 0, ..., 0, 1, 1, ..., 1], [0, 0, ..., 1, 1]]
            token_type_ids = [
                [[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens
            ]
            # merge sentences
            tokens = [[token for sentence in example for token in sentence] for example in tokens]
            # prefix with [0] for [CLS]
            token_type_ids = [
                [0] + [i for sentence in example for i in sentence] for example in token_type_ids
            ]
            # pad each sequence with 0 up to max_len
            token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]

        tokens = [["[CLS]"] + x for x in tokens]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
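To make the list comprehensions in the sentence-pair branch concrete, here is a standalone walk-through on a toy one-example batch (token names are illustrative placeholders, not real WordPiece output):

max_len = 8
# shape after _truncate_seq_pair: one example, two sentences, [SEP] appended
tokens = [
    [["a1", "a2", "[SEP]"], ["b1", "[SEP]"]],
]

# per-sentence segment ids: [[[0, 0, 0], [1, 1]]]
token_type_ids = [
    [[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens
]
# merge the two sentences of each example into one token list
tokens = [[token for sentence in example for token in sentence] for example in tokens]
# prefix 0 for [CLS], flatten, then right-pad with 0 up to max_len
token_type_ids = [
    [0] + [i for sentence in example for i in sentence] for example in token_type_ids
]
token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]

tokens = [["[CLS]"] + x for x in tokens]
print(tokens)          # [['[CLS]', 'a1', 'a2', '[SEP]', 'b1', '[SEP]']]
print(token_type_ids)  # [[0, 0, 0, 0, 1, 1, 0, 0]]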



