def __init__()

in modules/SwissArmyTransformer/sat/tokenization/glm/tokenization.py


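For reference, `CommandToken` is defined elsewhere in this file. A minimal sketch of the record it appears to be, with field names inferred from the calls below (treat this as an assumption, not the file's actual definition):

    class CommandToken:
        def __init__(self, name, token, Id, lstrip=False, rstrip=False):
            self.name = name      # logical name, e.g. 'eos' or 'MASK'
            self.token = token    # surface form, e.g. '<|endoftext|>' or '[MASK]'
            self.Id = Id          # fixed id in the extended vocabulary
            self.lstrip = lstrip  # presumably: absorb whitespace to the left
            self.rstrip = rstrip
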
    def __init__(self, model_type_or_path, cache_dir=None, add_block_symbols=False, add_task_mask=False,
                 add_decoder_mask=False, **kwargs):
        # Load the underlying Hugging Face byte-level BPE tokenizer.
        text_tokenizer = GPT2Tokenizer.from_pretrained(model_type_or_path,
                                                       cache_dir=cache_dir)

        # Disable max-length warnings by raising the limit beyond any realistic input.
        text_tokenizer.max_len = int(1e12)
        # New command tokens are appended after the existing BPE vocabulary,
        # starting at id num_tokens.
        num_tokens = len(text_tokenizer.encoder)
        if model_type_or_path.startswith('roberta'):
            # RoBERTa checkpoints already ship the needed special tokens, so
            # GLM's logical names are mapped onto existing vocabulary ids.
            # Note that 'pad' and 'eos' both reuse </s>, while 'sep' reuses
            # the id of <pad>.
            command_tokens = [
                CommandToken('pad', '<|endoftext|>', text_tokenizer.encoder['</s>']),
                CommandToken('eos', '<|endoftext|>', text_tokenizer.encoder['</s>']),
                CommandToken('sep', '[SEP]', text_tokenizer.encoder['<pad>']),
                CommandToken('ENC', '[CLS]', text_tokenizer.encoder['<s>']),
                CommandToken('MASK', '[MASK]', text_tokenizer.encoder['<mask>'], lstrip=True),
                CommandToken('unk', '[UNK]', text_tokenizer.encoder['<unk>'])
            ]
            if add_block_symbols:
                # sop/eop delimit generated spans ("pieces") during blank
                # infilling and get fresh ids appended to the vocabulary.
                command_tokens.extend([
                    CommandToken('sop', '<|startofpiece|>', num_tokens),
                    CommandToken('eop', '<|endofpiece|>', num_tokens + 1)
                ])
                num_tokens += 2
        else:
            # Plain GPT-2 vocabularies only provide <|endoftext|>, which
            # serves as both pad and eos; every other special token must be
            # appended as a new id.
            command_tokens = [
                CommandToken('pad', '<|endoftext|>', text_tokenizer.encoder['<|endoftext|>']),
                CommandToken('eos', '<|endoftext|>', text_tokenizer.encoder['<|endoftext|>'])
            ]
            if add_block_symbols:
                command_tokens.extend([
                    CommandToken('sop', '<|startofpiece|>', num_tokens),
                    CommandToken('eop', '<|endofpiece|>', num_tokens + 1),
                    CommandToken('ENC', '[CLS]', num_tokens + 2),
                    CommandToken('MASK', '[MASK]', num_tokens + 3, lstrip=True),
                    CommandToken('sep', '[SEP]', num_tokens + 4),
                    CommandToken('unk', '[UNK]', num_tokens + 5)
                ])
                num_tokens += 6
        if add_block_symbols:
            if add_task_mask:
                # Task-specific masks: gMASK for long-text generation,
                # sMASK for sentence-level infilling.
                command_tokens.extend([
                    CommandToken('gMASK', '[gMASK]', num_tokens, lstrip=True),
                    CommandToken('sMASK', '[sMASK]', num_tokens + 1, lstrip=True)
                ])
                num_tokens += 2
            if add_decoder_mask:
                # Extra block token used when decoder masking is enabled.
                command_tokens.extend([
                    CommandToken('dBLOCK', '[dBLOCK]', num_tokens)
                ])
                num_tokens += 1
        # Register the command tokens alongside the wrapped text tokenizer.
        super().__init__(text_tokenizer, command_tokens=command_tokens)
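
A minimal usage sketch (the enclosing class name `GPT2BPETokenizer` and the `'gpt2'` checkpoint id are assumptions, not shown in this excerpt):

    # Hypothetical usage; class name and checkpoint are assumptions.
    from sat.tokenization.glm.tokenization import GPT2BPETokenizer

    tok = GPT2BPETokenizer('gpt2', add_block_symbols=True, add_task_mask=True)
    # GPT-2's base vocabulary has 50257 entries, so the six block symbols land
    # on ids 50257-50262 and gMASK/sMASK on 50263-50264.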