modules/SwissArmyTransformer/sat/tokenization/glm/tokenization.py
def __init__(self, model_type_or_path, cache_dir=None, add_block_symbols=False, add_task_mask=False,
             add_decoder_mask=False, **kwargs):
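    """Wrap a pretrained GPT2Tokenizer with GLM command tokens.

    model_type_or_path -- name or path forwarded to GPT2Tokenizer.from_pretrained;
        names starting with 'roberta' reuse RoBERTa's built-in special-token ids.
    add_block_symbols  -- append <|startofpiece|>/<|endofpiece|> (plus, for GPT-2
        vocabularies, [CLS]/[MASK]/[SEP]/[UNK]) as new ids after the vocabulary.
    add_task_mask      -- additionally append [gMASK] and [sMASK].
    add_decoder_mask   -- additionally append [dBLOCK].
    """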
    text_tokenizer = GPT2Tokenizer.from_pretrained(model_type_or_path,
                                                   cache_dir=cache_dir)
    # disable max len warnings by increasing max len
    text_tokenizer.max_len = int(1e12)
    num_tokens = len(text_tokenizer.encoder)
    if model_type_or_path.startswith('roberta'):
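        # RoBERTa vocabularies already reserve ids for their special tokens,
        # so the command tokens below reuse those existing ids instead of
        # growing the vocabulary.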
        command_tokens = [
            CommandToken('pad', '<|endoftext|>', text_tokenizer.encoder['</s>']),
            CommandToken('eos', '<|endoftext|>', text_tokenizer.encoder['</s>']),
            CommandToken('sep', '[SEP]', text_tokenizer.encoder['<pad>']),
            CommandToken('ENC', '[CLS]', text_tokenizer.encoder['<s>']),
            CommandToken('MASK', '[MASK]', text_tokenizer.encoder['<mask>'], lstrip=True),
            CommandToken('unk', '[UNK]', text_tokenizer.encoder['<unk>'])
        ]
        if add_block_symbols:
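            # 'sop'/'eop' (start/end of piece) delimit generated spans in GLM's
            # blank-infilling setup; their ids are appended after the vocabulary.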
            command_tokens.extend([
                CommandToken('sop', '<|startofpiece|>', num_tokens),
                CommandToken('eop', '<|endofpiece|>', num_tokens + 1)
            ])
            num_tokens += 2
    else:
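        # GPT-2 vocabularies only define <|endoftext|>, so every other special
        # token must be appended as a brand-new id.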
        command_tokens = [
            CommandToken('pad', '<|endoftext|>', text_tokenizer.encoder['<|endoftext|>']),
            CommandToken('eos', '<|endoftext|>', text_tokenizer.encoder['<|endoftext|>'])
        ]
        if add_block_symbols:
            command_tokens.extend([
                CommandToken('sop', '<|startofpiece|>', num_tokens),
                CommandToken('eop', '<|endofpiece|>', num_tokens + 1),
                CommandToken('ENC', '[CLS]', num_tokens + 2),
                CommandToken('MASK', '[MASK]', num_tokens + 3, lstrip=True),
                CommandToken('sep', '[SEP]', num_tokens + 4),
                CommandToken('unk', '[UNK]', num_tokens + 5)
            ])
            num_tokens += 6
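    # Task masks shared by both vocabulary types: in GLM's multi-task
    # pretraining, [gMASK] marks long-span (generation-style) infilling and
    # [sMASK] marks sentence-level infilling.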
    if add_block_symbols:
        if add_task_mask:
            command_tokens.extend([
                CommandToken('gMASK', '[gMASK]', num_tokens, lstrip=True),
                CommandToken('sMASK', '[sMASK]', num_tokens + 1, lstrip=True)
            ])
            num_tokens += 2
        if add_decoder_mask:
            command_tokens.extend([
                CommandToken('dBLOCK', '[dBLOCK]', num_tokens)
            ])
            num_tokens += 1
    super().__init__(text_tokenizer, command_tokens=command_tokens)
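# Minimal usage sketch, assuming this __init__ belongs to the sat Tokenizer
# subclass for GLM (called GLMGPT2Tokenizer here for illustration) and that the
# base class exposes get_command() for command-token lookup, as elsewhere in
# sat -- treat both names as assumptions, not verified API:
#
#   tokenizer = GLMGPT2Tokenizer('gpt2', add_block_symbols=True, add_task_mask=True)
#   sop_id = tokenizer.get_command('sop').Id      # first id appended after the GPT-2 vocab
#   gmask_id = tokenizer.get_command('gMASK').Id  # present because add_task_mask=True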