in modules/SwissArmyTransformer/sat/tokenization/glm/tokenization.py
def __init__(self, text_tokenizer, command_tokens=None):
    # set text tokenizer
    self.text_tokenizer = text_tokenizer
    if not hasattr(self, 'num_text_tokens'):
        self.num_text_tokens = len(self.text_tokenizer)
    # index command tokens by name, surface form, and id
    self._command_tokens = command_tokens
    self.command_name_map = {tok.name: tok for tok in self.command_tokens}
    self.command_token_map = {tok.token: tok for tok in self.command_tokens}
    self.command_id_map = {tok.Id: tok for tok in self.command_tokens}
    # parse tokens and vocabs from tokenizer; command ids may lie beyond the
    # text vocabulary, so pad the gap with [UNUSED*] placeholders
    max_token_id = max(len(self.text_tokenizer.tokens) - 1, max(self.command_id_map.keys()))
    self._tokens = [self.text_tokenizer.tokens[i] if i < len(self.text_tokenizer.tokens)
                    else f'[UNUSED{i}]' for i in range(max_token_id + 1)]
    for idx, tok in self.command_id_map.items():
        self._tokens[idx] = tok.token
    # merged vocab: command tokens first, then the text vocab layered on top
    self._vocab = {tok.token: Id for Id, tok in self.command_id_map.items()}
    self._vocab.update(self.text_tokenizer.vocab)
    if not hasattr(self, 'num_command_tokens'):
        self.num_command_tokens = len(self.command_tokens)
    if not hasattr(self, 'num_tokens'):
        self.num_tokens = len(self.tokens)
    # separate views of the text-only and command-only vocabularies
    self._text_tokens = list(self.text_tokenizer.tokens)
    self._text_token_vocab = {t: Id for t, Id in self.text_tokenizer.vocab.items()}
    self._command_token_tokens = list(self.command_token_map.keys())
    # map the command token *strings* to ids, mirroring _text_token_vocab
    # (keying on the CommandToken object itself would be inconsistent)
    self._command_token_vocab = {tok.token: Id for Id, tok in self.command_id_map.items()}
    self.spaces_between_special_tokens = True
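
# ---------------------------------------------------------------------------
# A minimal standalone sketch (not part of the file above) of the id-space
# merge performed in __init__: command-token ids may sit past the end of the
# text vocabulary, and the gap is padded with [UNUSED*] placeholders.
# CommandToken here is a hypothetical stand-in with the same
# .name/.token/.Id fields assumed by the constructor.
# ---------------------------------------------------------------------------
from collections import namedtuple

CommandToken = namedtuple('CommandToken', ['name', 'token', 'Id'])

text_tokens = ['a', 'b', 'c']                   # text ids 0..2
commands = [CommandToken('eos', '<eos>', 5)]    # command id beyond the text range

# same padding logic as above: ids 3 and 4 become placeholders
max_token_id = max(len(text_tokens) - 1, max(c.Id for c in commands))
tokens = [text_tokens[i] if i < len(text_tokens) else f'[UNUSED{i}]'
          for i in range(max_token_id + 1)]
for c in commands:
    tokens[c.Id] = c.token

print(tokens)  # ['a', 'b', 'c', '[UNUSED3]', '[UNUSED4]', '<eos>']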