vision/m4/utils/check_valid_tokenizer.py (10 lines of code) (raw):

def check_valid_tokenizer(tokenizer) -> bool:
    """Check that the special tokens were correctly added to the tokenizer and are not normalized.

    Only tokenizers whose class name contains "idefics" or "mistral" are
    checked; any other tokenizer passes trivially.

    Args:
        tokenizer: a tokenizer instance exposing ``get_vocab()`` and
            ``added_tokens_decoder`` (presumably a HF ``PreTrainedTokenizer`` —
            TODO confirm against callers).

    Returns:
        True when every check passes.

    Raises:
        AssertionError: if a required special token is missing from the vocab,
            or if any added token was registered with ``normalized=True``.
    """
    tok_class = type(tokenizer).__name__.lower()
    if ("idefics" in tok_class) or ("mistral" in tok_class):
        # Hoist the vocab lookup: get_vocab() can rebuild the whole mapping on
        # each call, so query it once instead of once per token.
        vocab = tokenizer.get_vocab()
        for special_token in ("<image>", "<fake_token_around_image>", "<s>", "</s>", "<unk>"):
            assert special_token in vocab
        # Every AddedToken must have normalized=False, otherwise the tokenizer
        # may normalize (and thus break) the special tokens.
        for added_token in tokenizer.added_tokens_decoder.values():
            assert not added_token.normalized
    # The original fell through and returned None despite the `-> bool`
    # annotation; return True explicitly so the signature is honored.
    return True