fn can_add_special_tokens()

in tokenizers/src/tokenizer/added_vocabulary.rs [678:751]
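Unit test for AddedVocabulary: it verifies that special tokens are deduplicated, that tokens already covered by the model vocabulary reuse their model ids, and that AddedToken fields (content, special) can be mutated after construction.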


    #[test]
    fn can_add_special_tokens() {
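        // Mock model whose vocab already contains "test" (id 0) and "tost" (id 1)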
        let model = ModelMock::new(&[("test", 0), ("tost", 1)]);
        let mut vocab = AddedVocabulary::new();
        let normalizer: Option<&NormalizerWrapper> = None;
        // Add tokens normally
        assert_eq!(
            vocab.add_special_tokens(
                &[AddedToken::from("added_token_1", true)],
                &model,
                normalizer
            ),
            1
        );
        assert_eq!(vocab.len(), 1);

        // Does not add the same token multiple times
        assert_eq!(
            vocab.add_special_tokens(
                &[
                    AddedToken::from("added_token_2", true),
                    AddedToken::from("added_token_2", true)
                ],
                &model,
                normalizer
            ),
            1
        );
        assert_eq!(vocab.len(), 2);

        // Can add tokens already covered by the model
        assert_eq!(
            vocab.add_special_tokens(&[AddedToken::from("test", true)], &model, normalizer),
            1
        );
        assert_eq!(vocab.len(), 3); // New token was added
        assert!(vocab.is_special_token("test"));
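        // "test" kept its model id (0); the genuinely new tokens were assigned
        // ids after the model vocabulary (2 and 3)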
        assert_eq!(
            *vocab.get_added_tokens_decoder(),
            AHashMap::from([
                (0, AddedToken::from("test", true)),
                (2, AddedToken::from("added_token_1", true)),
                (3, AddedToken::from("added_token_2", true)),
            ])
        );
        assert!(vocab.added_tokens_map.contains_key("test"));
        assert!(vocab.added_tokens_map_r.contains_key(&0));

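        // add_tokens accepts special and non-special tokens alike; "tost" is
        // already in the model vocab, so it reuses id 1 rather than getting a new id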
        vocab.add_tokens(
            &[
                AddedToken::from("tost", true),
                AddedToken::from("another_two", false),
            ],
            &model,
            normalizer,
        );
        assert_eq!(vocab.len(), 5); // Both tokens were added
        assert_eq!(vocab.get_vocab()["another_two"], 4); // New token was added, but its id is not the length of the vocab

        // Let's add an already added token again
        assert_eq!(
            vocab.add_special_tokens(&[AddedToken::from("another_two", true)], &model, normalizer),
            1
        );
        assert_eq!(vocab.len(), 5); // Token was already there
        assert_eq!(vocab.get_vocab()["another_two"], 4); // Token idx not changed

        // Just checking that we can mutate an AddedToken's fields in Rust
        let mut token: AddedToken = AddedToken::from("Hey", false);
        token.content = "hey".to_string();
        assert_eq!(token.content, "hey"); // Content was updated

        token.special = true;
        assert!(token.special); // Special flag was updated
    }