in tokenizers/src/tokenizer/added_vocabulary.rs [678:751]
#[test]
fn can_add_special_tokens() {
    let model = ModelMock::new(&[("test", 0), ("tost", 1)]);
    let mut vocab = AddedVocabulary::new();
    let normalizer: Option<&NormalizerWrapper> = None;
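    // The mock model already occupies ids 0 ("test") and 1 ("tost"), so
    // brand-new added tokens will be assigned ids starting at 2.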
    // Add tokens normally
    assert_eq!(
        vocab.add_special_tokens(
            &[AddedToken::from("added_token_1", true)],
            &model,
            normalizer
        ),
        1
    );
    assert_eq!(vocab.len(), 1);
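    // Duplicates within a single call are collapsed, which is why the
    // pair below reports only 1 addition.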
    // Does not add the same token multiple times
    assert_eq!(
        vocab.add_special_tokens(
            &[
                AddedToken::from("added_token_2", true),
                AddedToken::from("added_token_2", true)
            ],
            &model,
            normalizer
        ),
        1
    );
    assert_eq!(vocab.len(), 2);
    // Can add tokens already covered by the model
    assert_eq!(
        vocab.add_special_tokens(&[AddedToken::from("test", true)], &model, normalizer),
        1
    );
    assert_eq!(vocab.len(), 3); // New token was added
    assert!(vocab.is_special_token("test"));
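    // "test" was already in the model vocabulary, so it keeps the model's
    // id (0) instead of receiving a fresh one, as the decoder map shows.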
    assert_eq!(
        *vocab.get_added_tokens_decoder(),
        AHashMap::from([
            (0, AddedToken::from("test", true)),
            (2, AddedToken::from("added_token_1", true)),
            (3, AddedToken::from("added_token_2", true)),
        ])
    );
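    // Both the forward (content -> id) and reverse (id -> content) internal
    // maps should know about the model-covered token.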
    assert!(vocab.added_tokens_map.contains_key("test"));
    assert!(vocab.added_tokens_map_r.contains_key(&0));
    vocab.add_tokens(
        &[
            AddedToken::from("tost", true),
            AddedToken::from("another_two", false),
        ],
        &model,
        normalizer,
    );
    assert_eq!(vocab.len(), 5); // Both new tokens were added
    assert_eq!(vocab.get_vocab()["another_two"], 4); // New token was added, but its index is not the length of the vocab
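    // "tost" reused the model id 1, so "another_two" received id 4: ids 0-3
    // were already taken by "test", "tost", "added_token_1" and "added_token_2".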
    // Let's add an already added token again
    assert_eq!(
        vocab.add_special_tokens(&[AddedToken::from("another_two", true)], &model, normalizer),
        1
    );
    assert_eq!(vocab.len(), 5); // Token was already there
    assert_eq!(vocab.get_vocab()["another_two"], 4); // Token idx not changed
    // Just checking that we can mutate a token's fields from Rust
    let mut token: AddedToken = AddedToken::from("Hey", false);
    token.content = "hey".to_string();
    assert_eq!(token.content, "hey"); // Content was updated
    token.special = true;
    assert!(token.special); // The special flag is now set
}
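
// A minimal companion sketch (illustration only, not part of this test
// module): the same dedup and id-reuse behavior as seen from a downstream
// crate depending on `tokenizers`. The `<pad>` token and the empty default
// BPE model are assumptions made for the example; with an empty model
// vocabulary, the first added special token receives id 0.
use tokenizers::models::bpe::BPE;
use tokenizers::{AddedToken, Tokenizer};

fn main() {
    let mut tokenizer = Tokenizer::new(BPE::default());
    // Duplicates within one call are registered only once.
    let added = tokenizer.add_special_tokens(&[
        AddedToken::from("<pad>", true),
        AddedToken::from("<pad>", true),
    ]);
    assert_eq!(added, 1);
    assert_eq!(tokenizer.token_to_id("<pad>"), Some(0));
}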