in tokenizers/src/tokenizer/added_vocabulary.rs [941:1000]
fn test_encode_special_tokens() {
let model = ModelMock::new(&[]);
let mut vocab = AddedVocabulary::new();
let normalizer = Lowercase;
vocab.add_tokens(
&[
AddedToken::from("<mask>", true)
.lstrip(true)
.rstrip(true)
.single_word(true),
AddedToken::from("ask>", false),
AddedToken::from("<pad>", true),
],
&model,
Some(&normalizer),
);
vocab.set_encode_special_tokens(true);
let result = vocab.extract_and_normalize(
Some(&normalizer),
"Hi <mask> there\t<mask>\t<mask>\u{2000} <pad> <mask><pad><pad>",
);
assert_eq!(
simplify_output(&result),
vec![
("hi <m", None),
("ask>", Some(vec![1])),
(" there\t<m", None),
("ask>", Some(vec![1])),
("\t<m", None),
("ask>", Some(vec![1])),
("\u{2000} <pad> <m", None),
("ask>", Some(vec![1])),
("<pad><pad>", None)
]
);
vocab.set_encode_special_tokens(false);
let result = vocab.extract_and_normalize(
Some(&normalizer),
"Hi <mask> there\t<mask>\t<mask>\u{2000} <pad> <mask><pad><pad>",
);
assert_eq!(
simplify_output(&result),
vec![
("hi", None),
(" <mask> ", Some(vec![0])),
("there", None),
("\t<mask>\t", Some(vec![0])),
("<mask>\u{2000} ", Some(vec![0])),
("<pad>", Some(vec![2])),
(" <mask>", Some(vec![0])),
("<pad>", Some(vec![2])),
("<pad>", Some(vec![2]))
]
);
}