in tokenizers/src/models/unigram/trainer.rs [735:792]
fn test_unk_token() {
// 1. Should add `unk_token` as first special token
let trainer = UnigramTrainerBuilder::default()
.show_progress(false)
.special_tokens(vec![
AddedToken::from("[SEP]", true),
AddedToken::from("[CLS]", true),
])
.unk_token(Some("[UNK]".into()))
.build()
.unwrap();
let mut unigram = Unigram::default();
trainer
.do_train(vec![("The".into(), 12), ("are".into(), 11)], &mut unigram)
.unwrap();
let mut pieces = unigram.iter();
assert_eq!(pieces.next(), Some(&("[UNK]".into(), 0.0)));
assert_eq!(pieces.next(), Some(&("[SEP]".into(), 0.0)));
assert_eq!(pieces.next(), Some(&("[CLS]".into(), 0.0)));
// 2. Let it where it is
let trainer = UnigramTrainerBuilder::default()
.show_progress(false)
.special_tokens(vec![
AddedToken::from("[SEP]", true),
AddedToken::from("[CLS]", true),
AddedToken::from("[UNK]", true),
])
.unk_token(Some("[UNK]".into()))
.build()
.unwrap();
let mut unigram = Unigram::default();
trainer
.do_train(vec![("The".into(), 12), ("are".into(), 11)], &mut unigram)
.unwrap();
let mut pieces = unigram.iter();
assert_eq!(pieces.next(), Some(&("[SEP]".into(), 0.0)));
assert_eq!(pieces.next(), Some(&("[CLS]".into(), 0.0)));
assert_eq!(pieces.next(), Some(&("[UNK]".into(), 0.0)));
// 3. Don't put it there if not needed
let trainer = UnigramTrainerBuilder::default()
.show_progress(false)
.build()
.unwrap();
let mut unigram = Unigram::default();
trainer
.do_train(vec![("The".into(), 12), ("are".into(), 11)], &mut unigram)
.unwrap();
let mut pieces = unigram.iter();
assert_eq!(pieces.next().unwrap().0, "e".to_string());
}