in tokenizers/src/models/unigram/model.rs [556:608]
fn test_encode2() {
let sentencepieces = vec![
("<unk>".to_string(), 0.0),
("ab".to_string(), 0.0),
("cd".to_string(), -0.1),
("abc".to_string(), -0.2),
("a".to_string(), -0.3),
("b".to_string(), -0.4),
("c".to_string(), -0.5),
("ABC".to_string(), -0.5),
("abcdabcd".to_string(), 20.0), // User defined just max the scores.
("q".to_string(), 20.5),
("r".to_string(), 20.5),
("qr".to_string(), -0.5),
];
let mut model = Unigram::from(sentencepieces, Some(0), false).unwrap();
for is_optimized in &[true, false] {
model.set_optimized(*is_optimized);
println!("IsOptimized {is_optimized:?}");
assert_eq!(model.encode("abc").unwrap(), vec!["abc"]);
assert_eq!(model.encode("AB").unwrap(), vec!["AB"]);
model.set_fuse_unk(false);
assert_eq!(model.encode("AB").unwrap(), vec!["A", "B"]);
model.set_fuse_unk(true);
assert_eq!(model.encode("AB").unwrap(), vec!["AB"]);
assert_eq!(model.encode("abcd").unwrap(), vec!["ab", "cd"]);
assert_eq!(model.encode("abcc").unwrap(), vec!["abc", "c"]);
assert_eq!(
model.encode("xabcabaabcdd").unwrap(),
vec!["x", "abc", "ab", "a", "ab", "cd", "d"]
);
model.set_fuse_unk(false);
assert_eq!(
model.encode("xyz東京").unwrap(),
vec!["x", "y", "z", "東", "京"]
);
model.set_fuse_unk(true);
assert_eq!(model.encode("xyz東京").unwrap(), vec!["xyz東京"]);
// User encoded in original version
assert_eq!(model.encode("ABC").unwrap(), vec!["ABC"]);
assert_eq!(model.encode("abABCcd").unwrap(), vec!["ab", "ABC", "cd"]);
assert_eq!(
model.encode("ababcdabcdcd").unwrap(),
vec!["ab", "abcdabcd", "cd"]
);
assert_eq!(model.encode("abqrcd").unwrap(), vec!["ab", "q", "r", "cd"]);
}
}