in tokenizers/src/models/bpe/model.rs [669:734]
#[test]
fn test_tokenize_with_and_without_dropout() {
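    // Vocabulary: the single characters of "unrelated" plus every intermediate
    // merge result, up to the full word itself.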
    let vocab: Vocab = [
        ("u".into(), 0),
        ("n".into(), 1),
        ("r".into(), 2),
        ("e".into(), 3),
        ("l".into(), 4),
        ("a".into(), 5),
        ("t".into(), 6),
        ("d".into(), 7),
        ("re".into(), 8),
        ("at".into(), 9),
        ("ed".into(), 10),
        ("un".into(), 11),
        ("ated".into(), 12),
        ("rel".into(), 13),
        ("related".into(), 14),
        ("unrelated".into(), 15),
    ]
    .iter()
    .cloned()
    .collect();
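    // Merge rules in priority order: earlier entries have lower rank and are applied first.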
    let merges: Merges = vec![
        ("r".to_string(), "e".to_string()),
        ("a".to_string(), "t".to_string()),
        ("e".to_string(), "d".to_string()),
        ("u".to_string(), "n".to_string()),
        ("at".to_string(), "ed".to_string()),
        ("re".to_string(), "l".to_string()),
        ("rel".to_string(), "ated".to_string()),
        ("un".to_string(), "related".to_string()),
    ];
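    // Build the model directly from the vocab and merges; dropout is disabled (None) by default.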
    let mut bpe = BPE::new(vocab, merges);

    // With no dropout, all merges apply and the whole word is a single token:
    let tokens = bpe.tokenize("unrelated").unwrap();
    assert_eq!(tokens, vec![Token::new(15u32, "unrelated".into(), (0, 9))]);

    // With dropout = 0.0 (equivalent to no dropout), the result is identical:
    bpe.dropout = Some(0.0);
    let tokens = bpe.tokenize("unrelated").unwrap();
    assert_eq!(tokens, vec![Token::new(15u32, "unrelated".into(), (0, 9))]);
    // With dropout = 1.0, every merge is skipped, so no merges are performed
    // and each character becomes its own token.
    bpe.dropout = Some(1.0);
    let tokens = bpe.tokenize("unrelated").unwrap();
    assert_eq!(
        tokens,
        vec![
            Token::new(0u32, "u".into(), (0, 1)),
            Token::new(1u32, "n".into(), (1, 2)),
            Token::new(2u32, "r".into(), (2, 3)),
            Token::new(3u32, "e".into(), (3, 4)),
            Token::new(4u32, "l".into(), (4, 5)),
            Token::new(5u32, "a".into(), (5, 6)),
            Token::new(6u32, "t".into(), (6, 7)),
            Token::new(3u32, "e".into(), (7, 8)),
            Token::new(7u32, "d".into(), (8, 9)),
        ]
    );
    // With dropout strictly between 0 and 1, each merge is skipped with that probability,
    // so the segmentation is nondeterministic: anywhere from the fully merged single token
    // up to the 9 unmerged characters.
    bpe.dropout = Some(0.5);
    let tokens = bpe.tokenize("unrelated").unwrap();
    assert!(!tokens.is_empty() && tokens.len() <= 9);
}
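
// A minimal sketch of enabling BPE-dropout outside of tests. It assumes the
// builder exposes `vocab_and_merges` and `dropout` setters; the test above sets
// the public `dropout` field directly, which works as well.
#[allow(dead_code)]
fn build_bpe_with_dropout(vocab: Vocab, merges: Merges) -> BPE {
    BPE::builder()
        .vocab_and_merges(vocab, merges)
        // Skip each merge with probability 0.1 during tokenization, producing
        // different (but still valid) segmentations across calls.
        .dropout(0.1)
        .build()
        .expect("valid vocab/merges")
}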