in tokenizers/src/models/wordlevel/trainer.rs [134:181]
fn test_train() {
let word_counts: AHashMap<String, u64> = [
("the".into(), 25),
("roses".into(), 22),
("are".into(), 24),
("red".into(), 12),
("voilets".into(), 10),
("blue".into(), 16),
]
.iter()
.cloned()
.collect();
let mut trainer = WordLevelTrainer {
vocab_size: 5,
..Default::default()
};
let mut model = WordLevel::default();
trainer.do_train(&word_counts, &mut model).unwrap();
let expected_vocab: AHashMap<String, u32> = [
("the".into(), 0),
("are".into(), 1),
("roses".into(), 2),
("blue".into(), 3),
("red".into(), 4),
]
.iter()
.cloned()
.collect();
assert_eq!(model.vocab, expected_vocab);
// If we specify a min_frequency
trainer.min_frequency = 15;
let mut model = WordLevel::default();
trainer.do_train(&word_counts, &mut model).unwrap();
let expected_vocab: AHashMap<String, u32> = [
("the".into(), 0),
("are".into(), 1),
("roses".into(), 2),
("blue".into(), 3),
]
.iter()
.cloned()
.collect();
assert_eq!(model.vocab, expected_vocab);
}