in tokenizers/src/models/bpe/trainer.rs [271:322]
/// Computes the character alphabet from the word counts and registers every kept
/// character as a single-character token.
///
/// Each character found in the keys of `wc` is weighted by the summed counts of the
/// words containing it (once per occurrence). Characters from `self.initial_alphabet`
/// are given the maximum possible weight. If `self.limit_alphabet` is set and the
/// alphabet is larger than that limit, the lowest-weighted characters are dropped.
///
/// The surviving characters are appended to `id2w` in ascending code point order, and
/// `w2id` receives the matching token -> index entry. Tokens already present in `w2id`
/// are left untouched.
fn compute_alphabet(
    &self,
    wc: &AHashMap<CompactString, u64>,
    w2id: &mut AHashMap<CompactString, u32>,
    id2w: &mut Vec<CompactString>,
) {
    // Compute the alphabet from seen words
    let mut alphabet: AHashMap<char, usize> = AHashMap::new();
    for (word, count) in wc {
        // Clamp instead of truncating when `usize` is narrower than `u64`.
        let weight = (*count).min(usize::MAX as u64) as usize;
        for c in word.chars() {
            let freq = alphabet.entry(c).or_default();
            // Saturate so a huge corpus cannot overflow (panic in debug builds, or wrap
            // around and make a very frequent character look rare in release builds).
            *freq = freq.saturating_add(weight);
        }
    }

    // Also include anything from the provided initial alphabet, with the highest
    // possible weight so these characters are the last candidates for removal.
    for c in &self.initial_alphabet {
        alphabet.insert(*c, usize::MAX);
    }

    // Compute the number of chars to remove from the alphabet
    // If `limit_alphabet < initial_alphabet.len()`, some of these initial characters
    // will be removed
    let to_remove = self
        .limit_alphabet
        .map_or(0, |limit| alphabet.len().saturating_sub(limit));

    let mut kept: Vec<(char, usize)> = alphabet.iter().map(|(c, freq)| (*c, *freq)).collect();

    // Remove the unwanted chars: least frequent first. Ties are broken by code point so
    // the outcome does not depend on the hash map's iteration order.
    if to_remove > 0 {
        kept.sort_unstable_by_key(|&(c, freq)| (freq, c));
        kept.drain(..to_remove);
    }

    // Register the remaining characters (sorted by code point for determinism)
    kept.sort_unstable_by_key(|&(c, _)| c);
    for (c, _) in kept {
        // Encode into a stack buffer to build the token without a temporary `String`.
        let mut utf8 = [0u8; 4];
        let token = CompactString::from(&*c.encode_utf8(&mut utf8));
        if !w2id.contains_key(&token) {
            // Token ids are `u32`; the cast assumes the vocabulary stays below `u32::MAX`.
            let id = id2w.len() as u32;
            id2w.push(token.clone());
            w2id.insert(token, id);
        }
    }
}