in tokenizers/src/models/bpe/trainer.rs [325:373]
/// Converts every word of `wc` into a [`Word`] made of one id per character.
///
/// For each `(word, count)` entry of `wc`, the characters of `word` are visited
/// in order. A character that is not already a key of `w2id` is skipped.
/// Otherwise the token string is built from the character, with
/// `continuing_subword_prefix` prepended when it is not the first character of
/// the word and `end_of_word_suffix` appended when it is the last one (each
/// only if configured). A token string that is not yet in `w2id` is appended
/// to `id2w` and registered in `w2id` under its index in `id2w`.
///
/// If `p` holds a progress bar, it is advanced by one per processed word.
///
/// Returns `(words, counts)`, two vectors of length `wc.len()` where
/// `counts[i]` is the count stored in `wc` for the word that produced
/// `words[i]`. The order follows the iteration order of `wc`.
fn tokenize_words(
    &self,
    wc: &AHashMap<CompactString, u64>,
    w2id: &mut AHashMap<CompactString, u32>,
    id2w: &mut Vec<CompactString>,
    p: &Option<ProgressBar>,
) -> (Vec<Word>, Vec<u64>) {
    let mut words: Vec<Word> = Vec::with_capacity(wc.len());
    let mut counts: Vec<u64> = Vec::with_capacity(wc.len());

    for (word, count) in wc {
        let mut current_word = Word::new();
        counts.push(*count);

        for (is_first, is_last, c) in word.chars().with_first_and_last() {
            let mut token = c.to_string();

            // Only characters of the authorized alphabet (i.e. already known
            // to `w2id` in their bare form) contribute to the word.
            if !w2id.contains_key(&CompactString::from(&token)) {
                continue;
            }

            // Add the `continuing_subword_prefix` if relevant
            if !is_first {
                if let Some(prefix) = &self.continuing_subword_prefix {
                    token.insert_str(0, prefix);
                }
            }
            // Add the `end_of_word_suffix` if relevant
            if is_last {
                if let Some(suffix) = &self.end_of_word_suffix {
                    token.push_str(suffix);
                }
            }

            // Build the final key once and resolve (or assign) its id with a
            // single map lookup. A new token gets the index it is about to
            // occupy in `id2w`.
            // NOTE: `as u32` wraps silently if `id2w` ever holds more than
            // `u32::MAX` entries; this mirrors the id width used by `w2id`.
            let next_id = id2w.len() as u32;
            let id = *w2id
                .entry(CompactString::from(&token))
                .or_insert_with_key(|key| {
                    id2w.push(key.clone());
                    next_id
                });

            current_word.add(id, 1); // We do not care about the len here
        }

        words.push(current_word);
        if let Some(p) = p {
            p.inc(1);
        }
    }

    (words, counts)
}