in tokenizers/src/models/unigram/trainer.rs [113:186]
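/// Build the final `Unigram` from the trained pieces: make sure every required
/// character is present, resolve (or add) the unk token, trim the piece list to
/// `vocab_size`, and prepend the special tokens.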
fn finalize(&self, model: Unigram, required_chars: AHashSet<String>) -> Result<Unigram> {
    let mut min_score_penalty = 0.0;
    let min_score_penalty_delta = 0.0001;

    let mut pieces: Vec<(String, f64)> = vec![];
    let mut inserted: AHashSet<String> = AHashSet::new();

    // We don't want to include the <UNK> that was used to train
    inserted.insert("<UNK>".into());

    let existing_pieces: AHashMap<String, f64> = model.iter().cloned().collect();
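    // Every required character must end up in the final vocabulary: reuse its
    // trained score when the piece already exists, otherwise assign a score near
    // the model minimum, offset by a small growing penalty so the coverage
    // pieces do not all share the same score.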
    for c in required_chars {
        if let Some(t) = existing_pieces.get(&c) {
            inserted.insert(c.clone());
            pieces.push((c, *t));
        } else {
            let score = model.min_score + min_score_penalty;
            inserted.insert(c.clone());
            pieces.push((c, score));
            min_score_penalty += min_score_penalty_delta;
        }
    }
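    // Resolve the unk token: if it is already declared among the special tokens,
    // reuse its index; otherwise it still needs to be added (at id 0).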
    let (unk_id, need_add_unk) = if let Some(ref unk) = self.unk_token {
        let unk_id = self.special_tokens.iter().enumerate().find_map(|(i, t)| {
            if t.content == *unk {
                Some(i)
            } else {
                None
            }
        });
        match unk_id {
            Some(id) => (Some(id), false),
            None => (Some(0), true),
        }
    } else {
        (None, false)
    };
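    // Budget left for regular pieces once the special tokens (and the unk token,
    // if it has to be added) are accounted for.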
    let vocab_size_without_special_tokens = if need_add_unk {
        self.vocab_size as usize - self.special_tokens.len() - 1
    } else {
        self.vocab_size as usize - self.special_tokens.len()
    };
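    // Fill the remaining budget with the surviving trained pieces, skipping
    // anything already inserted and mapping NaN scores to 0.0.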
    for (token, score) in model.iter() {
        if inserted.contains::<str>(token) {
            continue;
        }
        inserted.insert(token.to_string());
        pieces.push((token.to_string(), if score.is_nan() { 0.0 } else { *score }));
        if pieces.len() == vocab_size_without_special_tokens {
            break;
        }
    }
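    // Order the pieces by descending score.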
    pieces.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
    // Prepend the special tokens (fixed score 0.0), adding the unk token first
    // when it was not already declared as a special token.
    let mut special_tokens = self
        .special_tokens
        .iter()
        .map(|t| (t.content.clone(), 0.0))
        .collect::<Vec<_>>();
    if need_add_unk {
        special_tokens.insert(0, (self.unk_token.clone().unwrap(), 0.0));
    }
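    // The special tokens go in front of the scored pieces; `unk_id` is an index
    // into this combined vocabulary.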
    Unigram::from(
        special_tokens.into_iter().chain(pieces).collect(),
        unk_id,
        model.byte_fallback(),
    )
}