in tokenizers/src/models/unigram/trainer.rs [492:528]
fn run_m_step(&self, pieces: &[SentencePiece], expected: &[f64]) -> Vec<SentencePiece> {
assert_eq!(
    pieces.len(),
    expected.len(),
    "`pieces` and `expected` must be the same length"
);
let mut new_pieces: Vec<SentencePiece> =
Vec::with_capacity(self.vocab_size.try_into().unwrap());
// Running total of the expected frequencies of the kept pieces,
// used below to normalize the new scores.
let mut sum = 0.0;
// Pieces expected to occur less than half a time are pruned outright.
let expected_frequency_threshold = 0.5;
for (i, (freq, (piece, _score))) in expected.iter().zip(pieces).enumerate() {
// Always keep unk (index 0); NaN is a placeholder score,
// not a real log-probability.
if i == 0 {
    new_pieces.push((piece.clone(), f64::NAN));
    continue;
}
// Drop pieces whose expected frequency fell below the threshold.
if *freq < expected_frequency_threshold {
    continue;
}
new_pieces.push((piece.clone(), *freq));
sum += freq;
}
// Here we do not use the original EM, but use the
// Bayesianified/DPified EM algorithm.
// https://cs.stanford.edu/~pliang/papers/tutorial-acl2007-talk.pdf
// This modification will act as a sparse prior.
// Normalize in log space: digamma(c) - digamma(sum) stands in for
// ln(c / sum), but discounts low counts (the sparse prior noted above).
let logsum = digamma(sum);
let new_pieces: Vec<_> = new_pieces
    .into_iter()
    .map(|(s, c)| (s, digamma(c) - logsum))
    .collect();
new_pieces
}
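
The `digamma` helper used above is defined alongside the trainer and is not shown in this excerpt. As a point of reference, here is a minimal, self-contained sketch of the standard approximation (a recurrence step followed by an asymptotic series, the same scheme SentencePiece uses), plus a small demo of the sparse-prior effect; the `main` driver and the exact constants are illustrative, and the crate's own implementation may differ in detail.

// Sketch of the usual digamma approximation: shift the argument up with
// psi(x) = psi(x + 1) - 1/x until it is large enough, then apply an
// asymptotic series around ln(x - 1/2). Assumed for illustration; the
// crate ships its own `digamma`.
fn digamma(mut x: f64) -> f64 {
    let mut result = 0.0;
    while x < 7.0 {
        result -= 1.0 / x;
        x += 1.0;
    }
    x -= 0.5;
    let xx = 1.0 / x;
    let xx2 = xx * xx;
    let xx4 = xx2 * xx2;
    result += x.ln() + (1.0 / 24.0) * xx2
        - (7.0 / 960.0) * xx4
        + (31.0 / 8064.0) * xx4 * xx2
        - (127.0 / 30720.0) * xx4 * xx4;
    result
}

fn main() {
    // Compare the plain EM log-probability ln(c / sum) with the
    // Bayesianified update digamma(c) - digamma(sum). The smaller the
    // count, the harsher the extra penalty: the "sparse prior" at work.
    let sum = 1000.0;
    for &c in &[1.0, 10.0, 100.0] {
        let plain = (c / sum).ln();
        let bayes = digamma(c) - digamma(sum);
        println!("c = {c:5.1}  plain = {plain:8.4}  bayes = {bayes:8.4}");
    }
    // Prints roughly:
    // c =   1.0  plain =  -6.9078  bayes =  -7.4845
    // c =  10.0  plain =  -4.6052  bayes =  -4.6555
    // c = 100.0  plain =  -2.3026  bayes =  -2.3071
}

Combined with the 0.5 expected-frequency cutoff above, this discount on rare pieces is what shrinks the candidate vocabulary over successive EM rounds instead of leaving a long tail of tiny probabilities.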