in tokenizers/src/models/unigram/model.rs [331:356]
/// Encodes `sentence` into string pieces by building a [`Lattice`] and
/// reading pieces back from it.
///
/// When `self.fuse_unk` is `false`, the result is simply `lattice.tokens()`.
/// When it is `true`, the nodes returned by `lattice.viterbi()` are walked in
/// order and every run of consecutive nodes whose `id` equals `self.unk_id`
/// is concatenated into a single output string; all other pieces are emitted
/// unchanged.
///
/// # Errors
///
/// Returns `UnigramError::MissingUnkId` if `self.fuse_unk` is set,
/// `self.unk_id` is `None`, and the walked path contains at least one node.
fn encode_unoptimized(&self, sentence: &str) -> Result<Vec<String>> {
    let mut lattice = Lattice::from(sentence, self.bos_id, self.eos_id);
    self.populate_nodes(&mut lattice);

    // Nothing to merge: hand back the lattice's own tokens.
    if !self.fuse_unk {
        return Ok(lattice.tokens());
    }

    let mut pieces = Vec::new();
    // Accumulates the text of the current run of unknown nodes.
    let mut pending_unk = String::new();

    for node_ref in lattice.viterbi().iter() {
        let node = node_ref.borrow();
        let piece = lattice.piece(&node);
        // Resolved per node on purpose: a missing unk id is only an error
        // once there is at least one node to classify.
        let unk_id = self.unk_id.ok_or(UnigramError::MissingUnkId)?;

        if node.id == unk_id {
            pending_unk.push_str(&piece);
            continue;
        }

        // A known piece ends the current unknown run (if any).
        if !pending_unk.is_empty() {
            pieces.push(std::mem::take(&mut pending_unk));
        }
        pieces.push(piece);
    }

    // Flush a trailing unknown run.
    if !pending_unk.is_empty() {
        pieces.push(pending_unk);
    }

    Ok(pieces)
}