fn finalize()

in tokenizers/src/models/unigram/trainer.rs [113:186]


    fn finalize(&self, model: Unigram, required_chars: AHashSet<String>) -> Result<Unigram> {
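        // Required characters missing from the trained model get a score derived from
        // the model's minimum; the penalty shifts slightly per character so their
        // scores stay distinct.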
        let mut min_score_penalty = 0.0;
        let min_score_penalty_delta = 0.0001;

        let mut pieces: Vec<(String, f64)> = vec![];
        let mut inserted: AHashSet<String> = AHashSet::new();

        // We don't want to include the <UNK> that was used to train
        inserted.insert("<UNK>".into());

        let existing_pieces: AHashMap<String, f64> = model.iter().cloned().collect();
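        // Every required character must end up in the vocabulary: reuse its trained
        // score when the model already has it, otherwise assign one based on the
        // model's minimum score.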
        for c in required_chars {
            if let Some(t) = existing_pieces.get(&c) {
                inserted.insert(c.clone());
                pieces.push((c, *t));
            } else {
                let score = model.min_score + min_score_penalty;

                inserted.insert(c.clone());
                pieces.push((c, score));
                min_score_penalty += min_score_penalty_delta;
            }
        }

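        // Resolve the unk token's id: if it is already one of the special tokens,
        // reuse its index; otherwise it will be inserted at index 0 below.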
        let (unk_id, need_add_unk) = if let Some(ref unk) = self.unk_token {
            let unk_id = self.special_tokens.iter().enumerate().find_map(|(i, t)| {
                if t.content == *unk {
                    Some(i)
                } else {
                    None
                }
            });
            match unk_id {
                Some(id) => (Some(id), false),
                None => (Some(0), true),
            }
        } else {
            (None, false)
        };

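        // Reserve room for the special tokens (plus the unk token if it still has to
        // be added) when counting how many trained pieces can be kept.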
        let vocab_size_without_special_tokens = if need_add_unk {
            self.vocab_size as usize - self.special_tokens.len() - 1
        } else {
            self.vocab_size as usize - self.special_tokens.len()
        };
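        // Fill the remaining slots with the trained pieces, skipping anything that
        // was already inserted above.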
        for (token, score) in model.iter() {
            if inserted.contains::<str>(token) {
                continue;
            }
            inserted.insert(token.to_string());
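            // A NaN score could make the sort below panic, so it is replaced with 0.0.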
            pieces.push((token.to_string(), if score.is_nan() { 0.0 } else { *score }));

            if pieces.len() == vocab_size_without_special_tokens {
                break;
            }
        }
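        // Order the pieces by descending score.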
        pieces.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());

        // Prepend the special tokens (all with score 0.0), adding the unk token in
        // front if it was not already among them.
        let mut special_tokens = self
            .special_tokens
            .iter()
            .map(|t| (t.content.clone(), 0.0))
            .collect::<Vec<_>>();
        if need_add_unk {
            special_tokens.insert(0, (self.unk_token.clone().unwrap(), 0.0));
        }

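        // Special tokens come first so `unk_id` indexes correctly into the final
        // vocabulary, followed by the scored pieces.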
        Unigram::from(
            special_tokens.into_iter().chain(pieces).collect(),
            unk_id,
            model.byte_fallback(),
        )
    }