in tokenizers/src/models/unigram/trainer.rs [196:276]
/// Builds the initial ("seed") list of sentence pieces from the training sentences.
///
/// Every non-empty sentence is appended to one flat string, each followed by a `\0`
/// boundary, and that string is handed to `esaxx_rs`, which yields `(substring, freq)`
/// pairs. The returned vector contains:
///
/// 1. every character seen in `sentences` (except `\0`), ordered by decreasing
///    accumulated count, where each occurrence adds the sentence's second tuple field
///    (presumably the sentence frequency -- confirm against `Sentence`);
/// 2. the multi-character substrings that contain no boundary and pass
///    `is_valid_sentencepiece`, ordered by decreasing `freq * length`.
///
/// Characters are always all included; substrings stop being appended as soon as the
/// vector holds at least `self.seed_size` entries. The scores are finally rewritten by
/// `to_log_prob`.
///
/// `_progress` is accepted for interface compatibility but is not used here.
///
/// # Panics
///
/// Panics if the `esaxx_rs` suffix computation returns an error, or if a retained
/// substring unexpectedly fails `is_valid_sentencepiece` on the second check.
fn make_seed_sentence_pieces(
    &self,
    sentences: &[Sentence],
    _progress: &Option<ProgressBar>,
) -> Vec<SentencePiece> {
    let c_sentence_boundary = '\0';

    // Put all sentences in a string, separated by \0.
    // `String` capacity is measured in bytes, so reserve the UTF-8 byte length of each
    // sentence plus one byte per `\0` separator (counting chars would under-reserve for
    // any non-ASCII text).
    let total: usize = sentences.iter().map(|(s, _)| s.len()).sum::<usize>() + sentences.len();
    let mut flat_string = String::with_capacity(total);
    let mut all_chars: AHashMap<char, u32> = AHashMap::new();
    for (string, n) in sentences {
        if string.is_empty() {
            continue;
        }
        flat_string.push_str(string);
        // XXX
        // Comment suggests we add sentence boundary, but it seems to be missing from actual
        // code in spm.
        flat_string.push(c_sentence_boundary);
        // The boundary character itself must never become a seed piece.
        for c in string.chars().filter(|&c| c != c_sentence_boundary) {
            *all_chars.entry(c).or_insert(0) += n;
        }
    }
    // Empty sentences were skipped, so the reservation may be slightly too large.
    flat_string.shrink_to_fit();

    #[cfg(feature = "esaxx_fast")]
    let suffix = esaxx_rs::suffix(&flat_string).unwrap();
    #[cfg(not(feature = "esaxx_fast"))]
    let suffix = esaxx_rs::suffix_rs(&flat_string).unwrap();

    // Basic chars need to be in sentence pieces.
    let mut seed_sentencepieces: Vec<SentencePiece> = Vec::new();

    // Store as (count, char) so that sorting orders by count first, then by char.
    let mut sall_chars: Vec<_> = all_chars.into_iter().map(|(a, b)| (b, a)).collect();
    // Reversed order: highest count first.
    sall_chars.sort_by_key(|&a| Reverse(a));

    let mut substr_index: Vec<_> = suffix
        .iter()
        .filter_map(|(string, freq)| {
            // Single characters are already covered by `sall_chars`.
            if string.len() <= 1 {
                return None;
            }
            // A piece must not span two sentences.
            if string.contains(&c_sentence_boundary) {
                return None;
            }
            if !self.is_valid_sentencepiece(string) {
                return None;
            }
            let score = freq * string.len() as u32;
            Some((score, string))
        })
        .collect();

    // Fill seed_sentencepieces with every character first.
    for (count, character) in sall_chars {
        seed_sentencepieces.push((character.to_string(), count.into()));
    }

    // Sort by decreasing score, then append substrings until the seed size is reached.
    substr_index.sort_by_key(|&a| Reverse(a));
    for (score, char_string) in substr_index {
        // Just in case
        assert!(self.is_valid_sentencepiece(char_string));
        let string: String = char_string.iter().collect();
        seed_sentencepieces.push((string, score.into()));
        if seed_sentencepieces.len() >= self.seed_size {
            break;
        }
    }
    to_log_prob(&mut seed_sentencepieces);
    seed_sentencepieces
}