in tokenizers/src/models/bpe/trainer.rs [375:415]
/// Tallies every adjacent symbol pair across `words`.
///
/// Returns two maps keyed by pair:
/// * the pair's accumulated count, where each occurrence inside word `i` adds `counts[i]`
///   (converted to `i32` with an `as` cast);
/// * the set of indices into `words` at which the pair occurs.
///
/// When `p` holds a progress bar, it is advanced by one for every processed word.
fn count_pairs(
    &self,
    words: &[Word],
    counts: &[u64],
    p: &Option<ProgressBar>,
) -> (AHashMap<Pair, i32>, AHashMap<Pair, AHashSet<usize>>) {
    words
        .maybe_par_iter()
        .enumerate()
        .map(|(word_idx, word)| {
            // Partial results for this single word; merged with the others in `reduce`.
            let mut local_counts: AHashMap<Pair, i32> = AHashMap::new();
            let mut local_positions: AHashMap<Pair, AHashSet<usize>> = AHashMap::new();

            let symbols = word.get_chars();
            symbols.windows(2).for_each(|adjacent| {
                let pair: Pair = (adjacent[0], adjacent[1]);
                // NOTE(review): `as i32` narrows the u64 count, so values above `i32::MAX`
                // wrap rather than fail — confirm counts stay within range.
                let weight = counts[word_idx] as i32;
                *local_counts.entry(pair).or_default() += weight;
                // Remember that this word contains the pair.
                local_positions.entry(pair).or_default().insert(word_idx);
            });

            if let Some(bar) = p {
                bar.inc(1);
            }

            (local_counts, local_positions)
        })
        .reduce(
            || (AHashMap::new(), AHashMap::new()),
            |(mut merged_counts, mut merged_positions), (word_counts, word_positions)| {
                // Sum the counts of pairs seen on both sides.
                word_counts.into_iter().for_each(|(pair, n)| {
                    *merged_counts.entry(pair).or_default() += n;
                });
                // Union the word-index sets of pairs seen on both sides.
                word_positions.into_iter().for_each(|(pair, indices)| {
                    merged_positions.entry(pair).or_default().extend(indices);
                });
                (merged_counts, merged_positions)
            },
        )
}