in tokenizers/src/normalizers/precompiled.rs [34:68]
/// Applies this normalizer to `normalized`, rewriting it in place.
///
/// The current text is walked one grapheme at a time (as yielded by
/// `graphemes(true)`). For each grapheme:
///
/// 1. If it is shorter than 6 bytes, the whole grapheme is looked up with
///    `self.transform`; on a hit the replacement is recorded via `replace`
///    and the grapheme is done.
/// 2. Otherwise (too long, or no hit) every `char` of the grapheme is looked
///    up on its own. Hits are recorded via `replace`; misses are recorded as
///    `(c, 0)`.
///
/// `normalized.transform` is only invoked when at least one lookup hit, so
/// text with nothing to replace is left untouched. Always returns `Ok(())`.
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
    // Note to future readers, from @Narsil:
    // yes, this looks weird, and yes, it looks broken. No, I don't know why
    // Google did it this way. Before questioning this code, check this
    // normalizer against the XNLI database (all languages) with a Unigram
    // model against Mbart, XLMRoberta *AND* Marian. If you don't get 100%,
    // or if you break a single test, the change does not pass.
    let mut changed = false;
    let mut changes = Vec::with_capacity(normalized.get().len());

    for cluster in normalized.get().graphemes(true) {
        // Only short graphemes (< 6 bytes) are tried as a single unit.
        let whole = if cluster.len() < 6 {
            self.transform(cluster)
        } else {
            None
        };
        if let Some(norm) = whole {
            changed = true;
            replace(&mut changes, cluster, norm);
            continue;
        }

        // Fall back to looking up each char of the grapheme separately.
        for (start, ch) in cluster.char_indices() {
            let end = start + ch.len_utf8();
            let piece = &cluster[start..end];
            match self.transform(piece) {
                Some(norm) => {
                    changed = true;
                    replace(&mut changes, piece, norm);
                }
                None => changes.push((ch, 0)),
            }
        }
    }

    // Skip the rewrite entirely when no lookup produced a replacement.
    if changed {
        normalized.transform(changes, 0);
    }
    Ok(())
}