in tokenizers/src/pre_tokenizers/byte_level.rs [120:147]
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
let re_ref: &SysRegex = &RE;
pretokenized.split(|_, mut normalized| {
if self.add_prefix_space && !normalized.get().starts_with(' ') {
normalized.prepend(" ");
}
if self.use_regex {
normalized.split(re_ref, SplitDelimiterBehavior::Isolated)
} else {
Ok(vec![normalized])
}
})?;
pretokenized.normalize(|normalized| {
let s = normalized.get();
let mut transformations: Vec<(char, isize)> = Vec::with_capacity(s.len());
for (i, cur_char) in s.char_indices() {
let size = cur_char.len_utf8();
transformations.extend(
s.as_bytes()[i..i + size]
.iter()
.enumerate()
.map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))),
);
}
normalized.transform(transformations, 0);
Ok(())
})
}