in tokenizers/src/pre_tokenizers/metaspace.rs [123:147]
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
    pretokenized.split(|_, mut normalized| {
        // Replace every space with the replacement string (e.g. "▁").
        normalized.replace(' ', &self.str_rep)?;
        match self.prepend_scheme {
            PrependScheme::Always => {
                if !normalized.get().starts_with(self.replacement) {
                    normalized.prepend(&self.str_rep);
                }
            }
            PrependScheme::First => {
                // Only prepend when this split starts at the very beginning
                // of the original string.
                if !normalized.get().starts_with(self.replacement)
                    && normalized.offsets_original().0 == 0
                {
                    normalized.prepend(&self.str_rep);
                }
            }
            PrependScheme::Never => {}
        };
        if self.split {
            // Split on the replacement char, keeping each delimiter merged
            // with the piece that follows it.
            normalized.split(self.replacement, SplitDelimiterBehavior::MergedWithNext)
        } else {
            Ok(vec![normalized])
        }
    })
}
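
To make the control flow above concrete, here is a minimal, self-contained sketch that reproduces the same replace / prepend / split behaviour on a plain `&str`. The names `metaspace_sketch` and the locally redeclared `PrependScheme` are illustrative only; the real method operates on a `NormalizedString` inside `PreTokenizedString::split` and keeps offset tracking, which this sketch omits (in particular, the `offsets_original().0 == 0` check for `First` has no equivalent when working on a bare string).

```rust
// Illustrative sketch only, not the tokenizers crate API.
#[derive(Clone, Copy)]
enum PrependScheme {
    Always,
    First,
    Never,
}

/// Mimic the pre_tokenize logic on a raw &str: replace spaces with the
/// replacement character, optionally prepend it, then split so that each
/// delimiter stays merged with the piece that follows it.
fn metaspace_sketch(
    input: &str,
    replacement: char,
    scheme: PrependScheme,
    split: bool,
) -> Vec<String> {
    let mut s: String = input.replace(' ', &replacement.to_string());
    let prepend = match scheme {
        // The real `First` variant additionally requires the split to start
        // at offset 0 of the original string.
        PrependScheme::Always | PrependScheme::First => !s.starts_with(replacement),
        PrependScheme::Never => false,
    };
    if prepend {
        s.insert(0, replacement);
    }
    if !split {
        return vec![s];
    }
    // SplitDelimiterBehavior::MergedWithNext: every replacement char starts a
    // new piece and is kept at the front of that piece.
    let mut pieces = Vec::new();
    let mut current = String::new();
    for c in s.chars() {
        if c == replacement && !current.is_empty() {
            pieces.push(std::mem::take(&mut current));
        }
        current.push(c);
    }
    if !current.is_empty() {
        pieces.push(current);
    }
    pieces
}

fn main() {
    // Prints ["▁Hey", "▁my", "▁friend"]
    println!(
        "{:?}",
        metaspace_sketch("Hey my friend", '▁', PrependScheme::First, true)
    );
}
```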