in tokenizers/src/tokenizer/pattern.rs [89:122]
fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
    if inside.is_empty() {
        return Ok(vec![((0, 0), false)]);
    }

    let mut last_offset = 0;
    let mut last_seen = 0;

    // Each returned pair is a byte-offset span plus a flag: `true` for a char
    // that satisfies the predicate, `false` for the text in between.
    let mut matches = inside
        .char_indices()
        .flat_map(|(b, c)| {
            // Track the byte end of the last char visited.
            last_seen = b + c.len_utf8();
            if self(c) {
                let mut events = Vec::with_capacity(2);
                if last_offset < b {
                    // We need to emit what was before this match
                    events.push(((last_offset, b), false));
                }
                events.push(((b, b + c.len_utf8()), true));
                last_offset = b + c.len_utf8();
                events
            } else {
                vec![]
            }
        })
        .collect::<Vec<_>>();

    // Do not forget the last potential split
    if last_seen > last_offset {
        matches.push(((last_offset, last_seen), false));
    }

    Ok(matches)
}
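
To make the splitting behavior concrete, here is a minimal standalone sketch that reproduces the same logic outside the crate (the `Offsets` alias and the `find_matches_with` helper are illustrative names, not the library's public API) and checks the spans produced by a whitespace predicate:

// Standalone sketch, not the crate's API: same splitting logic with
// illustrative names `Offsets` and `find_matches_with`.
type Offsets = (usize, usize);

fn find_matches_with(pred: impl Fn(char) -> bool, inside: &str) -> Vec<(Offsets, bool)> {
    if inside.is_empty() {
        return vec![((0, 0), false)];
    }
    let mut last_offset = 0;
    let mut last_seen = 0;
    let mut matches: Vec<(Offsets, bool)> = inside
        .char_indices()
        .flat_map(|(b, c)| {
            last_seen = b + c.len_utf8();
            if pred(c) {
                let mut events = Vec::with_capacity(2);
                if last_offset < b {
                    events.push(((last_offset, b), false));
                }
                events.push(((b, b + c.len_utf8()), true));
                last_offset = b + c.len_utf8();
                events
            } else {
                vec![]
            }
        })
        .collect();
    if last_seen > last_offset {
        matches.push(((last_offset, last_seen), false));
    }
    matches
}

fn main() {
    // Splitting "the fox" on whitespace: the space at bytes 3..4 is a match,
    // while "the" and "fox" come back as non-matching spans.
    let splits = find_matches_with(|c: char| c.is_whitespace(), "the fox");
    assert_eq!(splits, vec![((0, 3), false), ((3, 4), true), ((4, 7), false)]);
}

The bool flag only records which spans matched the predicate; it is left to the downstream splitting code to decide how the delimiter spans are treated.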