in tokenizers/src/tokenizer/added_vocabulary.rs [367:422]
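/// Scan `sentence` for added-token matches and return the resulting splits:
/// `(Some(token_id), offsets)` for each matched added token, `(None, offsets)`
/// for the plain text in between. Offsets are byte positions in `sentence`.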
fn find_matches(&self, sentence: &str, split_re: &MatchingSet) -> Vec<(Option<u32>, Offsets)> {
    if sentence.is_empty() {
        return vec![(None, (0, 0))];
    }
    let mut start_offset = 0;
    let mut splits = vec![];
    for mat in split_re.0.find_iter(sentence) {
        let mut start = mat.start();
        let mut stop = mat.end();
        let aho_id = mat.pattern();
        let id = split_re.1[aho_id];
        let added_token = &self.added_tokens_map_r.get(&id).unwrap();
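        // When `encode_special_tokens` is set, special tokens are skipped here so that
        // they are encoded like regular text instead of being split out as added tokens.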
        if self.encode_special_tokens && self.special_tokens_set.contains(&added_token.content)
        {
            continue;
        }
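        // A `single_word` token only matches when it is not glued to word characters
        // on either side (sentence boundaries count as word boundaries).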
        if added_token.single_word {
            let start_space = start == 0 || !ends_with_word(&sentence[..start]);
            let stop_space = stop == sentence.len() || !starts_with_word(&sentence[stop..]);
            if !stop_space || !start_space {
                // Discard the match: it is not a standalone single word
                continue;
            }
        }
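        // `lstrip` lets the token swallow the whitespace immediately before it.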
        if added_token.lstrip {
            // This will be less than (or equal to) `start` and a valid offset within the sentence
            let newstart = space_leftmost_at_end(&sentence[..start]);
            // The previous match could already have covered those spaces;
            // ignore them if they were already matched
            start = std::cmp::max(newstart, start_offset);
        }
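        // `rstrip` lets the token swallow the whitespace immediately after it.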
        if added_token.rstrip {
            // The slice starts at `stop`, so the length of leading whitespace it reports
            // must be added to the previous `stop` value
            stop += space_rightmost_at_start(&sentence[stop..]);
        }
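        // Push the plain text between the previous match and this one (if any),
        // followed by the matched added token itself.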
        if start_offset < start {
            splits.push((None, (start_offset, start)));
        }
        splits.push((Some(id), (start, stop)));
        start_offset = stop;
    }
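    // Whatever remains after the last match is emitted as plain text.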
    let total_byte_len = sentence.len();
    if start_offset != total_byte_len {
        splits.push((None, (start_offset, total_byte_len)));
    }
    splits
}