fn find_matches()

in tokenizers/src/tokenizer/added_vocabulary.rs [367:422]

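Splits `sentence` against the Aho-Corasick automaton carried by `split_re`, returning a covering of the whole input as `(token id, byte offsets)` pairs: each occurrence of an added token is emitted as `Some(id)`, and the unmatched text between and around occurrences as `None`. The matched token's `single_word`, `lstrip` and `rstrip` options can veto a match or widen it over neighbouring whitespace before it is recorded.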

    fn find_matches(&self, sentence: &str, split_re: &MatchingSet) -> Vec<(Option<u32>, Offsets)> {
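        // An empty sentence still yields a single empty, unmatched span.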
        if sentence.is_empty() {
            return vec![(None, (0, 0))];
        }

        let mut start_offset = 0;
        let mut splits = vec![];

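        // Walk the non-overlapping Aho-Corasick matches, mapping each
        // pattern index back to its added-token id.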
        for mat in split_re.0.find_iter(sentence) {
            let mut start = mat.start();
            let mut stop = mat.end();
            let aho_id = mat.pattern();
            let id = split_re.1[aho_id];
            let added_token = self.added_tokens_map_r.get(&id).unwrap();

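            // When `encode_special_tokens` is on, special tokens are not
            // extracted here and pass through as ordinary text.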
            if self.encode_special_tokens && self.special_tokens_set.contains(&added_token.content)
            {
                continue;
            }

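            // A `single_word` token only counts when bounded by non-word
            // characters (or the edges of the sentence).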
            if added_token.single_word {
                let start_space = start == 0 || !ends_with_word(&sentence[..start]);
                let stop_space = stop == sentence.len() || !starts_with_word(&sentence[stop..]);

                if !stop_space || !start_space {
                    // Discard the match: it is not a standalone word
                    continue;
                }
            }
            if added_token.lstrip {
                // This position is at most `start` and a valid sentence offset
                let newstart = space_leftmost_at_end(&sentence[..start]);

                // A previous match may already have consumed those spaces;
                // don't claim them twice
                start = std::cmp::max(newstart, start_offset);
            }
            if added_token.rstrip {
                // `space_rightmost_at_start` counts spaces from the `stop`
                // character onward, so its result is added onto `stop`
                stop += space_rightmost_at_start(&sentence[stop..]);
            }
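            // Record any unmatched text preceding this match, then the match itself.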
            if start_offset < start {
                splits.push((None, (start_offset, start)));
            }
            splits.push((Some(id), (start, stop)));
            start_offset = stop;
        }

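        // Emit any trailing unmatched text after the last match.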
        let total_byte_len = sentence.len();
        if start_offset != total_byte_len {
            splits.push((None, (start_offset, total_byte_len)));
        }

        splits
    }
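
Stripped of the `single_word`/`lstrip`/`rstrip` and special-token handling, the loop is the usual "matches plus gaps" covering of the input. Below is a minimal, self-contained sketch of just that part, assuming the aho-corasick 1.x crate; `split_on_added_tokens` and its `(&str, u32)` pattern list are hypothetical stand-ins for `MatchingSet` and the vocabulary state on `self`.

    use aho_corasick::{AhoCorasick, MatchKind};

    type Offsets = (usize, usize);

    /// Cover `sentence` with (token id, offsets) spans: `Some(id)` for every
    /// occurrence of an added token, `None` for the text in between.
    fn split_on_added_tokens(
        sentence: &str,
        tokens: &[(&str, u32)],
    ) -> Vec<(Option<u32>, Offsets)> {
        let patterns: Vec<&str> = tokens.iter().map(|(s, _)| *s).collect();
        let ids: Vec<u32> = tokens.iter().map(|(_, id)| *id).collect();
        // Leftmost-longest matching so that when one token's content is a
        // prefix of another's, the longer one wins.
        let ac = AhoCorasick::builder()
            .match_kind(MatchKind::LeftmostLongest)
            .build(&patterns)
            .unwrap();

        let mut start_offset = 0;
        let mut splits = vec![];
        for mat in ac.find_iter(sentence) {
            let id = ids[mat.pattern().as_usize()];
            // Unmatched text before this match, then the match itself.
            if start_offset < mat.start() {
                splits.push((None, (start_offset, mat.start())));
            }
            splits.push((Some(id), (mat.start(), mat.end())));
            start_offset = mat.end();
        }
        // Trailing unmatched text, if any.
        if start_offset != sentence.len() {
            splits.push((None, (start_offset, sentence.len())));
        }
        splits
    }

    fn main() {
        // "<s>" plays the role of an added token with id 0.
        let splits = split_on_added_tokens("<s>Hello world", &[("<s>", 0)]);
        assert_eq!(splits, vec![(Some(0), (0, 3)), (None, (3, 14))]);
    }

Unlike the method above, this sketch omits the empty-sentence special case and all of the per-token options; it only illustrates how the match offsets are turned into a full covering of the input.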