fn tokenize()

in tokenizers/src/models/unigram/model.rs [419:453]

Converts the piece strings produced by the model's `encode` into `Token`s with byte-based offsets. Pieces missing from the vocabulary are either decomposed into `<0xNN>` byte tokens (when `byte_fallback` is enabled and every byte token exists in the vocabulary) or mapped to the unknown id; if no `unk_id` is configured, tokenization fails with `UnigramError::MissingUnkId`.

    fn tokenize(&self, sentence: &str) -> Result<Vec<Token>> {
        // Viterbi-segment the sentence into the most likely piece strings.
        let str_tokens = self.encode(sentence)?;
        let mut offset = 0;
        let mut tokens = Vec::with_capacity(str_tokens.len());
        for string in str_tokens {
            // Offsets are byte-based: `len` is the piece's UTF-8 byte length.
            let len = string.len();
            let offsets = (offset, offset + len);
            let id: u32 = match self.token_to_ids.get(&string) {
                Some(id) => *id,
                None => {
                    if self.byte_fallback {
                        // Try to express the out-of-vocabulary piece as one
                        // `<0xNN>` token per byte. Collecting into
                        // `Option<Vec<_>>` yields `None` if any byte token is
                        // missing from the vocabulary.
                        let byte_tokens: Option<Vec<_>> = string
                            .bytes()
                            .map(|byte| -> Option<Token> {
                                let byte_string = format!("<0x{byte:02X}>");
                                let id = self.token_to_ids.get(&byte_string);
                                // Each byte token carries the whole piece's offsets.
                                id.map(|id| Token::new(*id, byte_string, (offset, offset + len)))
                            })
                            .collect();
                        if let Some(byte_tokens) = byte_tokens {
                            for token in byte_tokens {
                                tokens.push(token);
                            }
                            offset += len;
                            continue;
                        }
                    }
                    // Byte fallback disabled or incomplete: fall back to the
                    // unknown id, erroring if the model has none configured.
                    self.unk_id.ok_or(UnigramError::MissingUnkId)? as u32
                }
            };
            offset += len;
            tokens.push(Token::new(id, string, offsets));
        }
        Ok(tokens)
    }
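
A minimal sketch of how the byte-fallback path surfaces through the crate's public API, assuming `Unigram::from(vocab, unk_id, byte_fallback)` and the `Model` trait as exposed by the `tokenizers` crate; the toy vocabulary and scores are illustrative, not taken from the repository. Because "é" (UTF-8 bytes 0xC3 0xA9) has no piece of its own but both byte tokens exist, it is re-encoded as two `<0xNN>` tokens that share the piece's offsets:

    use tokenizers::models::unigram::Unigram;
    use tokenizers::{Model, Result};

    fn main() -> Result<()> {
        // Toy vocabulary of (piece, log-probability) pairs; entries are
        // illustrative. Index 0 is the unknown token, and the two byte
        // tokens cover the bytes of 'é'.
        let vocab = vec![
            ("<unk>".to_string(), 0.0),
            ("caf".to_string(), -1.0),
            ("<0xC3>".to_string(), -2.0),
            ("<0xA9>".to_string(), -2.0),
        ];
        // unk_id = Some(0), byte_fallback = true.
        let model = Unigram::from(vocab, Some(0), true)?;

        // Expected: "caf" at (0, 3), then <0xC3> and <0xA9>, both at (3, 5).
        for token in model.tokenize("café")? {
            println!("{:?} -> id {} at {:?}", token.value, token.id, token.offsets);
        }
        Ok(())
    }

If either byte token were removed from the vocabulary, the `collect` into `Option<Vec<_>>` would yield `None` and the piece would instead map to the unknown id.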