in tokenizers/src/models/bpe/model.rs [475:496]
fn tokenize_with_cache(&self, sequence: &str) -> Result<Vec<Token>> {
    // Fast path: when `ignore_merges` is set, a sequence that is already a
    // vocabulary entry is emitted as a single token without running any merges.
    if self.ignore_merges {
        if let Some(id) = self.vocab.get(sequence) {
            return Ok(vec![Token::new(
                *id,
                sequence.to_string(),
                (0, sequence.len()),
            )]);
        }
    }
    // Cache hit: reuse the previously merged word for this sequence.
    if let Some(ref hit) = self.cache.as_ref().and_then(|c| c.get(sequence)) {
        return Ok(self.word_to_tokens(hit).collect());
    }
    // Cache miss: run the merge algorithm, then store the result for later
    // lookups, skipping sequences that are too long to be worth caching.
    let word = self.merge_word(sequence)?;
    let ret = self.word_to_tokens(&word).collect();
    if let Some(ref cache) = self.cache {
        if sequence.len() < MAX_LENGTH {
            cache.set(sequence.to_owned(), word);
        }
    }
    Ok(ret)
}
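
The function above follows a check-cache / compute / populate-cache pattern. Below is a minimal, self-contained sketch of that pattern outside the crate's internals: the `Segmenter` type, its `segment_with_cache` and stub `merge_word` methods, the `Mutex<HashMap>` cache, and the `MAX_LENGTH` value are all illustrative stand-ins, not the tokenizers library's actual types or constants.

use std::collections::HashMap;
use std::sync::Mutex;

struct Segmenter {
    // Plain HashMap behind a Mutex stands in for the crate's internal cache.
    cache: Option<Mutex<HashMap<String, Vec<String>>>>,
}

impl Segmenter {
    // Hypothetical stand-in for the real merge step: splits into characters.
    fn merge_word(&self, sequence: &str) -> Vec<String> {
        sequence.chars().map(|c| c.to_string()).collect()
    }

    fn segment_with_cache(&self, sequence: &str) -> Vec<String> {
        // 1. Return a cached result if one exists.
        if let Some(cache) = &self.cache {
            if let Some(hit) = cache.lock().unwrap().get(sequence) {
                return hit.clone();
            }
        }
        // 2. Otherwise compute the segmentation...
        let word = self.merge_word(sequence);
        // 3. ...and store it for next time, skipping overly long inputs.
        //    (1024 is a placeholder bound, not the library's value.)
        const MAX_LENGTH: usize = 1024;
        if let Some(cache) = &self.cache {
            if sequence.len() < MAX_LENGTH {
                cache.lock().unwrap().insert(sequence.to_owned(), word.clone());
            }
        }
        word
    }
}

fn main() {
    let seg = Segmenter { cache: Some(Mutex::new(HashMap::new())) };
    println!("{:?}", seg.segment_with_cache("hello")); // computed and cached
    println!("{:?}", seg.segment_with_cache("hello")); // served from the cache
}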