in tokenizers/src/models/bpe/model.rs [382:467]
fn merge_word(&self, w: &str) -> Result<Word> {
    let mut indices = w.char_indices().map(|(idx, _)| idx).peekable();
    let mut word = Word::with_capacity(w.len());
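    // Pending unknown token, stored as (token id, byte length). It is kept out of
    // `word` so that consecutive out-of-vocabulary characters can be fused into a
    // single unk token when `fuse_unk` is enabled.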
    let mut unk: Option<(u32, usize)> = None;
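    // Split the word into one initial symbol per character; the merge rules are
    // only applied once the whole word has been segmented.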
    while let Some(i) = indices.next() {
        let end = indices.peek();
        let is_first = i == 0;
        let is_last = end.is_none();
        let mut s = if let Some(e) = end {
            Cow::Borrowed(&w[i..*e])
        } else {
            Cow::Borrowed(&w[i..])
        };
        let byte_len = s.len();
        // Add the `continuing_subword_prefix` if relevant
        if !is_first {
            if let Some(ref prefix) = self.continuing_subword_prefix {
                s = format!("{prefix}{s}").into()
            }
        }
        // Add the `end_of_word_suffix` if relevant
        if is_last {
            if let Some(ref suffix) = self.end_of_word_suffix {
                s = format!("{s}{suffix}").into()
            }
        }
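        // Look up the (possibly prefixed/suffixed) symbol; a hit flushes any
        // pending unk token before the known symbol is recorded.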
        if let Some(id) = self.vocab.get(s.as_ref()) {
            if let Some((unk_id, unk_len)) = unk {
                word.add(unk_id, unk_len);
                unk = None;
            }
            word.add(*id, byte_len);
        } else {
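            // Out-of-vocabulary symbol: first try to spell it out byte by byte with
            // `<0xXX>` fallback tokens (uppercase hex), which only succeeds if every
            // byte token exists in the vocabulary.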
            if self.byte_fallback {
                let tokens: Option<Vec<_>> = s
                    .bytes()
                    .map(|b| -> Option<&u32> {
                        let code = format!("<{b:#04X}>");
                        self.vocab.get(&code)
                    })
                    .collect();
                if let Some(tokens) = tokens {
                    for t in tokens {
                        word.add(*t, 1);
                    }
                    continue;
                }
            }
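            // Byte fallback was disabled or incomplete: account for the symbol with
            // the configured unk token, either fusing it into a pending unk span or
            // emitting the previous one and starting a new span.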
            if let Some(unk_token) = &self.unk_token {
                unk = match (unk, self.fuse_unk) {
                    (Some((unk_id, unk_len)), true) => {
                        // Fuse unk
                        Some((unk_id, unk_len + byte_len))
                    }
                    (Some((unk_id, unk_len)), false) => {
                        // Do not fuse unk, add the previous one
                        word.add(unk_id, unk_len);
                        Some((
                            *self.vocab.get(unk_token).ok_or_else(|| {
                                Error::UnkTokenOutOfVocabulary(unk_token.to_owned())
                            })?,
                            byte_len,
                        ))
                    }
                    _ => Some((
                        *self.vocab.get(unk_token).ok_or_else(|| {
                            Error::UnkTokenOutOfVocabulary(unk_token.to_owned())
                        })?,
                        byte_len,
                    )),
                };
            }
        }
    }
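    // Flush an unknown token that was still pending when the loop ended.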
    if let Some((unk_id, unk_len)) = unk {
        word.add(unk_id, unk_len);
    }
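    // Apply the learned merges (optionally skipping some when dropout is set) to
    // collapse the per-character symbols into the final BPE tokens.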
    word.merge_all(&self.merges, self.dropout);
    Ok(word)
}
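
For orientation, here is a minimal standalone sketch (standard library only, not part of the crate; the sample word and output are purely illustrative) of two details used above: the per-character slicing driven by `char_indices` plus `peek`, and the `<0xXX>` keys that byte fallback looks up.

// Minimal sketch: how a word is sliced into per-character symbols and which vocab
// keys byte fallback would probe for each byte (illustrative only).
fn main() {
    let w = "héllo";
    // Same iteration pattern as above: start offsets of each character, peeked to
    // find where the current character ends.
    let mut indices = w.char_indices().map(|(idx, _)| idx).peekable();
    while let Some(i) = indices.next() {
        let s = match indices.peek() {
            Some(e) => &w[i..*e],
            None => &w[i..],
        };
        // One initial symbol per character; multi-byte characters keep their UTF-8
        // byte length ("é" is 2 bytes).
        println!("symbol {s:?} ({} bytes)", s.len());
        // Vocab keys byte fallback would look up, e.g. "<0xC3>".
        for b in s.bytes() {
            println!("  fallback key: <{b:#04X}>");
        }
    }
}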