in tokenizers/src/decoders/byte_fallback.rs [25:62]
fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> {
let mut new_tokens: Vec<String> = vec![];
let mut previous_byte_tokens: Vec<u8> = vec![];
for token in tokens {
let bytes = if token.len() == 6 && token.starts_with("<0x") && token.ends_with('>') {
u8::from_str_radix(&token[3..5], 16).ok()
} else {
None
};
if let Some(bytes) = bytes {
previous_byte_tokens.push(bytes);
} else {
if !previous_byte_tokens.is_empty() {
if let Ok(string) = String::from_utf8(previous_byte_tokens.clone()) {
new_tokens.push(string);
} else {
for _ in 0..previous_byte_tokens.len() {
new_tokens.push("�".into());
}
}
previous_byte_tokens.clear();
}
new_tokens.push(token);
}
}
if !previous_byte_tokens.is_empty() {
if let Ok(string) = String::from_utf8(previous_byte_tokens.clone()) {
new_tokens.push(string);
} else {
for _ in 0..previous_byte_tokens.len() {
new_tokens.push("�".into());
}
}
}
Ok(new_tokens)
}