in Sources/Tokenizers/BPETokenizer.swift [189:201]
/// Splits `text` into vocabulary tokens.
///
/// The input is passed through `bpe(token:)`, whose result is split on single
/// spaces. Each resulting piece is kept as-is when `convertTokenToId(_:)` maps it
/// to something other than `unknownTokenId`; otherwise the piece is replaced by
/// the output of `hexaEncode(text:)`.
///
/// - Parameter text: The text to tokenize.
/// - Returns: The tokens in the order they were produced.
func tokenize(text: String) -> [String] {
    let pieces = bpe(token: text).split(separator: " ")
    return pieces.reduce(into: [String]()) { result, piece in
        let candidate = String(piece)
        let isKnown = convertTokenToId(candidate) != unknownTokenId
        if isKnown {
            result.append(candidate)
        } else {
            // TODO: if config.byte_fallback is False, append the unknown token instead
            // NOTE(review): hexaEncode presumably yields byte-level fallback tokens — confirm.
            result.append(contentsOf: hexaEncode(text: candidate))
        }
    }
}