in Sources/Tokenizers/BertTokenizer.swift [198:223]
/// Splits `text` into basic tokens: whitespace-delimited pieces, with every
/// punctuation character emitted as a token of its own.
///
/// The text is first passed through `maybeStripAccents` and then split on
/// whitespace and newlines. A piece that appears in `neverSplit` is kept
/// verbatim (it is neither lowercased nor split). Every other piece is passed
/// through `maybeLowercase` and broken around each character for which
/// `isExtendedPunctuation` is true.
///
/// - Parameter text: The raw input string.
/// - Returns: The tokens in order of appearance. Empty pieces produced by
///   consecutive separators contribute no tokens (unless `neverSplit` itself
///   lists the empty string).
func tokenize(text: String) -> [String] {
    // Split on line breaks as well as spaces/tabs; `.whitespaces` alone would
    // leave "\n" embedded inside a token.
    let splitTokens = maybeStripAccents(text).components(separatedBy: CharacterSet.whitespacesAndNewlines)
    return splitTokens.flatMap { (token: String) -> [String] in
        if neverSplit.contains(token) {
            return [token]
        }
        var toks: [String] = []
        var currentTok = ""
        for c in maybeLowercase(token) {
            if c.isExtendedPunctuation {
                // Flush the word accumulated so far, then emit the punctuation
                // character as its own token.
                if !currentTok.isEmpty {
                    toks.append(currentTok)
                    currentTok = ""
                }
                toks.append(String(c))
            } else {
                currentTok += String(c)
            }
        }
        // Emit whatever trails the last punctuation character.
        if !currentTok.isEmpty {
            toks.append(currentTok)
        }
        return toks
    }
}