in Sources/Tokenizers/UnigramTokenizer.swift [92:118]
/// Splits `text` into tokens by populating a `TokenLattice` with candidate nodes and
/// returning the lattice's `tokens`.
///
/// For each character position of the lattice's sentence, every token yielded by
/// `trie.commonPrefixSearchIterator` for the text starting at that position is inserted
/// as a node, scored with that token's `vocab` entry. If none of those tokens is exactly
/// one character long, a one-character node carrying `unknownTokenScore` and
/// `unknownTokenId` is inserted instead, so every position gets at least one node.
///
/// - Parameter text: The text to tokenize.
/// - Returns: The lattice's `tokens` after all candidate nodes have been inserted.
/// - Note: Traps with `fatalError` if the trie yields a token missing from `tokensToIds`.
func tokenize(text: String) -> [String] {
    var lattice = TokenLattice(sentence: text, bosTokenId: bosTokenId ?? 0, eosTokenId: eosTokenId ?? 0)

    // Each step advances by exactly one character; a node of this length at a position
    // means the position is covered without needing the unknown-token fallback.
    let mblen = 1

    // Populate nodes.
    // Walk the collection's indices directly instead of recomputing `count` and
    // `index(startIndex, offsetBy:)` on every iteration: both are linear-time on
    // `String`, which made the original loop quadratic in the sentence length.
    // `enumerated()` supplies the same integer character offset the lattice expects.
    let sentence = lattice.sentence
    for (beginPos, beginIndex) in sentence.indices.enumerated() {
        var hasSingleNode = false

        for token in trie.commonPrefixSearchIterator(sentence[beginIndex...]).map({ String($0) }) {
            guard let tokenId = tokensToIds[token as NSString] else { fatalError("Token not in vocab: \(token)") }
            let tokenLength = token.count
            lattice.insert(startOffset: beginPos, length: tokenLength, score: vocab[tokenId].score, tokenId: tokenId)
            if tokenLength == mblen {
                hasSingleNode = true
            }
        }

        if !hasSingleNode {
            // No single-character candidate was found here: fall back to the unknown token.
            lattice.insert(startOffset: beginPos, length: mblen, score: unknownTokenScore, tokenId: unknownTokenId ?? 0)
        }
    }

    return lattice.tokens
}