in Sources/Tokenizers/BPETokenizer.swift [143:187]
/// Applies the ranked merge rules in `bpeRanks` to a single token.
///
/// The token is split into one-character symbols. While more than one symbol remains,
/// the candidate pair with the lowest value in `bpeRanks` is selected and every adjacent
/// occurrence of that pair is fused into a single symbol. Merging stops as soon as none
/// of the candidate pairs has an entry in `bpeRanks`.
///
/// - Parameter token: The text to encode. A token of at most one character is returned unchanged.
/// - Returns: The resulting symbols joined by single spaces.
func bpe(token: String) -> String {
    guard token.count > 1 else { return token }

    var word = token.map { String($0) }
    while word.count > 1 {
        // Attach each candidate's rank in a single lookup; pairs without a rank drop out
        // here, so no force-unwrapping is needed when choosing the minimum.
        // NOTE(review): `getPairs` presumably yields the adjacent symbol pairs of `word` — confirm.
        let ranked = getPairs(word: word).compactMap { pair in
            bpeRanks[pair].map { rank in (pair: pair, rank: rank) }
        }
        guard let bigram = ranked.min(by: { $0.rank < $1.rank })?.pair else {
            // No remaining pair is mergeable.
            break
        }
        let first = bigram.a
        let second = bigram.b

        var merged: [String] = []
        merged.reserveCapacity(word.count)
        var i = 0
        while i < word.count {
            // Jump to the next symbol that could start the chosen pair; everything
            // before it is copied through untouched.
            guard let j = word[i...].firstIndex(of: first) else {
                merged.append(contentsOf: word[i...])
                break
            }
            merged.append(contentsOf: word[i..<j])
            i = j

            // `word[i]` is `first` at this point, so only the follower needs checking.
            if i < word.count - 1, word[i + 1] == second {
                merged.append(first + second)
                i += 2
            } else {
                merged.append(word[i])
                i += 1
            }
        }
        word = merged
    }
    return word.joined(separator: " ")
}