in QuestionAnswering/QuestionAnswering/QuestionAnswering.swift [69:115]
/// WordPiece-tokenizes `questionOrText` into vocabulary ids using greedy
/// longest-match-first: each whitespace/punctuation-delimited token is looked
/// up whole; otherwise it is split into the longest in-vocab prefix followed
/// by the longest in-vocab "##"-prefixed continuations of the remainder.
/// A token that cannot be fully covered by vocab pieces maps to "[UNK]"
/// (if present in the vocab) instead of crashing or being dropped.
/// - Parameter questionOrText: Raw text; matching is done on the lowercased form.
/// - Returns: The sequence of vocabulary ids, in order of appearance.
func wordPieceTokenizer(_ questionOrText: String) -> [Int] {
    var tokenIds = [Int]()
    // \w+ grabs word runs; \S catches each remaining punctuation char singly.
    let pattern = #"(\w+|\S)"#
    guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else {
        // The pattern is a constant and valid; guard instead of force-unwrapping.
        return tokenIds
    }
    let nsrange = NSRange(questionOrText.startIndex..<questionOrText.endIndex, in: questionOrText)
    regex.enumerateMatches(in: questionOrText, options: [], range: nsrange) { (match, _, _) in
        guard let match = match,
              let swiftRange = Range(match.range(at: 1), in: questionOrText) else { return }
        let token = questionOrText[swiftRange].lowercased()

        // Fast path: the whole token is in the vocabulary (single lookup).
        if let id = token2id[token] {
            tokenIds.append(id)
            return
        }

        // Greedy sub-tokenization: repeatedly take the longest vocab piece.
        // The first piece is looked up bare; later pieces carry the "##" prefix.
        var remainder = token
        var pieceIds = [Int]()
        var isFirstPiece = true
        var failed = false
        while !remainder.isEmpty {
            var pieceLength = remainder.count
            var found = false
            while pieceLength > 0 {
                let key = (isFirstPiece ? "" : "##") + String(remainder.prefix(pieceLength))
                if let id = token2id[key] {
                    pieceIds.append(id)
                    remainder = String(remainder.dropFirst(pieceLength))
                    isFirstPiece = false
                    found = true
                    break
                }
                pieceLength -= 1
            }
            if !found {
                // No vocab piece covers the next character (out-of-vocab glyph).
                failed = true
                break
            }
        }

        if failed {
            // Canonical WordPiece behavior: the whole token becomes [UNK].
            // Previously this path either crashed on a force-unwrap or
            // silently dropped the token.
            if let unkId = token2id["[UNK]"] {
                tokenIds.append(unkId)
            }
        } else {
            tokenIds.append(contentsOf: pieceIds)
        }
    }
    return tokenIds
}