func wordPieceTokenizer()

in QuestionAnswering/QuestionAnswering/QuestionAnswering.swift [69:115]


    /// Converts a question or passage into a list of WordPiece vocabulary ids.
    ///
    /// The text is split into runs of word characters and single non-whitespace
    /// characters. Each such word is lowercased and then segmented greedily,
    /// longest match first, against `token2id`:
    ///
    /// * the first piece is the longest leading substring found in the vocabulary
    ///   (the whole word, when the word itself is a vocabulary entry);
    /// * every following piece is the longest leading substring of what is left
    ///   whose "##"-prefixed form is found in the vocabulary.
    ///
    /// If some part of a word cannot be matched at all, the word's partial pieces
    /// are discarded and the word is represented by the id stored under "[UNK]",
    /// when the vocabulary has such an entry; otherwise the word contributes no ids.
    ///
    /// - Parameter questionOrText: The raw text to tokenize.
    /// - Returns: The vocabulary ids in order of appearance. This method adds no
    ///   other ids (e.g. separators) around them.
    func wordPieceTokenizer(_ questionOrText: String) -> [Int] {
        var tokenIds = [Int]()

        // The pattern is a constant, so construction is not expected to fail;
        // returning an empty result is still preferable to trapping if it ever does.
        guard let regex = try? NSRegularExpression(pattern: #"(\w+|\S)"#, options: []) else {
            return tokenIds
        }
        let fullRange = NSRange(questionOrText.startIndex..<questionOrText.endIndex, in: questionOrText)

        // NOTE(review): assumes the vocabulary spells its unknown-token entry "[UNK]"
        // (the BERT convention) — confirm against the vocab file that fills `token2id`.
        let unknownId = token2id["[UNK]"]

        regex.enumerateMatches(in: questionOrText, options: [], range: fullRange) { match, _, _ in
            guard let match = match,
                  let wordRange = Range(match.range(at: 1), in: questionOrText) else { return }

            let word = questionOrText[wordRange].lowercased()
            var remainder = Substring(word)
            var isFirstPiece = true
            // Pieces are buffered so that a word which turns out to be unmatchable
            // does not leave a partial segmentation behind in `tokenIds`.
            var pieceIds = [Int]()

            while !remainder.isEmpty {
                // Try the longest candidate first and shrink one character at a time.
                var matched: (id: Int, length: Int)?
                var length = remainder.count
                while length > 0 {
                    let candidate = (isFirstPiece ? "" : "##") + String(remainder.prefix(length))
                    if let id = token2id[candidate] {
                        matched = (id, length)
                        break
                    }
                    length -= 1
                }

                guard let piece = matched else {
                    // Not even a single character matched: fall back to the unknown
                    // token for the whole word instead of force-unwrapping a missing entry.
                    if let unknownId = unknownId {
                        tokenIds.append(unknownId)
                    }
                    return
                }

                pieceIds.append(piece.id)
                remainder = remainder.dropFirst(piece.length)
                isFirstPiece = false
            }

            tokenIds.append(contentsOf: pieceIds)
        }

        return tokenIds
    }