Sources/Tokenizers/Utils.swift (54 lines of code) (raw):

//
//  Utils.swift
//  AudioBoloss
//
//  Created by Julien Chaumond on 07/01/2019.
//  Copyright © 2019 Hugging Face. All rights reserved.
//

import Foundation

/// Namespace for small, stateless helper functions.
struct Utils {
    /// Runs `block`, prints its wall-clock duration in milliseconds, and returns its result.
    ///
    /// - Parameters:
    ///   - label: Text printed in square brackets in front of the measured duration.
    ///   - block: The work to measure.
    /// - Returns: Whatever `block` returns.
    static func time<T>(label: String, _ block: () -> T) -> T {
        let startTime = CFAbsoluteTimeGetCurrent()
        let result = block()
        // CFAbsoluteTimeGetCurrent() reports seconds; scale to milliseconds for display.
        let diff = (CFAbsoluteTimeGetCurrent() - startTime) * 1_000
        print("[\(label)] \(diff)ms")
        return result
    }

    /// Runs `block` and returns its result together with the elapsed wall-clock time.
    ///
    /// - Parameter block: The work to measure.
    /// - Returns: A tuple of the block's result and the elapsed time in seconds.
    static func time<T>(_ block: () -> T) -> (T, Double) {
        let startTime = CFAbsoluteTimeGetCurrent()
        let result = block()
        let diff = CFAbsoluteTimeGetCurrent() - startTime
        return (result, diff)
    }

    /// Returns the current Unix timestamp in milliseconds.
    static func dateNow() -> Int64 {
        // Use `Int` when we don't support 32-bits devices/OSes anymore.
        // Int crashes on iPhone 5c.
        Int64(Date().timeIntervalSince1970 * 1000)
    }

    /// Clamps `val` to the closed interval `[vmin, vmax]`.
    ///
    /// - Parameters:
    ///   - val: The value to clamp.
    ///   - vmin: Lower bound of the interval.
    ///   - vmax: Upper bound of the interval.
    /// - Returns: `vmin` if `val` is below it, `vmax` if `val` is above it, otherwise `val`.
    static func clamp<T: Comparable>(_ val: T, _ vmin: T, _ vmax: T) -> T {
        min(max(vmin, val), vmax)
    }

    /// Identity function that is declared `throws` but never actually throws.
    ///
    /// - Parameter input: Any value.
    /// - Returns: `input`, unchanged.
    static func fakeThrowable<T>(_ input: T) throws -> T {
        input
    }

    /// Returns the substring of `s` covered by the character-offset range `r`.
    ///
    /// Offsets count `Character`s (extended grapheme clusters), not bytes or
    /// UTF-16 code units.
    ///
    /// - Parameters:
    ///   - s: The string to slice.
    ///   - r: Half-open range of character offsets into `s`.
    /// - Returns: The requested substring, or `nil` when `r` does not lie entirely
    ///   within `0...s.count` (including when its lower bound is negative).
    static func substr(_ s: String, _ r: Range<Int>) -> String? {
        // A negative offset would walk backwards from `startIndex` and trap,
        // so reject it explicitly instead of crashing.
        guard r.lowerBound >= 0 else { return nil }

        // `limitedBy:` yields nil as soon as an offset would move past `endIndex`,
        // which bounds-checks both ends without a separate full `count` pass.
        guard
            let sliceStart = s.index(s.startIndex, offsetBy: r.lowerBound, limitedBy: s.endIndex),
            let sliceEnd = s.index(sliceStart, offsetBy: r.upperBound - r.lowerBound, limitedBy: s.endIndex)
        else {
            return nil
        }
        return String(s[sliceStart..<sliceEnd])
    }

    /// Builds the inverse mapping of `dict`, turning every `(key, value)` pair into `(value, key)`.
    ///
    /// When several keys share the same value only one of them is kept; because
    /// dictionary iteration order is unspecified, which one survives is unspecified too.
    ///
    /// - Parameter dict: The dictionary to invert.
    /// - Returns: A dictionary keyed by the values of `dict`.
    static func invert<K, V>(_ dict: [K: V]) -> [V: K] {
        var inverted: [V: K] = [:]
        for (key, value) in dict {
            inverted[value] = key
        }
        return inverted
    }

    /// Checks if a character is considered Chinese, i.e. whether its code point
    /// falls inside one of the CJK ideograph blocks listed below.
    /// https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    ///
    /// - Parameter c: The Unicode scalar to test.
    /// - Returns: `true` when `c` lies in any of the listed ranges.
    static func isChineseChar(_ c: UnicodeScalar) -> Bool {
        switch c.value {
        case 0x4E00...0x9FFF,    // CJK Unified Ideographs
             0x3400...0x4DBF,    // CJK Unified Ideographs Extension A
             0x20000...0x2A6DF,  // CJK Unified Ideographs Extension B
             0x2A700...0x2B73F,  // CJK Unified Ideographs Extension C
             0x2B740...0x2B81F,  // CJK Unified Ideographs Extension D
             0x2B820...0x2CEAF,  // CJK Unified Ideographs Extension E
             0xF900...0xFAFF,    // CJK Compatibility Ideographs
             0x2F800...0x2FA1F:  // CJK Compatibility Ideographs Supplement
            return true
        default:
            return false
        }
    }
}

/// Shared constants.
enum Constants {
    /// Regex fragment matching punctuation: the Unicode punctuation property `\p{P}`
    /// plus the ASCII ranges U+0021–U+002F, U+003A–U+0040, U+005B–U+0060 and U+007B–U+007E.
    ///
    /// NOTE(review): the fragment carries no surrounding brackets, so it is presumably
    /// meant to be interpolated inside a character class (`[...]`) by the caller — confirm
    /// against the call sites.
    static let PUNCTUATION_REGEX = #"\p{P}\u0021-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E"#
}