// Sources/Tokenizers/Tokenizer.swift
//
// Tokenizer.swift
//
//
// Created by Pedro Cuenca on 6/5/23.
//
import Foundation
import Hub
import Jinja
/// A single chat message as a loosely-typed dictionary (passed to chat templates
/// under the "messages" key; presumably contains "role"/"content" entries — the
/// exact schema is defined by the template, not enforced here).
public typealias Message = [String: Any]

/// A tool (callable function) specification, expressed as a JSON-Schema-like dictionary,
/// made available to chat templates that support function calling.
public typealias ToolSpec = [String: Any]
/// Errors that can occur while loading, configuring, or using a tokenizer.
public enum TokenizerError: LocalizedError {
    /// The tokenizer configuration file could not be found.
    case missingConfig
    /// The configuration does not declare a `tokenizer_class`.
    case missingTokenizerClassInConfig
    /// The declared tokenizer class has no registered implementation.
    case unsupportedTokenizer(String)
    /// The vocabulary file is absent from the tokenizer data.
    case missingVocab
    /// The vocabulary file could not be parsed.
    case malformedVocab
    /// A chat-template selection or rendering problem, with details.
    case chatTemplate(String)
    /// No chat template is available in the config and none was supplied.
    case missingChatTemplate
    /// The input exceeds the maximum supported length.
    case tooLong(String)
    /// The tokenizer configuration is internally inconsistent.
    case mismatchedConfig(String)

    public var errorDescription: String? {
        switch self {
        case .missingConfig:
            String(localized: "Tokenizer configuration is missing.", comment: "Error when tokenizer config cannot be found")
        case .missingTokenizerClassInConfig:
            String(localized: "The tokenizer class is not specified in the configuration.", comment: "Error when tokenizer_class is missing in config")
        case let .unsupportedTokenizer(name):
            String(localized: "The tokenizer type '\(name)' is not supported.", comment: "Error when tokenizer type is not supported")
        case .missingVocab:
            String(localized: "Vocabulary file is missing from the tokenizer configuration.", comment: "Error when vocab file is missing")
        case .malformedVocab:
            String(localized: "The vocabulary file is malformed or corrupted.", comment: "Error when vocab file is malformed")
        case let .chatTemplate(message):
            String(localized: "Chat template error: \(message)", comment: "Error with chat template")
        case .missingChatTemplate:
            // Consistency fix: this case was the only one missing a translator `comment:`.
            String(localized: "This tokenizer does not have a chat template, and no template was passed.", comment: "Error when no chat template is available")
        case let .tooLong(message):
            String(localized: "Input is too long: \(message)", comment: "Error when input exceeds maximum length")
        case let .mismatchedConfig(message):
            String(localized: "Tokenizer configuration mismatch: \(message)", comment: "Error when tokenizer configuration is inconsistent")
        }
    }
}
/// Core interface of a concrete tokenizing model (BPE, WordPiece, Unigram, ...):
/// text → token strings, token ↔ id conversions, and special-token metadata.
public protocol TokenizingModel {
    /// Splits the input text into model tokens.
    func tokenize(text: String) -> [String]

    /// Alias for `tokenize`
    func callAsFunction(_ text: String) -> [String]

    /// Returns the vocabulary id for a token, or nil if it is out of vocabulary.
    func convertTokenToId(_ token: String) -> Int?
    func convertTokensToIds(_ tokens: [String]) -> [Int?]

    /// Returns the token string for a vocabulary id, or nil if the id is unknown.
    func convertIdToToken(_ id: Int) -> String?
    func convertIdsToTokens(_ ids: [Int]) -> [String?]

    /// Beginning-of-sequence token and id, if defined by the model.
    var bosToken: String? { get }
    var bosTokenId: Int? { get }
    /// End-of-sequence token and id, if defined by the model.
    var eosToken: String? { get }
    var eosTokenId: Int? { get }
    /// Unknown token and id, if defined by the model.
    var unknownToken: String? { get }
    var unknownTokenId: Int? { get }

    /// Whether consecutive unknown tokens should be collapsed into one during tokenization.
    var fuseUnknownTokens: Bool { get }
}
/// Helper - possibly to be moved somewhere else.
///
/// Extracts the string form of a token entry in a tokenizer config. The entry may
/// be a plain string, or a serialization of the AddedToken class whose text is
/// stored under its `content` key.
/// TODO: support lstrip, rstrip, normalized, etc.
func addedTokenAsString(_ addedToken: Config?) -> String? {
    addedToken.flatMap { token in
        // Prefer the plain string form; otherwise read the AddedToken's `content`.
        token.string() ?? token.content.string()
    }
}
public extension TokenizingModel {
    /// Default: calling the model as a function is the same as `tokenize`.
    func callAsFunction(_ text: String) -> [String] {
        tokenize(text: text)
    }

    /// Default: element-wise conversion via `convertTokenToId`.
    func convertTokensToIds(_ tokens: [String]) -> [Int?] {
        tokens.map(convertTokenToId)
    }

    /// Default: element-wise conversion via `convertIdToToken`.
    func convertIdsToTokens(_ ids: [Int]) -> [String?] {
        ids.map(convertIdToToken)
    }
}
/// A tokenizer model that is set up with Hub configuration data
public protocol PreTrainedTokenizerModel: TokenizingModel {
    /// Creates the model from the tokenizer config, the tokenizer data (vocab, merges, …),
    /// and the added tokens already extracted from the data.
    init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String: Int]) throws
}
/// Registry mapping Hub `tokenizer_class` names to concrete `PreTrainedTokenizerModel` implementations.
struct TokenizerModel {
    static let knownTokenizers: [String: PreTrainedTokenizerModel.Type] = [
        "BertTokenizer": BertTokenizer.self,
        "DistilbertTokenizer": BertTokenizer.self,
        "DistilBertTokenizer": BertTokenizer.self,
        "RobertaTokenizer": BPETokenizer.self,
        "CodeGenTokenizer": CodeGenTokenizer.self,
        "CodeLlamaTokenizer": CodeLlamaTokenizer.self,
        "FalconTokenizer": FalconTokenizer.self,
        "GemmaTokenizer": GemmaTokenizer.self,
        "GPT2Tokenizer": GPT2Tokenizer.self,
        "LlamaTokenizer": LlamaTokenizer.self,
        "T5Tokenizer": T5Tokenizer.self,
        "WhisperTokenizer": WhisperTokenizer.self,
        "CohereTokenizer": CohereTokenizer.self,
        "Qwen2Tokenizer": Qwen2Tokenizer.self,
        "PreTrainedTokenizer": BPETokenizer.self,
    ]

    /// Returns the unknown-token string from the tokenizer config, handling both a
    /// plain string `unk_token` and a serialized AddedToken object.
    static func unknownToken(from tokenizerConfig: Config) -> String? {
        tokenizerConfig.unkToken.content.string() ?? tokenizerConfig.unkToken.string()
    }

    /// Instantiates the tokenizing model declared in the config's `tokenizer_class`.
    /// - Throws: `TokenizerError.missingTokenizerClassInConfig` when the class is absent,
    ///   `TokenizerError.unsupportedTokenizer` when no implementation is registered.
    public static func from(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String: Int]) throws -> TokenizingModel {
        guard let tokenizerClassName = tokenizerConfig.tokenizerClass.string() else {
            throw TokenizerError.missingTokenizerClassInConfig
        }

        // Some tokenizer_class entries use a Fast suffix (e.g. "LlamaTokenizerFast").
        // Strip only the trailing "Fast": `replacingOccurrences` would also remove
        // interior occurrences and could mangle unrelated class names.
        let tokenizerName = tokenizerClassName.hasSuffix("Fast") ? String(tokenizerClassName.dropLast(4)) : tokenizerClassName
        guard let tokenizerClass = TokenizerModel.knownTokenizers[tokenizerName] else {
            throw TokenizerError.unsupportedTokenizer(tokenizerName)
        }
        return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)
    }
}
/// How a chat template is supplied when it is not (or not only) read from the tokenizer config.
public enum ChatTemplateArgument {
    /// A Jinja template to use for the conversation. Normally it is not necessary to provide a template, since it will be read from the tokenizer config.
    case literal(String)
    /// For models whose tokenizer config includes multiple chat templates, the template can be specified by name. Normally this is not necessary.
    case name(String)
}
/// Public interface of a fully assembled tokenizer: tokenization, encoding, decoding,
/// token/id conversions, special-token metadata, and chat-template application.
public protocol Tokenizer {
    /// Splits the input text into token strings.
    func tokenize(text: String) -> [String]

    /// Main entry point
    func encode(text: String) -> [Int]
    func encode(text: String, addSpecialTokens: Bool) -> [Int]
    func callAsFunction(_ text: String, addSpecialTokens: Bool) -> [Int]

    /// Decode
    func decode(tokens: [Int]) -> String
    func decode(tokens: [Int], skipSpecialTokens: Bool) -> String

    func convertTokenToId(_ token: String) -> Int?
    func convertTokensToIds(_ tokens: [String]) -> [Int?]

    func convertIdToToken(_ id: Int) -> String?
    func convertIdsToTokens(_ ids: [Int]) -> [String?]

    /// Special-token metadata, if defined by the underlying model.
    var bosToken: String? { get }
    var bosTokenId: Int? { get }
    var eosToken: String? { get }
    var eosTokenId: Int? { get }
    var unknownToken: String? { get }
    var unknownTokenId: Int? { get }

    /// Whether the tokenizer config ships a chat template.
    var hasChatTemplate: Bool { get }

    /// The appropriate chat template is selected from the tokenizer config
    func applyChatTemplate(messages: [Message]) throws -> [Int]

    /// The appropriate chat template is selected from the tokenizer config
    func applyChatTemplate(messages: [Message], tools: [ToolSpec]?) throws -> [Int]

    /// The appropriate chat template is selected from the tokenizer config
    func applyChatTemplate(messages: [Message], tools: [ToolSpec]?, additionalContext: [String: Any]?) throws -> [Int]

    /// The chat template is provided as a string literal or specified by name
    func applyChatTemplate(messages: [Message], chatTemplate: ChatTemplateArgument) throws -> [Int]

    /// The chat template is provided as a string literal
    func applyChatTemplate(messages: [Message], chatTemplate: String) throws -> [Int]

    func applyChatTemplate(
        messages: [Message],
        // A chat template can optionally be provided or specified by name when several templates are included in the tokenizer config. Normally this is not necessary.
        chatTemplate: ChatTemplateArgument?,
        addGenerationPrompt: Bool,
        truncation: Bool,
        maxLength: Int?,
        tools: [ToolSpec]?
    ) throws -> [Int]

    func applyChatTemplate(
        messages: [Message],
        // A chat template can optionally be provided or specified by name when several templates are included in the tokenizer config. Normally this is not necessary.
        chatTemplate: ChatTemplateArgument?,
        addGenerationPrompt: Bool,
        truncation: Bool,
        maxLength: Int?,
        tools: [ToolSpec]?,
        additionalContext: [String: Any]?
    ) throws -> [Int]
}
extension Tokenizer {
    /// By default a tokenizer has no chat template.
    public var hasChatTemplate: Bool { false }

    /// Call previous signature for backwards compatibility
    func applyChatTemplate(
        messages: [Message],
        // A chat template can optionally be provided or specified by name when several templates are included in the tokenizer config. Normally this is not necessary.
        chatTemplate: ChatTemplateArgument?,
        addGenerationPrompt: Bool,
        truncation: Bool,
        maxLength: Int?,
        tools: [ToolSpec]?,
        additionalContext: [String: Any]?
    ) throws -> [Int] {
        // The legacy signature cannot forward additional context.
        guard additionalContext == nil else {
            throw TokenizerError.chatTemplate("Not implemented")
        }
        return try applyChatTemplate(
            messages: messages,
            chatTemplate: chatTemplate,
            addGenerationPrompt: addGenerationPrompt,
            truncation: truncation,
            maxLength: maxLength,
            tools: tools
        )
    }
}
public extension Tokenizer {
    /// Default: calling the tokenizer as a function encodes.
    func callAsFunction(_ text: String, addSpecialTokens: Bool = true) -> [Int] {
        encode(text: text, addSpecialTokens: addSpecialTokens)
    }

    /// Default: decode keeping special tokens.
    func decode(tokens: [Int]) -> String {
        decode(tokens: tokens, skipSpecialTokens: false)
    }

    /// Default: element-wise conversion via `convertTokenToId`.
    func convertTokensToIds(_ tokens: [String]) -> [Int?] {
        tokens.map(convertTokenToId)
    }

    /// Default: element-wise conversion via `convertIdToToken`.
    func convertIdsToTokens(_ ids: [Int]) -> [String?] {
        ids.map(convertIdToToken)
    }
}
/// Keys of `tokenizer_config.json` that describe special tokens; their values are
/// injected into the chat-template rendering context (see `applyChatTemplate`).
let specialTokenAttributes: [String] = [
    "bos_token",
    "eos_token",
    "unk_token",
    "sep_token",
    "pad_token",
    "cls_token",
    "mask_token",
    "additional_special_tokens",
]
/// Tokenization pipeline assembled from Hub configuration:
/// added-token splitting → normalizer → pre-tokenizer → model → post-processor,
/// with a decoder for the reverse direction and chat-template support.
public class PreTrainedTokenizer: Tokenizer {
    let model: TokenizingModel

    // Special-token metadata is forwarded from the underlying model.
    public var bosToken: String? { model.bosToken }
    public var bosTokenId: Int? { model.bosTokenId }
    public var eosToken: String? { model.eosToken }
    public var eosTokenId: Int? { model.eosTokenId }
    public var unknownToken: String? { model.unknownToken }
    public var unknownTokenId: Int? { model.unknownTokenId }
    public var fuseUnknownTokens: Bool { model.fuseUnknownTokens }

    /// Contents of added tokens; these are emitted verbatim, bypassing normalization.
    private let addedTokens: Set<String>
    /// Added tokens flagged `special: true`; used to drop ids in `decode(tokens:skipSpecialTokens:)`.
    private let specialTokens: [String: Int]
    /// Alternation regex matching any added token (longest first), or nil when there are none.
    private let addedTokensRegex: NSRegularExpression?

    private let preTokenizer: PreTokenizer?
    private let normalizer: Normalizer?
    private let postProcessor: PostProcessor?
    private let decoder: Decoder?
    private let tokenizerConfig: Config

    private let cleanUpTokenizationSpaces: Bool

    public required init(tokenizerConfig: Config, tokenizerData: Config) throws {
        var addedTokens: [String: Int] = [:]
        var specialTokens: [String: Int] = [:]
        for addedToken in tokenizerData["addedTokens"].array(or: []) {
            guard let id = addedToken["id"].integer() else { continue /* malformed: token with no id */ }
            guard let content = addedToken.content.string() else { continue /* malformed: token with no content */ }
            addedTokens[content] = id

            if addedToken["special"].boolean(or: false) {
                specialTokens[content] = id
            }
        }

        // Convert to tuples for easier access, then sort by length (descending) to avoid early partial matches
        // (https://github.com/xenova/transformers.js/commit/c305c3824f628f1f02806a6310bd3b18b0f7f8f5)
        let unwrappedAddedTokens: [(content: String, prefix: Bool, suffix: Bool)] = (tokenizerData["addedTokens"].array(or: [])).compactMap { addedToken -> (String, Bool, Bool)? in
            guard let content = addedToken.content.string() else { return nil }
            let prefix = addedToken["lstrip"].boolean(or: false)
            let suffix = addedToken["rstrip"].boolean(or: false)
            return (content: content, prefix: prefix, suffix: suffix)
        }.sorted {
            $0.content.count > $1.content.count
        }

        // Concatenate into a single alternation regex. Bug fix: with no added tokens the
        // previous code compiled an *empty* pattern, which matches the empty string at
        // every position and would corrupt `split(by:)` in `tokenize`. Use nil instead.
        if unwrappedAddedTokens.isEmpty {
            addedTokensRegex = nil
        } else {
            let addedTokensRegexString = unwrappedAddedTokens.map {
                let token = NSRegularExpression.escapedPattern(for: $0.content)
                let prefix = $0.prefix ? #"\s*"# : ""
                let suffix = $0.suffix ? #"\s*"# : ""
                return "\(prefix)(\(token))\(suffix)"
            }.joined(separator: "|")
            addedTokensRegex = try? NSRegularExpression(pattern: addedTokensRegexString, options: [])
        }

        // Note: specialTokens are consumed by `decode(tokens:skipSpecialTokens:)`.
        self.specialTokens = specialTokens
        self.addedTokens = Set(addedTokens.keys)

        preTokenizer = PreTokenizerFactory.fromConfig(config: tokenizerData["preTokenizer"])
        normalizer = NormalizerFactory.fromConfig(config: tokenizerData["normalizer"])
        postProcessor = PostProcessorFactory.fromConfig(config: tokenizerData["postProcessor"])
        decoder = DecoderFactory.fromConfig(config: tokenizerData["decoder"], addedTokens: self.addedTokens)
        cleanUpTokenizationSpaces = tokenizerConfig.cleanUpTokenizationSpaces.boolean(or: true)
        self.tokenizerConfig = tokenizerConfig

        model = try TokenizerModel.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)
    }

    /// Applies the configured pre-tokenizer, or returns the text as a single section.
    func preTokenize(_ text: String, options: PreTokenizerOptions) -> [String] {
        guard let preTokenizer else { return [text] }
        return preTokenizer(text: text, options: options)
    }

    /// Applies the configured normalizer, or returns the text unchanged.
    func normalize(_ text: String) -> String {
        guard let normalizer else { return text }
        return normalizer(text: text)
    }

    /// Applies the configured post-processor (e.g. special-token insertion), or returns the tokens unchanged.
    func postProcess(_ tokens: [String], addSpecialTokens: Bool = true) -> [String] {
        guard let postProcessor else { return tokens }
        return postProcessor(tokens: tokens, addSpecialTokens: addSpecialTokens)
    }

    /// Applies the configured decoder to token strings, or returns them unchanged.
    func decodeTokens(_ tokens: [String]) -> [String] {
        guard let tokenDecoder = decoder else { return tokens }
        return tokenDecoder(tokens: tokens)
    }

    /// Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms
    func cleanUp(text: String) -> String {
        guard cleanUpTokenizationSpaces else { return text }

        return
            text
            .replacingOccurrences(of: " .", with: ".")
            .replacingOccurrences(of: " ?", with: "?")
            .replacingOccurrences(of: " !", with: "!")
            .replacingOccurrences(of: " ,", with: ",")
            .replacingOccurrences(of: " ' ", with: "'")
            .replacingOccurrences(of: " n't", with: "n't")
            .replacingOccurrences(of: " 'm", with: "'m")
            .replacingOccurrences(of: " 's", with: "'s")
            .replacingOccurrences(of: " 've", with: "'ve")
            .replacingOccurrences(of: " 're", with: "'re")
    }

    /// Collapses runs of consecutive unknown tokens into a single one when the model requests it.
    func fuseUnknown(_ tokens: [String]) -> [String] {
        guard fuseUnknownTokens else { return tokens }
        let (fused, _) = tokens.reduce((fused: [String](), previousIsUnknown: false)) { result, token in
            var (fused, previousIsUnknown) = result
            let isUnknown = model.convertTokenToId(token) == model.unknownTokenId
            if isUnknown {
                // Keep only the first unknown of a run.
                if !previousIsUnknown { fused.append(token) }
            } else {
                fused.append(token)
            }
            return (fused, isUnknown)
        }
        return fused
    }

    public func tokenize(text: String) -> [String] {
        // Take care of special tokens first: split the input around added tokens so
        // they are emitted verbatim instead of going through normalization.
        let sections: [String] = if let regex = addedTokensRegex {
            text.split(by: regex)
        } else {
            [text]
        }
        return sections.enumerated().map { section, x in
            if addedTokens.contains(x) { return [x] }
            // Only the first section receives first-section pre-tokenizer options.
            return preTokenize(normalize(x), options: section == 0 ? [.firstSection] : []).flatMap { model($0) }
        }.flatMap { fuseUnknown($0) }
    }

    /// Main entry point
    public func encode(text: String, addSpecialTokens: Bool = true) -> [Int] {
        // NOTE(review): force-unwrap assumes every produced token (including any
        // post-processor-inserted specials) is present in the vocabulary.
        postProcess(tokenize(text: text), addSpecialTokens: addSpecialTokens).map { model.convertTokenToId($0)! }
    }

    public func encode(text: String) -> [Int] {
        encode(text: text, addSpecialTokens: true)
    }

    public func decode(tokens: [Int], skipSpecialTokens: Bool = false) -> String {
        // IDs to tokens
        let tokenStrings: [String]
        if skipSpecialTokens {
            let specialTokenIDs = Set(specialTokens.values)
            tokenStrings =
                tokens
                .filter { !specialTokenIDs.contains($0) }
                .compactMap { model.convertIdToToken($0) }
        } else {
            tokenStrings = tokens.compactMap { model.convertIdToToken($0) }
        }
        let decoded = decodeTokens(tokenStrings)
        // At this point we should have a single String
        return cleanUp(text: decoded.joined(separator: ""))
    }

    public func convertTokenToId(_ token: String) -> Int? {
        model.convertTokenToId(token)
    }

    public func convertIdToToken(_ id: Int) -> String? {
        model.convertIdToToken(id)
    }

    public var hasChatTemplate: Bool {
        !tokenizerConfig.chatTemplate.isNull()
    }

    public func applyChatTemplate(messages: [Message]) throws -> [Int] {
        try applyChatTemplate(messages: messages, addGenerationPrompt: true)
    }

    public func applyChatTemplate(messages: [Message], tools: [ToolSpec]? = nil) throws -> [Int] {
        try applyChatTemplate(messages: messages, addGenerationPrompt: true, tools: tools)
    }

    public func applyChatTemplate(messages: [Message], tools: [ToolSpec]? = nil, additionalContext: [String: Any]? = nil) throws
        -> [Int]
    {
        try applyChatTemplate(
            messages: messages,
            addGenerationPrompt: true,
            tools: tools,
            additionalContext: additionalContext
        )
    }

    public func applyChatTemplate(messages: [Message], chatTemplate: ChatTemplateArgument) throws -> [Int] {
        try applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true)
    }

    public func applyChatTemplate(messages: [Message], chatTemplate: String) throws -> [Int] {
        try applyChatTemplate(messages: messages, chatTemplate: .literal(chatTemplate), addGenerationPrompt: true)
    }

    public func applyChatTemplate(
        messages: [Message],
        chatTemplate: ChatTemplateArgument? = nil,
        addGenerationPrompt: Bool = false,
        truncation: Bool = false,
        maxLength: Int? = nil,
        tools: [ToolSpec]? = nil
    ) throws -> [Int] {
        try applyChatTemplate(
            messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength,
            tools: tools, additionalContext: nil
        )
    }

    /// Renders the selected chat template with the given context and encodes the result
    /// (without adding extra special tokens, since the template is expected to include them).
    public func applyChatTemplate(
        messages: [Message],
        chatTemplate: ChatTemplateArgument? = nil,
        addGenerationPrompt: Bool = false,
        truncation: Bool = false,
        maxLength: Int? = nil,
        // A list of tools (callable functions) that will be accessible to the model. If the template does not
        // support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
        // giving the name, description and argument types for the tool. See the
        // [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
        // for more information.
        tools: [ToolSpec]? = nil,
        additionalContext: [String: Any]? = nil
    ) throws -> [Int] {
        // Template precedence: explicit literal > named entry in config >
        // "tool_use"/"default" entries > plain string in config.
        var selectedChatTemplate: String?
        if let chatTemplate, case let .literal(template) = chatTemplate {
            // Use chat template from argument
            selectedChatTemplate = template
        } else if !tokenizerConfig.chatTemplate.isNull() {
            let valueFromConfig: Config = tokenizerConfig.chatTemplate
            if let arrayValue = valueFromConfig.array() {
                // If the config specifies a list of chat templates, convert them to a dictionary
                let templateDict = [String: String](
                    uniqueKeysWithValues: arrayValue.compactMap { item in
                        guard let name = item["name"].string(), let template = item["template"].string() else {
                            return nil
                        }
                        return (name, template)
                    })
                if let chatTemplate, case let .name(name) = chatTemplate {
                    // Select chat template from config by name
                    if let matchingDictEntry = templateDict[name] {
                        selectedChatTemplate = matchingDictEntry
                    } else {
                        throw TokenizerError.chatTemplate("No chat template named \"\(name)\" was found in the tokenizer config")
                    }
                } else if let tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] {
                    // Use tool use chat template from config
                    selectedChatTemplate = toolUseTemplate
                } else if let defaultChatTemplate = templateDict["default"] {
                    // Use default chat template from config
                    selectedChatTemplate = defaultChatTemplate
                }
            } else if let stringValue = valueFromConfig.string() {
                // Use chat template from config
                selectedChatTemplate = stringValue
            }
        }

        guard let selectedChatTemplate else {
            throw TokenizerError.missingChatTemplate
        }

        let template = try Template(selectedChatTemplate)
        var context: [String: Any] = [
            "messages": messages,
            "add_generation_prompt": addGenerationPrompt,
        ]
        if let tools {
            context["tools"] = tools
        }
        if let additionalContext {
            /*
             Additional keys and values to be added to the context provided to the prompt templating engine.
             For example, the app could set "tools_in_user_message" to false for Llama 3.1 and 3.2 if a system message is provided.
             The default value is true in the Llama 3.1 and 3.2 chat templates, but these models will perform better if the tools are included in a system message.
             */
            for (key, value) in additionalContext {
                context[key] = value
            }
        }

        // Expose special-token attributes (bos_token, eos_token, …) from the config to the template.
        for (key, value) in tokenizerConfig.dictionary(or: [:]) {
            if specialTokenAttributes.contains(key.string), !value.isNull() {
                if let stringValue = value.string() {
                    context[key.string] = stringValue
                } else if let dictionary = value.dictionary() {
                    // Serialized AddedToken: extract its content string.
                    context[key.string] = addedTokenAsString(Config(dictionary))
                } else if let array: [String] = value.get() {
                    context[key.string] = array
                } else {
                    context[key.string] = value
                }
            }
        }

        let rendered = try template.render(context)
        var encodedTokens = encode(text: rendered, addSpecialTokens: false)
        // Clamp to the requested maxLength and the model's maximum context length;
        // truncate only when the caller asked for truncation.
        var maxLength = maxLength ?? encodedTokens.count
        maxLength = min(maxLength, tokenizerConfig.modelMaxLength.integer() ?? maxLength)
        if encodedTokens.count > maxLength {
            if truncation {
                encodedTokens = Array(encodedTokens.prefix(maxLength))
            }
        }

        return encodedTokens
    }
}
// MARK: - Building

/// Namespace for constructing tokenizers from Hub configuration (see the extension below).
public struct AutoTokenizer { }
struct PreTrainedTokenizerClasses {
    /// Class overrides for custom behaviour
    /// Not to be confused with the TokenizerModel classes defined in TokenizerModel
    static let tokenizerClasses: [String: PreTrainedTokenizer.Type] = [
        "LlamaTokenizer": LlamaPreTrainedTokenizer.self,
    ]
}
public extension AutoTokenizer {
    /// Resolves the `PreTrainedTokenizer` subclass to use for the given config,
    /// falling back to the base class when there is no override.
    internal static func tokenizerClass(for tokenizerConfig: Config) -> PreTrainedTokenizer.Type {
        guard let tokenizerClassName = tokenizerConfig.tokenizerClass.string() else {
            return PreTrainedTokenizer.self
        }

        // Some tokenizer_class entries use a Fast suffix. Strip only the trailing
        // "Fast": `replacingOccurrences` would also remove interior occurrences.
        let tokenizerName = tokenizerClassName.hasSuffix("Fast") ? String(tokenizerClassName.dropLast(4)) : tokenizerClassName
        if let tokenizerClass = PreTrainedTokenizerClasses.tokenizerClasses[tokenizerName] {
            return tokenizerClass
        }

        return PreTrainedTokenizer.self
    }

    /// Builds a tokenizer from already-loaded configuration, honoring class overrides.
    static func from(tokenizerConfig: Config, tokenizerData: Config) throws -> Tokenizer {
        let tokenizerClass = tokenizerClass(for: tokenizerConfig)
        return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
    }

    /// Downloads configuration for a Hub model id and builds the tokenizer.
    /// - Throws: `TokenizerError.missingConfig` if the tokenizer config is unavailable.
    static func from(
        pretrained model: String,
        hubApi: HubApi = .shared
    ) async throws -> Tokenizer {
        let config = LanguageModelConfigurationFromHub(modelName: model, hubApi: hubApi)
        guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig }
        let tokenizerData = try await config.tokenizerData

        return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
    }

    /// Loads configuration from a local model folder and builds the tokenizer.
    /// - Throws: `TokenizerError.missingConfig` if the tokenizer config is unavailable.
    static func from(
        modelFolder: URL,
        hubApi: HubApi = .shared
    ) async throws -> Tokenizer {
        let config = LanguageModelConfigurationFromHub(modelFolder: modelFolder, hubApi: hubApi)
        guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig }
        let tokenizerData = try await config.tokenizerData

        // Bug fix: resolve class overrides (e.g. LlamaPreTrainedTokenizer) exactly like
        // `from(pretrained:)` does, instead of always constructing the base PreTrainedTokenizer.
        return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
    }
}
// MARK: - Tokenizer model classes

// Distinct names allow `TokenizerModel.knownTokenizers` to map Hub `tokenizer_class`
// values onto the shared BPE/Unigram implementations; no behavior is overridden here.
class GPT2Tokenizer: BPETokenizer { }
class FalconTokenizer: BPETokenizer { }
class LlamaTokenizer: BPETokenizer { }
class CodeGenTokenizer: BPETokenizer { }
class WhisperTokenizer: BPETokenizer { }
class GemmaTokenizer: BPETokenizer { }
class CodeLlamaTokenizer: BPETokenizer { }
class CohereTokenizer: BPETokenizer { }
class Qwen2Tokenizer: BPETokenizer { }
class T5Tokenizer: UnigramTokenizer { }
// MARK: - PreTrainedTokenizer classes

/// SentencePiece metaspace character ("▁"), used below as the Metaspace
/// pre-tokenizer's `replacement` character in `LlamaPreTrainedTokenizer`.
let sentencePieceUnderline = "▁"
/// Hack for Llama tokenizers, see https://github.com/huggingface/transformers/blob/bcb841f0073fcd7a4fb88ea8064313c17dcab04a/src/transformers/models/llama/tokenization_llama_fast.py#L181
/// Return updated config, or nil
///
/// Builds a TemplateProcessing post-processor config that inserts bos/eos tokens
/// according to `add_bos_token` / `add_eos_token` in the tokenizer config.
/// - Throws: `TokenizerError.mismatchedConfig` when a token is requested but undefined.
func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?) throws -> Config? {
    // If it's already a Template processor (instead of a ByteLevel one), assume it's correct
    let postProcessor = PostProcessorFactory.fromConfig(config: processorConfig)
    guard !(postProcessor is TemplateProcessing) else { return nil }

    let addBosToken = tokenizerConfig.addBosToken.boolean(or: false)
    let bosToken = addedTokenAsString(tokenizerConfig.bosToken)
    if addBosToken, bosToken == nil {
        throw TokenizerError.mismatchedConfig("add_bos_token is True but bos_token is nil")
    }

    let addEosToken = tokenizerConfig.addEosToken.boolean(or: false)
    let eosToken = addedTokenAsString(tokenizerConfig.eosToken)
    if addEosToken, eosToken == nil {
        throw TokenizerError.mismatchedConfig("add_eos_token is True but eos_token is nil")
    }

    // alt implementation
    // Single-sequence template: [bos] A [eos]. Force-unwraps are safe: nil cases throw above.
    var single: [[String: Any]] = []
    if addBosToken {
        single = single + [["SpecialToken": ["id": bosToken!, "type_id": 0]]]
    }
    single = single + [["Sequence": ["id": "A", "type_id": 0]]]
    if addEosToken {
        single = single + [["SpecialToken": ["id": eosToken!, "type_id": 0]]]
    }

    // Pair template extends the single one: [bos] A [eos] [bos] B [eos], with the
    // second segment using type_id 1.
    var pair: [[String: Any]] = single
    if addBosToken {
        pair = pair + [["SpecialToken": ["id": bosToken!, "type_id": 1]]]
    }
    pair = pair + [["Sequence": ["id": "B", "type_id": 1]]]
    if addEosToken {
        pair = pair + [["SpecialToken": ["id": eosToken!, "type_id": 1]]]
    }

    let postProcessorConfig = Config(["type": PostProcessorType.TemplateProcessing.rawValue, "single": single, "pair": pair])
    return postProcessorConfig
}
/// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions
///
/// Llama-specific overrides: for non-legacy tokenizers, replaces the normalizer with a
/// Metaspace pre-tokenizer, and installs a TemplateProcessing post-processor when needed.
class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
    // `legacy` defaults to true when absent from the config.
    let isLegacy: Bool

    required init(tokenizerConfig: Config, tokenizerData: Config) throws {
        isLegacy = tokenizerConfig.legacy.boolean(or: true)
        var configDictionary = tokenizerData.dictionary(or: [:])
        if !isLegacy {
            // Non-legacy mode: drop the normalizer and pre-tokenize with Metaspace instead.
            _ = configDictionary.removeValue(forKey: "normalizer")
            configDictionary["pre_tokenizer"] = [
                "type": "Metaspace", "replacement": .init(sentencePieceUnderline), "add_prefix_space": true, "prepend_scheme": "first",
            ]
        }

        // Install the bos/eos TemplateProcessing post-processor if the existing one isn't already one.
        if let postProcessorConfig = try maybeUpdatePostProcessor(tokenizerConfig: tokenizerConfig, processorConfig: tokenizerData["postProcessor"]) {
            configDictionary["post_processor"] = .init(postProcessorConfig.dictionary(or: [:]))
        }

        let updatedData = Config(configDictionary)
        try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData)
    }
}