func normalize()

in Sources/Tokenizers/Normalizer.swift [233:268]


    /// Returns `text` with a simplified approximation of a precompiled
    /// (sentencepiece-style) normalization applied.
    ///
    /// Processing happens scalar by scalar:
    /// - selected non-printing control scalars are removed;
    /// - selected separator / placeholder scalars are replaced by a single
    ///   ASCII space;
    /// - everything else is normalized to NFKC, except U+FF5E (FULLWIDTH
    ///   TILDE), which is kept verbatim.
    ///
    /// - Parameter text: The string to normalize.
    /// - Returns: The normalized string. Every U+FF5E present in `text`
    ///   (including leading, trailing and adjacent occurrences) is preserved.
    func normalize(text: String) -> String {
        // TODO: This is a simplified implementation.
        // - The following comments also apply here:
        // https://github.com/xenova/transformers.js/blob/main/src/tokenizers.js#L2237-L2247
        // - For a proper implementation, see:
        // https://github.com/huggingface/tokenizers/blob/b58227c7f1ccf8b73ee2268354336da56d91e492/tokenizers/src/normalizers/precompiled.rs#L36

        // NFKC would fold U+FF5E into the ASCII tilde U+007E. To keep it intact,
        // the text is cut into segments at every U+FF5E, each segment is
        // normalized on its own, and the segments are re-joined with U+FF5E.
        // Empty segments are kept on purpose so that no tilde is lost.
        let fullwidthTilde = "\u{FF5E}"
        var normalizedSegments: [String] = []
        var segment = ""

        for scalar in text.unicodeScalars {
            switch scalar.value {
            case 0x0001...0x0008, 0x000B, 0x000E...0x001F, 0x007F, 0x008F, 0x009F:
                // Non-printing control characters: dropped entirely.
                continue
            case 0x0009, 0x000A, 0x000C, 0x000D, 0x1680, 0x200B...0x200F, 0x2028, 0x2029, 0x2581,
                 0xFEFF, 0xFFFD:
                // Separators: each one becomes a single ASCII space.
                segment.append(" ")
            case 0xFF5E:
                // Segment boundary: normalize what was collected so far and start over.
                normalizedSegments.append(segment.precomposedStringWithCompatibilityMapping)
                segment = ""
            default:
                segment.unicodeScalars.append(scalar)
            }
        }

        // Flush the final segment (the whole text when no U+FF5E was seen).
        normalizedSegments.append(segment.precomposedStringWithCompatibilityMapping)
        return normalizedSegments.joined(separator: fullwidthTilde)
    }