in Sources/Tokenizers/Normalizer.swift [233:268]
/// Applies a simplified version of the "precompiled" normalization to `text`.
///
/// The input is processed one Unicode scalar at a time:
/// - a fixed set of non-printing control characters is dropped;
/// - a fixed set of separator-like scalars is replaced by one ASCII space;
/// - every other scalar is kept unchanged.
///
/// The filtered text is then passed through
/// `precomposedStringWithCompatibilityMapping` (Unicode NFKC), with one
/// exception: U+FF5E FULLWIDTH TILDE is kept verbatim. To achieve that, the
/// text between tildes is normalized piece by piece and the tildes are put
/// back in between, so leading, trailing and consecutive tildes all survive.
///
/// - Parameter text: The string to normalize.
/// - Returns: The normalized string.
func normalize(text: String) -> String {
    // TODO: This is a simplified implementation.
    // - The following comments also apply here:
    // https://github.com/xenova/transformers.js/blob/main/src/tokenizers.js#L2237-L2247
    // - For a proper implementation, see:
    // https://github.com/huggingface/tokenizers/blob/b58227c7f1ccf8b73ee2268354336da56d91e492/tokenizers/src/normalizers/precompiled.rs#L36
    let space: Unicode.Scalar = " "

    // Completed runs of text that were terminated by a fullwidth tilde.
    var segments: [String] = []
    // The run currently being collected (text since the last tilde, if any).
    var current = String.UnicodeScalarView()

    for scalar in text.unicodeScalars {
        switch scalar.value {
        case 0x0001...0x0008, 0x000B, 0x000E...0x001F, 0x007F, 0x008F, 0x009F:
            // Non-printing control characters: removed from the output.
            continue
        case 0x0009, 0x000A, 0x000C, 0x000D, 0x1680, 0x200B...0x200F, 0x2028, 0x2029, 0x2581,
            0xFEFF, 0xFFFD:
            // Separators: collapsed to a plain space.
            current.append(space)
        case 0xFF5E:
            // Fullwidth tilde: close the current run (even when it is empty) so
            // the tilde can be re-inserted untouched when the runs are joined.
            segments.append(String(current))
            current = String.UnicodeScalarView()
        default:
            current.append(scalar)
        }
    }
    segments.append(String(current))

    // Without any tilde there is exactly one segment, so this is simply the
    // NFKC form of the whole filtered text.
    return segments
        .map { $0.precomposedStringWithCompatibilityMapping }
        .joined(separator: "\u{FF5E}")
}