in cpp/fury/util/string_util.cc [28:149]
// Converts a UTF-8 encoded string into UTF-16 code units.
//
// Parameters:
//   utf8             - input bytes, interpreted as UTF-8.
//   is_little_endian - when true, code units are stored as computed; when
//                      false, the two bytes of EVERY emitted code unit
//                      (ASCII included) are swapped.
//
// Returns the decoded UTF-16 string. Code points above U+FFFF are emitted as
// a high/low surrogate pair.
//
// Decoding is lenient: the sequence length is chosen from the lead byte only
// (< 0x80 -> 1 byte, < 0xE0 -> 2, < 0xF0 -> 3, otherwise 4) and continuation
// bytes are masked with 0x3F without being validated. The only malformed
// input that is detected is a sequence cut off by the end of the string; it
// is replaced by a single U+FFFD and decoding stops, so the function never
// reads past the end of `utf8`.
//
// Despite the name, this implementation is scalar. Sequences are decoded in
// one continuous pass, so a multi-byte character can never be split across
// an internal chunk boundary.
std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) {
  const size_t n = utf8.size();

  // Every iteration below consumes at least as many bytes as it emits code
  // units (1->1, 2->1, 3->1, 4->2, truncated tail ->1), so `n` units is a
  // safe upper bound and the result can be written in place.
  std::u16string utf16(n, u'\0');
  char16_t *const begin = &utf16[0];
  char16_t *output = begin;

  // Stores one code unit (low 16 bits of `unit`), byte-swapped when the
  // caller did not ask for little-endian output.
  const auto emit = [&output, is_little_endian](uint32_t unit) {
    uint16_t value = static_cast<uint16_t>(unit);
    if (!is_little_endian) {
      value = static_cast<uint16_t>((value >> 8) | (value << 8));
    }
    *output++ = static_cast<char16_t>(value);
  };

  // Payload (low six bits) of the continuation byte at `pos`.
  const auto continuation = [&utf8](size_t pos) -> uint32_t {
    return static_cast<uint8_t>(utf8[pos]) & 0x3Fu;
  };

  size_t i = 0;
  while (i < n) {
    const uint32_t lead = static_cast<uint8_t>(utf8[i]);
    const size_t length =
        lead < 0x80 ? 1 : (lead < 0xE0 ? 2 : (lead < 0xF0 ? 3 : 4));

    if (length > n - i) {
      // Sequence is cut off by the end of the input: substitute U+FFFD
      // instead of reading bytes that do not exist.
      emit(0xFFFDu);
      break;
    }

    switch (length) {
    case 1:
      // 1-byte character (ASCII)
      emit(lead);
      break;
    case 2:
      // 2-byte character
      emit(((lead & 0x1Fu) << 6) | continuation(i + 1));
      break;
    case 3:
      // 3-byte character
      emit(((lead & 0x0Fu) << 12) | (continuation(i + 1) << 6) |
           continuation(i + 2));
      break;
    default: {
      // 4-byte character: split the code point into a surrogate pair.
      const uint32_t code_point =
          ((lead & 0x07u) << 18) | (continuation(i + 1) << 12) |
          (continuation(i + 2) << 6) | continuation(i + 3);
      emit(0xD800u + ((code_point - 0x10000u) >> 10));
      // 0x10000 has no bits in the low ten, so masking the raw code point
      // equals masking (code_point - 0x10000).
      emit(0xDC00u + (code_point & 0x3FFu));
      break;
    }
    }
    i += length;
  }

  // Drop the unused tail of the pre-sized buffer.
  utf16.resize(static_cast<size_t>(output - begin));
  return utf16;
}