std::u16string utf8ToUtf16SIMD()

in cpp/fury/util/string_util.cc [28:149]


// Converts a UTF-8 encoded string to UTF-16.
//
// Despite the name, this implementation is a plain scalar loop: it walks the
// input one UTF-8 sequence at a time and appends the resulting code unit(s)
// directly to the output string.
//
// Parameters:
//   utf8             - input bytes, interpreted as UTF-8.
//   is_little_endian - when true, code units are stored exactly as computed;
//                      when false, the two bytes of every code unit
//                      (including ASCII and both halves of a surrogate pair)
//                      are swapped before being stored.
//
// Returns the converted UTF-16 string.
//
// Handling of malformed input:
//   - The sequence length is chosen from the lead byte alone (< 0x80: 1 byte,
//     < 0xE0: 2 bytes, < 0xF0: 3 bytes, otherwise 4 bytes). Continuation
//     bytes are not validated; only their low six bits are used.
//   - If the final sequence is cut short by the end of the input, a single
//     U+FFFD REPLACEMENT CHARACTER is emitted and conversion stops. No byte
//     past the end of the input is ever read.
std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) {
  const size_t n = utf8.size();

  std::u16string utf16;
  // Every sequence produces at most one code unit per input byte (a 4-byte
  // sequence yields two units), so this reservation is never outgrown.
  utf16.reserve(n);

  // Appends one code unit, swapping its bytes when big-endian output is
  // requested. Routing every unit through here keeps the byte order uniform.
  const auto emit = [&utf16, is_little_endian](uint32_t value) {
    uint16_t unit = static_cast<uint16_t>(value);
    if (!is_little_endian) {
      unit = static_cast<uint16_t>((unit >> 8) | (unit << 8));
    }
    utf16.push_back(static_cast<char16_t>(unit));
  };

  // Returns the six payload bits of the continuation byte at `pos`.
  const auto payload = [&utf8](size_t pos) -> uint32_t {
    return static_cast<uint8_t>(utf8[pos]) & 0x3Fu;
  };

  size_t i = 0;
  while (i < n) {
    const uint32_t lead = static_cast<uint8_t>(utf8[i]);

    // Sequence length implied by the lead byte.
    size_t length = 4;
    if (lead < 0x80) {
      length = 1;
    } else if (lead < 0xE0) {
      length = 2;
    } else if (lead < 0xF0) {
      length = 3;
    }

    // Truncated trailing sequence: do not read past the end of the input.
    if (length > n - i) {
      emit(0xFFFD);
      break;
    }

    switch (length) {
    case 1:
      // 1-byte character (ASCII)
      emit(lead);
      break;
    case 2:
      // 2-byte character: 5 + 6 payload bits
      emit(((lead & 0x1F) << 6) | payload(i + 1));
      break;
    case 3:
      // 3-byte character: 4 + 6 + 6 payload bits
      emit(((lead & 0x0F) << 12) | (payload(i + 1) << 6) | payload(i + 2));
      break;
    default: {
      // 4-byte character: lies outside the BMP, so it is encoded as a
      // surrogate pair (high surrogate first).
      const uint32_t code_point = ((lead & 0x07) << 18) |
                                  (payload(i + 1) << 12) |
                                  (payload(i + 2) << 6) | payload(i + 3);
      emit(0xD800 + ((code_point - 0x10000) >> 10));
      emit(0xDC00 + (code_point & 0x3FF));
      break;
    }
    }

    i += length;
  }

  return utf16;
}