public static int convertUTF8ToUTF16()

in java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java [123:242]


  /**
   * Decodes UTF-8 bytes from {@code src} into UTF-16 code units written to {@code dst} as byte
   * pairs.
   *
   * <p>Every code unit occupies two consecutive bytes of {@code dst}, laid out in the byte order
   * indicated by {@code Platform.IS_LITTLE_ENDIAN}: low byte first when it is {@code true}, high
   * byte first otherwise. Both bytes of every code unit are always written, so {@code dst} does not
   * need to be zero-filled beforehand.
   *
   * <p>Decoding is strict: truncated sequences, missing continuation bytes, overlong encodings,
   * surrogate code points encoded as three bytes, and values above U+10FFFF are rejected.
   *
   * <p>No bounds checks are performed on {@code dst}. Each input byte produces at most one code
   * unit, so {@code 2 * len} bytes of capacity are always sufficient.
   *
   * @param src buffer holding the UTF-8 input
   * @param offset index in {@code src} of the first byte to decode
   * @param len number of bytes to decode, starting at {@code offset}
   * @param dst destination buffer; written from index 0
   * @return number of bytes written to {@code dst}, or {@code -1} if the input is not well-formed
   *     UTF-8 (in which case {@code dst} may already contain partially decoded output)
   */
  public static int convertUTF8ToUTF16(byte[] src, int offset, int len, byte[] dst) {
    final int end = offset + len;
    // Position of the low / high byte inside each two-byte code unit. Using the same indices on
    // every path keeps the ASCII fast path and the multi-byte slow path in the same byte order.
    final int lo = Platform.IS_LITTLE_ENDIAN ? 0 : 1;
    final int hi = 1 - lo;
    int dp = 0;

    while (offset < end) {
      if (offset + 8 <= end
          && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) & 0x8080808080808080L)
              == 0) {
        // Fast path: none of the next eight bytes has its top bit set, so all are ASCII.
        // The high byte is written explicitly so the result does not depend on dst's old contents.
        for (int i = 0; i < 8; i++) {
          dst[dp + lo] = src[offset + i];
          dst[dp + hi] = 0;
          dp += 2;
        }
        offset += 8;
      } else {
        // Bytes are sign-extended on purpose: the range checks below compare against negative
        // values, and the XOR constants cancel the sign-extension bits.
        int b0 = src[offset++];
        if (b0 >= 0) {
          // 1 byte, 7 bits: 0xxxxxxx
          dst[dp + lo] = (byte) b0;
          dst[dp + hi] = 0;
          dp += 2;
        } else if ((b0 >> 5) == -2 && (b0 & 0x1e) != 0) {
          // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
          // (b0 & 0x1e) != 0 rejects the overlong lead bytes 0xC0 and 0xC1.
          if (offset >= end) {
            return -1;
          }
          int b1 = src[offset++];
          if ((b1 & 0xc0) != 0x80) { // isNotContinuation(b1)
            return -1;
          }
          char c = (char) (((b0 << 6) ^ b1) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80)));
          dst[dp + lo] = (byte) c;
          dst[dp + hi] = (byte) (c >> 8);
          dp += 2;
        } else if ((b0 >> 4) == -2) {
          // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
          if (offset + 1 >= end) {
            return -1;
          }
          int b1 = src[offset];
          int b2 = src[offset + 1];
          offset += 2;
          if ((b0 == (byte) 0xe0 && (b1 & 0xe0) == 0x80) // overlong 3-byte form
              || (b1 & 0xc0) != 0x80
              || (b2 & 0xc0) != 0x80) { // isMalformed3(b0, b1, b2)
            return -1;
          }
          char c =
              (char)
                  ((b0 << 12)
                      ^ (b1 << 6)
                      ^ (b2 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80))));
          // Surrogate code points must not be encoded directly in UTF-8.
          if (Character.isSurrogate(c)) {
            return -1;
          }
          dst[dp + lo] = (byte) c;
          dst[dp + hi] = (byte) (c >> 8);
          dp += 2;
        } else if ((b0 >> 3) == -2) {
          // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          if (offset + 2 >= end) {
            return -1;
          }
          int b1 = src[offset];
          int b2 = src[offset + 1];
          int b3 = src[offset + 2];
          offset += 3;
          int uc =
              ((b0 << 18)
                  ^ (b1 << 12)
                  ^ (b2 << 6)
                  ^ (b3
                      ^ (((byte) 0xF0 << 18)
                          ^ ((byte) 0x80 << 12)
                          ^ ((byte) 0x80 << 6)
                          ^ ((byte) 0x80))));
          if ((b1 & 0xc0) != 0x80
              || (b2 & 0xc0) != 0x80
              || (b3 & 0xc0) != 0x80 // isMalformed4
              // shortest-form and upper-bound check: must lie in U+10000..U+10FFFF
              || !Character.isSupplementaryCodePoint(uc)) {
            return -1;
          }
          // A supplementary code point becomes a high/low surrogate pair.
          char high = Character.highSurrogate(uc);
          dst[dp + lo] = (byte) high;
          dst[dp + hi] = (byte) (high >> 8);
          dp += 2;

          char low = Character.lowSurrogate(uc);
          dst[dp + lo] = (byte) low;
          dst[dp + hi] = (byte) (low >> 8);
          dp += 2;
        } else {
          // Stray continuation byte or an invalid lead byte (0xC0, 0xC1, 0xF8..0xFF).
          return -1;
        }
      }
    }
    return dp;
  }