in java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java [123:242]
/**
 * Decodes the UTF-8 bytes in {@code src[offset, offset + len)} into UTF-16 code units, written
 * to {@code dst} as byte pairs starting at index 0.
 *
 * <p>Every code unit occupies two bytes whose order follows {@code Platform.IS_LITTLE_ENDIAN}
 * (low byte first when it is {@code true}, high byte first otherwise). Both bytes of every code
 * unit are always written, so the result does not depend on the previous contents of {@code
 * dst}.
 *
 * <p>The input is rejected when it contains a truncated sequence, a byte that is not a valid
 * continuation byte where one is required, an overlong 2-, 3- or 4-byte form, a 3-byte encoding
 * of a surrogate, a code point above U+10FFFF, or a lead byte that cannot start a sequence.
 *
 * <p>No bounds check is done on {@code dst}: it must hold at least {@code 2 * len} bytes (the
 * all-ASCII worst case), otherwise an {@link ArrayIndexOutOfBoundsException} is thrown.
 *
 * @param src array holding the UTF-8 encoded input
 * @param offset index in {@code src} of the first byte to decode
 * @param len number of bytes to decode
 * @param dst destination for the UTF-16 bytes, filled from index 0
 * @return the number of bytes written to {@code dst} (two per UTF-16 code unit), or {@code -1}
 *     if the input is malformed, in which case {@code dst} may hold partial output
 */
public static int convertUTF8ToUTF16(byte[] src, int offset, int len, byte[] dst) {
  final int end = offset + len;
  // Position of the low/high byte inside each two-byte code unit. Every store below goes through
  // these so the ASCII fast path and the multi-byte slow path always agree on byte order.
  final int lo = Platform.IS_LITTLE_ENDIAN ? 0 : 1;
  final int hi = 1 - lo;
  int dp = 0;
  while (offset < end) {
    if (offset + 8 <= end
        && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) & 0x8080808080808080L)
            == 0) {
      // Fast path: none of the next 8 bytes has its top bit set, so all of them are ASCII.
      // Kept unrolled like the original; the high byte is zeroed explicitly instead of relying
      // on dst having been zero-filled by the caller.
      dst[dp + lo] = src[offset];
      dst[dp + hi] = 0;
      dst[dp + lo + 2] = src[offset + 1];
      dst[dp + hi + 2] = 0;
      dst[dp + lo + 4] = src[offset + 2];
      dst[dp + hi + 4] = 0;
      dst[dp + lo + 6] = src[offset + 3];
      dst[dp + hi + 6] = 0;
      dst[dp + lo + 8] = src[offset + 4];
      dst[dp + hi + 8] = 0;
      dst[dp + lo + 10] = src[offset + 5];
      dst[dp + hi + 10] = 0;
      dst[dp + lo + 12] = src[offset + 6];
      dst[dp + hi + 12] = 0;
      dst[dp + lo + 14] = src[offset + 7];
      dst[dp + hi + 14] = 0;
      dp += 16;
      offset += 8;
    } else {
      // Bytes are sign-extended on purpose: lead-byte classes are recognised with arithmetic
      // shifts compared against negative constants.
      int b0 = src[offset++];
      if (b0 >= 0) {
        // 1 byte, 7 bits: 0xxxxxxx
        dst[dp + lo] = (byte) b0;
        dst[dp + hi] = 0;
        dp += 2;
      } else if ((b0 >> 5) == -2 && (b0 & 0x1e) != 0) {
        // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
        // (b0 & 0x1e) != 0 excludes the overlong lead bytes 0xC0 and 0xC1.
        if (offset >= end) {
          return -1;
        }
        int b1 = src[offset++];
        if ((b1 & 0xc0) != 0x80) { // isNotContinuation(b1)
          return -1;
        }
        // The XOR constant cancels the sign-extended marker bits of b0 and b1 in one step.
        char c = (char) (((b0 << 6) ^ b1) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80)));
        dst[dp + lo] = (byte) c;
        dst[dp + hi] = (byte) (c >> 8);
        dp += 2;
      } else if ((b0 >> 4) == -2) {
        // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
        if (offset + 1 >= end) {
          return -1;
        }
        int b1 = src[offset];
        int b2 = src[offset + 1];
        offset += 2;
        if ((b0 == (byte) 0xe0 && (b1 & 0xe0) == 0x80) // overlong: value would be < U+0800
            || (b1 & 0xc0) != 0x80
            || (b2 & 0xc0) != 0x80) { // isMalformed3(b0, b1, b2)
          return -1;
        }
        char c =
            (char)
                ((b0 << 12)
                    ^ (b1 << 6)
                    ^ (b2 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80))));
        // Surrogates must never be encoded directly in UTF-8.
        if (Character.isSurrogate(c)) {
          return -1;
        }
        dst[dp + lo] = (byte) c;
        dst[dp + hi] = (byte) (c >> 8);
        dp += 2;
      } else if ((b0 >> 3) == -2) {
        // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (offset + 2 >= end) {
          return -1;
        }
        int b1 = src[offset];
        int b2 = src[offset + 1];
        int b3 = src[offset + 2];
        offset += 3;
        int uc =
            ((b0 << 18)
                ^ (b1 << 12)
                ^ (b2 << 6)
                ^ (b3
                    ^ (((byte) 0xF0 << 18)
                        ^ ((byte) 0x80 << 12)
                        ^ ((byte) 0x80 << 6)
                        ^ ((byte) 0x80))));
        if ((b1 & 0xc0) != 0x80
            || (b2 & 0xc0) != 0x80
            || (b3 & 0xc0) != 0x80 // isMalformed4
            // Shortest-form and range check: only U+10000..U+10FFFF is valid here.
            || !Character.isSupplementaryCodePoint(uc)) {
          return -1;
        }
        // A supplementary code point becomes a high/low surrogate pair (two code units).
        char high = Character.highSurrogate(uc);
        char low = Character.lowSurrogate(uc);
        dst[dp + lo] = (byte) high;
        dst[dp + hi] = (byte) (high >> 8);
        dst[dp + lo + 2] = (byte) low;
        dst[dp + hi + 2] = (byte) (low >> 8);
        dp += 4;
      } else {
        // Stray continuation byte, overlong 2-byte lead (0xC0/0xC1) or lead byte >= 0xF8.
        return -1;
      }
    }
  }
  return dp;
}