in java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java [247:345]
/**
 * Decodes the UTF-8 bytes in {@code src[offset, offset + len)} into UTF-16 code units, writing
 * them to {@code dst} starting at index 0.
 *
 * <p>Whenever at least eight bytes remain and none of them has its high bit set, they are copied
 * as ASCII in one step; otherwise a single UTF-8 sequence is decoded. Overlong encodings, stray
 * or missing continuation bytes, truncated sequences, encoded surrogates and code points above
 * U+10FFFF are all treated as malformed.
 *
 * <p>No capacity check is made on {@code dst}; it has to be large enough for the decoded output.
 * When -1 is returned, {@code dst} may already contain the chars decoded before the bad sequence.
 *
 * @param src buffer holding the UTF-8 encoded bytes
 * @param offset index in {@code src} of the first byte to decode
 * @param len number of bytes to decode
 * @param dst destination for the decoded UTF-16 code units
 * @return the number of chars written to {@code dst}, or -1 if the input is malformed
 */
public static int convertUTF8ToUTF16(byte[] src, int offset, int len, char[] dst) {
  final int limit = offset + len;
  int srcPos = offset;
  int dstPos = 0;
  while (srcPos < limit) {
    // Fast path: test the high bit of the next eight bytes with a single 64-bit read.
    if (srcPos + 8 <= limit
        && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + srcPos) & 0x8080808080808080L)
            == 0) {
      for (int i = 0; i < 8; i++) {
        dst[dstPos + i] = (char) src[srcPos + i];
      }
      dstPos += 8;
      srcPos += 8;
      continue;
    }

    // Bytes are signed, so every non-ASCII lead byte is negative; the arithmetic shifts below
    // compare the leading bit pattern against -2 (binary ...110).
    final int lead = src[srcPos++];

    if (lead >= 0) {
      // 1 byte, 7 bits: 0xxxxxxx
      dst[dstPos++] = (char) lead;
      continue;
    }

    if ((lead >> 5) == -2 && (lead & 0x1e) != 0) {
      // 2 bytes, 11 bits: 110xxxxx 10xxxxxx (0xC0 and 0xC1 leads are overlong, hence the mask)
      if (srcPos >= limit) {
        return -1;
      }
      final int cont = src[srcPos++];
      if ((cont & 0xc0) != 0x80) {
        return -1;
      }
      // XOR with the combined marker bits strips the 110/10 prefixes and the sign extension.
      dst[dstPos++] = (char) ((lead << 6) ^ cont ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80)));
      continue;
    }

    if ((lead >> 4) == -2) {
      // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
      if (srcPos + 1 >= limit) {
        return -1;
      }
      final int second = src[srcPos];
      final int third = src[srcPos + 1];
      srcPos += 2;
      // A 0xE0 lead followed by 100xxxxx would encode a value that fits in two bytes.
      final boolean overlong = lead == (byte) 0xe0 && (second & 0xe0) == 0x80;
      if (overlong || (second & 0xc0) != 0x80 || (third & 0xc0) != 0x80) {
        return -1;
      }
      final char decoded =
          (char)
              ((lead << 12)
                  ^ (second << 6)
                  ^ third
                  ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80)));
      // Surrogate code units must never be encoded directly in UTF-8.
      if (decoded >= Character.MIN_SURROGATE && decoded <= Character.MAX_SURROGATE) {
        return -1;
      }
      dst[dstPos++] = decoded;
      continue;
    }

    if ((lead >> 3) == -2) {
      // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if (srcPos + 2 >= limit) {
        return -1;
      }
      final int second = src[srcPos];
      final int third = src[srcPos + 1];
      final int fourth = src[srcPos + 2];
      srcPos += 3;
      final int codePoint =
          (lead << 18)
              ^ (second << 12)
              ^ (third << 6)
              ^ fourth
              ^ (((byte) 0xF0 << 18)
                  ^ ((byte) 0x80 << 12)
                  ^ ((byte) 0x80 << 6)
                  ^ ((byte) 0x80));
      final boolean continuationsOk =
          (second & 0xc0) == 0x80 && (third & 0xc0) == 0x80 && (fourth & 0xc0) == 0x80;
      // Shortest-form and range check: only U+10000..U+10FFFF may use four bytes.
      final boolean supplementary =
          codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT
              && codePoint <= Character.MAX_CODE_POINT;
      if (!continuationsOk || !supplementary) {
        return -1;
      }
      // Split into a surrogate pair (same arithmetic as Character.highSurrogate/lowSurrogate).
      dst[dstPos] =
          (char)
              ((codePoint >>> 10)
                  + (Character.MIN_HIGH_SURROGATE
                      - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
      dst[dstPos + 1] = (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE);
      dstPos += 2;
      continue;
    }

    // Lone continuation byte or an invalid lead byte (0xC0, 0xC1, 0xF8 and above).
    return -1;
  }
  return dstPos;
}