in paimon-common/src/main/java/org/apache/paimon/data/BinaryString.java [827:912]
public static int decodeUTF8Strict(byte[] sa, int sp, int len, char[] da) {
final int sl = sp + len;
int dp = 0;
int dlASCII = Math.min(len, da.length);
// ASCII only optimized loop
while (dp < dlASCII && sa[sp] >= 0) {
da[dp++] = (char) sa[sp++];
}
while (sp < sl) {
int b1 = sa[sp++];
if (b1 >= 0) {
// 1 byte, 7 bits: 0xxxxxxx
da[dp++] = (char) b1;
} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx
if (sp < sl) {
int b2 = sa[sp++];
if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2)
return -1;
} else {
da[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80)));
}
continue;
}
return -1;
} else if ((b1 >> 4) == -2) {
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
if (sp + 1 < sl) {
int b2 = sa[sp++];
int b3 = sa[sp++];
if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80)
|| (b2 & 0xc0) != 0x80
|| (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3)
return -1;
} else {
char c =
(char)
((b1 << 12)
^ (b2 << 6)
^ (b3
^ (((byte) 0xE0 << 12)
^ ((byte) 0x80 << 6)
^ ((byte) 0x80))));
if (Character.isSurrogate(c)) {
return -1;
} else {
da[dp++] = c;
}
}
continue;
}
return -1;
} else if ((b1 >> 3) == -2) {
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
if (sp + 2 < sl) {
int b2 = sa[sp++];
int b3 = sa[sp++];
int b4 = sa[sp++];
int uc =
((b1 << 18)
^ (b2 << 12)
^ (b3 << 6)
^ (b4
^ (((byte) 0xF0 << 18)
^ ((byte) 0x80 << 12)
^ ((byte) 0x80 << 6)
^ ((byte) 0x80))));
// isMalformed4 and shortest form check
if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80)
|| !Character.isSupplementaryCodePoint(uc)) {
return -1;
} else {
da[dp++] = Character.highSurrogate(uc);
da[dp++] = Character.lowSurrogate(uc);
}
continue;
}
return -1;
} else {
return -1;
}
}
return dp;
}