in vector/src/main/java/org/apache/arrow/vector/util/Text.java [434:508]
/**
 * Scans a byte range and reports the first position at which it stops being well-formed UTF-8.
 *
 * <p>The scan is a small state machine: it expects a lead byte, looks up how many trail bytes
 * that lead byte announces in {@code bytesFromUTF8}, and then consumes that many trail bytes.
 * The first trail byte gets extra range checks that reject overlong encodings, UTF-16
 * surrogates and code points above U+10FFFF.
 *
 * <p>A range that ends in the middle of a multi-byte sequence is reported as malformed; the
 * returned index is then the position of the lead byte of that incomplete sequence, which is
 * always inside the scanned range.
 *
 * @param utf8 buffer holding the bytes to check
 * @param start index of the first byte to check
 * @param len number of bytes to check, starting at {@code start}
 * @return an empty {@code Optional} when the whole range is well-formed; otherwise the index
 *     (into {@code utf8}) of the offending byte, or of the lead byte of a truncated sequence
 */
private static Optional<Integer> validateUTF8Internal(byte[] utf8, int start, int len) {
  final int end = start + len;
  int count = start;
  // Index of the lead byte of the sequence currently being decoded.
  int sequenceStart = start;
  int leadByte = 0;
  // Number of trail bytes still expected for the current sequence.
  int length = 0;
  int state = LEAD_BYTE;
  while (count < end) {
    int aByte = utf8[count] & 0xFF;
    switch (state) {
      case LEAD_BYTE:
        leadByte = aByte;
        sequenceStart = count;
        length = bytesFromUTF8[aByte];
        switch (length) {
          case 0: // check for ASCII
            if (leadByte > 0x7F) {
              return Optional.of(count);
            }
            break;
          case 1:
            // 0xC0 and 0xC1 could only start overlong encodings of ASCII.
            if (leadByte < 0xC2 || leadByte > 0xDF) {
              return Optional.of(count);
            }
            state = TRAIL_BYTE_1;
            break;
          case 2:
            if (leadByte < 0xE0 || leadByte > 0xEF) {
              return Optional.of(count);
            }
            state = TRAIL_BYTE_1;
            break;
          case 3:
            // Lead bytes above 0xF4 would encode code points beyond U+10FFFF.
            if (leadByte < 0xF0 || leadByte > 0xF4) {
              return Optional.of(count);
            }
            state = TRAIL_BYTE_1;
            break;
          default:
            // too long! Longest valid UTF-8 is 4 bytes (lead + three)
            // or if < 0 we got a trail byte in the lead byte position
            return Optional.of(count);
        } // switch (length)
        break;
      case TRAIL_BYTE_1:
        // Overlong 4-byte form (would encode a code point below U+10000).
        if (leadByte == 0xF0 && aByte < 0x90) {
          return Optional.of(count);
        }
        // Code point above U+10FFFF.
        if (leadByte == 0xF4 && aByte > 0x8F) {
          return Optional.of(count);
        }
        // Overlong 3-byte form (would encode a code point below U+0800).
        if (leadByte == 0xE0 && aByte < 0xA0) {
          return Optional.of(count);
        }
        // UTF-16 surrogate range U+D800..U+DFFF is not allowed in UTF-8.
        if (leadByte == 0xED && aByte > 0x9F) {
          return Optional.of(count);
        }
        // falls through to regular trail-byte test!!
      case TRAIL_BYTE:
        if (aByte < 0x80 || aByte > 0xBF) {
          return Optional.of(count);
        }
        if (--length == 0) {
          state = LEAD_BYTE;
        } else {
          state = TRAIL_BYTE;
        }
        break;
      default:
        break;
    } // switch (state)
    count++;
  }
  if (state != LEAD_BYTE) {
    // Input ended while trail bytes were still expected: the last sequence is truncated.
    return Optional.of(sequenceStart);
  }
  return Optional.empty();
}