in lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java [165:225]
/**
 * Guesses which UTF charset a byte sequence is encoded in by matching its first
 * (up to four) bytes against the lookup table below.
 *
 * <p>
 * Patterns that need four bytes are only tried when {@code numBytes > 3}, the
 * UTF-8 BOM when {@code numBytes > 2}, and the two-byte patterns when
 * {@code numBytes > 1}. Shorter input falls through to the UTF-8 fallback.
 * </p>
 *
 * @param firstFewBytes buffer holding the start of the data; it is only read
 *                      via {@code quad(firstFewBytes, numBytes)}
 * @param numBytes      how many bytes are available for detection (presumably
 *                      the number of valid leading bytes in
 *                      {@code firstFewBytes} -- confirm against the caller)
 * @return the detected charset; {@code null} when the bytes match one of the
 *         unsupported UCS-4 byte orders (2143 or 3412); UTF-8 when nothing else
 *         matches
 */
private static Charset detectUtfCharset0(byte[] firstFewBytes, int numBytes) {
  // spotless:off
  /*
   * Lookup table, adapted from https://www.w3.org/TR/xml/#sec-guessing
   * It omits non-UTF encodings (the 2nd and 3rd rows from the end).
   * Note that the order (with respect to UTF-32 & UTF-16) is important!
   *
   * (the non-zero bytes encode the byte order mark, BOM)
   *
   * Match the 'magic bytes' in order, and take the first match:
   * 00 00 FE FF -> UTF-32 (be)
   * FF FE 00 00 -> UTF-32 (le)
   * 00 00 FF FE -> unsupported UCS-4 (byte order 2143)
   * FE FF 00 00 -> unsupported UCS-4 (byte order 3412)
   * FE FF __ __ -> UTF-16 (be)
   * FF FE __ __ -> UTF-16 (le)
   * EF BB BF __ -> UTF-8
   * 00 00 00 __ -> UTF-32BE
   * __ 00 00 00 -> UTF-32LE
   * 00 00 __ 00 -> unsupported UCS-4 (byte order 2143)
   * 00 __ 00 00 -> unsupported UCS-4 (byte order 3412)
   * 00 __ __ __ -> UTF-16BE
   * __ 00 __ __ -> UTF-16LE
   * __ __ __ __ -> UTF-8 (fallback)
   */
  // spotless:on

  // The masks below treat 'quad' as the first four bytes packed big-endian
  // (first byte in the top 8 bits); 'word' is then the first two bytes.
  int quad = quad(firstFewBytes, numBytes);
  int word = quad >>> 16;

  if (numBytes > 3 && (quad == 0x0000FEFF || quad == 0xFFFE0000)) {
    // With BOM: UTF-32 (Charset handles BOM & endianness)
    return UTF_32;
  } else if (numBytes > 3 && (quad == 0x0000FFFE || quad == 0xFEFF0000)) {
    // With BOM: unsupported UCS-4 encoding (byte order 2143 resp. 3412)
    return null;
  } else if (numBytes > 1 && (word == 0xFEFF || word == 0xFFFE)) {
    // With BOM: UTF-16 (Charset handles BOM & endianness)
    // Must come after the UTF-32 checks: FF FE 00 00 also starts with FF FE.
    return StandardCharsets.UTF_16;
  } else if (numBytes > 2 && (quad >>> 8) == 0xEFBBBF) {
    // With BOM: UTF-8 (Charset does not handle a BOM, so our caller must skip it)
    return StandardCharsets.UTF_8;
  } else if (numBytes > 3 && (quad & 0xFFFFFF00) == 0) {
    // Without BOM (i.e., a guess): 00 00 00 __
    return UTF_32BE;
  } else if (numBytes > 3 && (quad & 0x00FFFFFF) == 0) {
    // Without BOM (i.e., a guess): __ 00 00 00
    return UTF_32LE;
  } else if (numBytes > 3 && ((quad & 0xFFFF00FF) == 0 || (quad & 0xFF00FFFF) == 0)) {
    // Without BOM (i.e., a guess): unsupported UCS-4 encoding (byte order 2143
    // resp. 3412).
    // The inner parentheses are essential: '&&' binds tighter than '||', so
    // without them the 3412 pattern would be tested even when fewer than four
    // bytes are available, and short input could be rejected as unsupported
    // instead of reaching the UTF-16 / UTF-8 branches below.
    return null;
  } else if (numBytes > 1 && (word & 0xFF00) == 0) {
    // Without BOM (i.e., a guess): 00 __
    return StandardCharsets.UTF_16BE;
  } else if (numBytes > 1 && (word & 0x00FF) == 0) {
    // Without BOM (i.e., a guess): __ 00
    return StandardCharsets.UTF_16LE;
  } else {
    // Fallback
    return StandardCharsets.UTF_8;
  }
}