private static Charset detectUtfCharset0()

in lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java [165:225]


  /**
   * Determines the UTF charset of a text from its first few bytes, using a byte
   * order mark (BOM) when one is present and otherwise guessing from the
   * position of zero bytes.
   *
   * @param firstFewBytes the leading bytes of the text to inspect
   * @param numBytes      the number of bytes of {@code firstFewBytes} to
   *                      consider; patterns that need more bytes than this are
   *                      not matched
   * @return the detected charset, or {@code null} when the bytes match one of
   *         the unsupported UCS-4 byte orders (2143 or 3412)
   */
  private static Charset detectUtfCharset0(byte[] firstFewBytes, int numBytes) {
    // spotless:off
    /*
     * Lookup table, adapted from https://www.w3.org/TR/xml/#sec-guessing
     * It omits non-UTF encodings (the 2nd and 3rd rows from the end).
     * Note that the order (with respect to UTF-32 & UTF-16) is important!
     *
     * (the non-zero bytes encode the byte order mark, BOM)
     *
     * Match the 'magic bytes' in order, and take the first match:
     * 00 00 FE FF -> UTF-32 (be)
     * FF FE 00 00 -> UTF-32 (le)
     * 00 00 FF FE -> unsupported UCS-4 (byte order 2143)
     * FE FF 00 00 -> unsupported UCS-4 (byte order 3412)
     * FE FF __ __ -> UTF-16 (be)
     * FF FE __ __ -> UTF-16 (le)
     * EF BB BF __ -> UTF-8
     * 00 00 00 __ -> UTF-32BE
     * __ 00 00 00 -> UTF-32LE
     * 00 00 __ 00 -> unsupported UCS-4 (byte order 2143)
     * 00 __ 00 00 -> unsupported UCS-4 (byte order 3412)
     * 00 __ __ __ -> UTF-16BE
     * __ 00 __ __ -> UTF-16LE
     * __ __ __ __ -> UTF-8 (fallback)
     */
    // spotless:on

    // The comparisons below treat the first byte as the most significant byte
    // of 'quad'. NOTE(review): how quad(...) fills positions beyond numBytes is
    // not visible here, which is why every pattern is guarded by numBytes.
    int quad = quad(firstFewBytes, numBytes);
    // The first two bytes only.
    int word = quad >>> 16;
    if (numBytes > 3 && (quad == 0x0000FEFF || quad == 0xFFFE0000)) {
      // With BOM: UTF-32 (Charset handles BOM & endianness)
      return UTF_32;
    } else if (numBytes > 3 && (quad == 0x0000FFFE || quad == 0xFEFF0000)) {
      // With BOM: unsupported UCS-4 encoding (byte order 2143 resp. 3412)
      return null;
    } else if (numBytes > 1 && (word == 0xFEFF || word == 0xFFFE)) {
      // With BOM: UTF-16 (Charset handles BOM & endianness)
      return StandardCharsets.UTF_16;
    } else if (numBytes > 2 && quad >>> 8 == 0xEFBBBF) {
      // With BOM: UTF-8 (Charset does not handle a BOM, so our caller must skip it)
      return StandardCharsets.UTF_8;
    } else if (numBytes > 3 && (quad & 0xFFFFFF00) == 0) {
      // Without BOM (i.e., a guess)
      return UTF_32BE;
    } else if (numBytes > 3 && (quad & 0x00FFFFFF) == 0) {
      // Without BOM (i.e., a guess)
      return UTF_32LE;
    } else if (numBytes > 3 && ((quad & 0xFFFF00FF) == 0 || (quad & 0xFF00FFFF) == 0)) {
      // Without BOM (i.e., a guess): unsupported UCS-4 encoding (byte order 2143
      // resp. 3412).
      // The inner parentheses are required: '&&' binds tighter than '||', so
      // without them the 3412 test would also run on inputs shorter than four
      // bytes and could pre-empt the UTF-16 guesses below.
      return null;
    } else if (numBytes > 1 && (word & 0xFF00) == 0) {
      // Without BOM (i.e., a guess)
      return StandardCharsets.UTF_16BE;
    } else if (numBytes > 1 && (word & 0x00FF) == 0) {
      // Without BOM (i.e., a guess)
      return StandardCharsets.UTF_16LE;
    } else {
      // Fallback
      return StandardCharsets.UTF_8;
    }
  }