public static String detectXmlFileEncoding()

in uimaj-core/src/main/java/org/apache/uima/pear/util/XMLUtil.java [156:335]


  /**
   * Detects the character encoding of the given XML file by inspecting its leading bytes.
   * <p>
   * The detection proceeds in three steps:
   * <ol>
   * <li>The file is checked with {@code isValidXmlFile}; if that check fails, {@code null} is
   * returned.</li>
   * <li>A possible UTF signature (BOM) is read and identified, then the first bytes of text are
   * decoded and compared against {@code FIRST_XML_CHARS}. A UTF-32 signature is recognized but
   * deliberately not evaluated in this step.</li>
   * <li>If the header check yields nothing, the file is re-read as UTF-8 and then as UTF-16, and
   * the encoding is accepted if the first non-blank line starts with {@code '<'}.</li>
   * </ol>
   *
   * @param xmlFile
   *          the XML file to examine
   * @return one of {@code "UTF-8"}, {@code "UTF-16"}, {@code "UTF-16LE"}, {@code "UTF-16BE"}, or
   *         {@code null} if the file is not a valid XML file or no supported encoding was
   *         recognized
   * @throws IOException
   *           if the file cannot be read, if it ends before the header probe could be filled
   *           (message {@code "cannot read file"}), or if any other failure occurs during
   *           detection (the original failure is attached as the cause)
   */
  public static String detectXmlFileEncoding(File xmlFile) throws IOException {
    String encoding = null;
    try {
      // first, make sure - this is a valid XML file
      if (!isValidXmlFile(xmlFile)) {
        return null;
      }
      // the stream is closed at the end of this block, before the file is re-opened below
      try (FileInputStream iStream = new FileInputStream(xmlFile)) {
        // read prefix - possible BOM or signature; stop at the 1st byte that cannot be part of one
        int byteCounter = 0;
        int nextByte;
        int[] prefix = new int[16];
        do {
          nextByte = iStream.read();
          if (nextByte < 0) {
            throw new IOException("cannot read file");
          }
          // store as possible UTF signature or BOM (only the first 16 bytes are kept)
          if (byteCounter < prefix.length) {
            prefix[byteCounter] = nextByte;
          }
          byteCounter++;
        } while (nextByte == 0xEF || nextByte == 0xBB || nextByte == 0xBF || nextByte == 0xFE
                || nextByte == 0xFF || nextByte == 0x00);
        // the last byte read is the 1st text byte, so it does not belong to the signature
        int prefixLength = Math.min(byteCounter - 1, prefix.length);
        String utfSignature = (prefixLength > 0)
                ? FileUtil.identifyUtfSignature(prefix, prefixLength)
                : null;
        boolean utf8Signature = utfSignature != null && utfSignature.startsWith("UTF-8");
        boolean utf16Signature = !utf8Signature && utfSignature != null
                && utfSignature.startsWith("UTF-16");
        boolean utf32Signature = !utf8Signature && !utf16Signature && utfSignature != null
                && utfSignature.startsWith("UTF-32");

        // number of text bytes to probe: 7 characters ("<?xml?>") in the signalled code unit size
        int bytes2put;
        // number of signature bytes that are copied in front of the text bytes
        int signatureLength;
        if (utf16Signature) {
          // UTF-16 - put signature bytes + 7x2 bytes
          bytes2put = 7 * 2;
          signatureLength = prefixLength;
        } else if (utf32Signature) {
          // UTF-32 - put signature bytes + 7x4 bytes
          bytes2put = 7 * 4;
          signatureLength = prefixLength;
        } else {
          // UTF8 or no signature - put only text characters
          bytes2put = 7;
          signatureLength = 0;
        }
        byte[] buffer = new byte[signatureLength + bytes2put];
        for (int i = 0; i < signatureLength; i++) {
          buffer[i] = (byte) prefix[i];
        }
        // store the 1st text byte (already consumed by the prefix loop) ...
        int filled = signatureLength;
        buffer[filled++] = (byte) nextByte;
        // ... and read the remaining text bytes; "read(...)" is not obliged to return all the
        // requested bytes at once, so it must be called in a loop
        while (filled < buffer.length) {
          int bytesRead = iStream.read(buffer, filled, buffer.length - filled);
          if (bytesRead == -1) {
            // the file ended before the header probe was complete
            throw new IOException("cannot read file");
          }
          filled += bytesRead;
        }

        // check first XML header characters
        // some Javas won't properly decode an odd number of bytes for utf16 coding
        // https://issues.apache.org/jira/browse/UIMA-2099
        byte[] buffer6 = new byte[6];
        System.arraycopy(buffer, 0, buffer6, 0, 6);
        if (utf8Signature) {
          // check for UTF-8
          String test = new String(buffer, StandardCharsets.UTF_8);
          if (test.startsWith(FIRST_XML_CHARS)) {
            encoding = "UTF-8";
          }
        } else if (utf16Signature) {
          // check for UTF-16 (buffer6 starts with the signature bytes copied above)
          String test = new String(buffer6, StandardCharsets.UTF_16);
          if (test.startsWith(FIRST_XML_CHARS)) {
            encoding = "UTF-16";
          }
        } else if (utf32Signature) {
          // we don't support this
        } else {
          // no signature - check for UTF-8 in XML header characters
          String test = new String(buffer, StandardCharsets.UTF_8);
          if (test.startsWith(FIRST_XML_CHARS)) {
            encoding = "UTF-8";
          } else {
            // next, check for UTF-16LE in XML header characters
            test = new String(buffer6, StandardCharsets.UTF_16LE);
            if (test.startsWith(FIRST_XML_CHARS)) {
              encoding = "UTF-16LE";
            } else {
              // next, check for UTF-16BE in XML header characters
              test = new String(buffer6, StandardCharsets.UTF_16BE);
              if (test.startsWith(FIRST_XML_CHARS)) {
                encoding = "UTF-16BE";
              }
            }
          }
        }
      }
      if (encoding == null) {
        // last resort: check 1st non-space XML character - '<' - first for UTF-8, then for UTF-16
        if (firstNonBlankLineStartsWithTag(xmlFile, StandardCharsets.UTF_8)) {
          encoding = "UTF-8";
        } else if (firstNonBlankLineStartsWithTag(xmlFile, StandardCharsets.UTF_16)) {
          encoding = "UTF-16";
        }
      }
    } catch (IOException exc) {
      throw exc;
    } catch (Throwable err) {
      // callers only expect IOException; keep the original failure as the cause
      throw new IOException(err.toString(), err);
    }
    return encoding;
  } // detectXmlFileEncoding()

  /**
   * Reads the given file with the given charset and checks whether its first non-blank line
   * (after trimming) starts with {@code '<'}.
   *
   * @param xmlFile
   *          the file to read
   * @param charset
   *          the charset used to decode the file
   * @return {@code true} if the first non-blank line starts with {@code '<'}; {@code false} if it
   *         starts with another character, if the file has no non-blank line, or if a
   *         {@link CharConversionException} occurs while reading
   * @throws IOException
   *           if the file cannot be opened, read or closed
   */
  private static boolean firstNonBlankLineStartsWithTag(File xmlFile,
          java.nio.charset.Charset charset) throws IOException {
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(xmlFile), charset))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String xmlLine = line.trim();
        if (xmlLine.length() > 0) {
          // only the 1st non-blank line decides
          return xmlLine.charAt(0) == '<';
        }
      }
    } catch (CharConversionException ignored) {
      // the content cannot be decoded with this charset - treat it as "not this encoding"
    }
    return false;
  }