in uimaj-core/src/main/java/org/apache/uima/pear/util/XMLUtil.java [156:335]
public static String detectXmlFileEncoding(File xmlFile) throws IOException {
  // Detects the character encoding of the given XML file.
  //
  // Steps:
  // 1. Return null if isValidXmlFile(xmlFile) rejects the file.
  // 2. Skip over bytes that can belong to a BOM/signature (EF BB BF FE FF 00) and ask
  // FileUtil.identifyUtfSignature(...) to name the signature, if any.
  // 3. Decode the first header bytes and compare them with FIRST_XML_CHARS to decide between
  // "UTF-8", "UTF-16", "UTF-16LE" and "UTF-16BE" (UTF-32 is recognized but not supported).
  // 4. If that gives no answer, fall back to checking whether the first non-blank line starts
  // with '<' when the file is read as UTF-8 and then as UTF-16.
  //
  // Returns the detected encoding name, or null if none could be determined.
  // Throws IOException on I/O failure; any other Throwable is wrapped in an IOException.
  String encoding = null;
  try {
    // first, make sure - this is a valid XML file
    if (!isValidXmlFile(xmlFile)) {
      return null;
    }
    try (FileInputStream iStream = new FileInputStream(xmlFile)) {
      // read prefix - possible BOM or signature
      int byteCounter = 0;
      int nextByte;
      int[] prefix = new int[16];
      do {
        nextByte = iStream.read();
        // store as possible UTF signature or BOM
        if (byteCounter < 16) {
          prefix[byteCounter] = nextByte;
        }
        byteCounter++;
        if (nextByte < 0) {
          throw new IOException("cannot read file");
        }
      } while (nextByte == 0xEF || nextByte == 0xBB || nextByte == 0xBF || nextByte == 0xFE
              || nextByte == 0xFF || nextByte == 0x00);
      // number of skipped bytes stored in 'prefix' (the terminating text byte is excluded)
      int prefixLength = byteCounter < 17 ? byteCounter - 1 : 16;
      String utfSignature = (prefixLength > 0)
              ? FileUtil.identifyUtfSignature(prefix, prefixLength)
              : null;
      boolean utf8Signature = utfSignature != null && utfSignature.startsWith("UTF-8");
      boolean utf16Signature = utfSignature != null && utfSignature.startsWith("UTF-16");
      boolean utf32Signature = utfSignature != null && utfSignature.startsWith("UTF-32");

      // bytes2put = number of text bytes to examine (7 characters, as in "<?xml?>");
      // keptPrefix = number of skipped bytes that stay in front of them in the buffer
      int bytes2put;
      int keptPrefix;
      if (utf16Signature) {
        // UTF-16 - signature bytes + 7x2 bytes
        bytes2put = 7 * 2;
        keptPrefix = prefixLength;
      } else if (utf32Signature) {
        // UTF-32 - signature bytes + 7x4 bytes
        bytes2put = 7 * 4;
        keptPrefix = prefixLength;
      } else if (utf8Signature) {
        // UTF-8 signature - only text characters, the BOM is dropped
        bytes2put = 7;
        keptPrefix = 0;
      } else {
        // No recognized signature: the scan above may still have consumed leading 0x00 bytes
        // that are really the high-order half of the first UTF-16BE character. Keep them so
        // the byte pairs stay aligned; dropping them makes "00 3C 00 3F" look like UTF-16LE.
        bytes2put = 7;
        keptPrefix = prefixLength;
      }
      byte[] buffer = new byte[keptPrefix + bytes2put];
      for (int i = 0; i < keptPrefix; i++) {
        buffer[i] = (byte) prefix[i];
      }
      // store the 1st text byte and read the remaining header bytes
      buffer[keptPrefix] = (byte) nextByte;
      int filled = 1;
      // "read(...)" is not obliged to return all the bytes at once,
      // so keep reading until the buffer is full or the stream ends
      while (filled < bytes2put) {
        int bytesRead = iStream.read(buffer, keptPrefix + filled, bytes2put - filled);
        if (bytesRead == -1) {
          break;
        }
        filled += bytesRead;
      }
      // A file shorter than the header buffer (e.g. "<a/>") cannot be judged here; it is not
      // an error, so leave 'encoding' unset and let the last-resort check below decide.
      if (filled == bytes2put) {
        // check first XML header characters - '<?'
        // some Javas won't properly decode an odd number of bytes for utf16 coding
        // https://issues.apache.org/jira/browse/UIMA-2099
        byte[] buffer6 = new byte[6];
        System.arraycopy(buffer, 0, buffer6, 0, 6);
        if (utf8Signature) {
          // check for UTF-8
          String test = new String(buffer, StandardCharsets.UTF_8);
          if (test.startsWith(FIRST_XML_CHARS)) {
            encoding = "UTF-8";
          }
        } else if (utf16Signature) {
          // check for UTF-16 (buffer6 starts with the signature bytes)
          String test = new String(buffer6, StandardCharsets.UTF_16);
          if (test.startsWith(FIRST_XML_CHARS)) {
            encoding = "UTF-16";
          }
        } else if (utf32Signature) {
          // we don't support this
        } else {
          // no signature - check for UTF-8 in XML header characters
          String test = new String(buffer, StandardCharsets.UTF_8);
          if (test.startsWith(FIRST_XML_CHARS)) {
            encoding = "UTF-8";
          } else {
            // next, check for UTF-16LE in XML header characters
            test = new String(buffer6, StandardCharsets.UTF_16LE);
            if (test.startsWith(FIRST_XML_CHARS)) {
              encoding = "UTF-16LE";
            } else {
              // next, check for UTF-16BE in XML header characters
              test = new String(buffer6, StandardCharsets.UTF_16BE);
              if (test.startsWith(FIRST_XML_CHARS)) {
                encoding = "UTF-16BE";
              }
            }
          }
        }
      }
    }
    if (encoding == null) {
      // last resort: check 1st non-space XML character - '<' - first as UTF-8, then as UTF-16
      if (firstNonBlankLineStartsWithTag(xmlFile, StandardCharsets.UTF_8)) {
        encoding = "UTF-8";
      } else if (firstNonBlankLineStartsWithTag(xmlFile, StandardCharsets.UTF_16)) {
        encoding = "UTF-16";
      }
    }
  } catch (IOException exc) {
    throw exc;
  } catch (Throwable err) {
    // keep the original failure as the cause so its stack trace is not lost
    throw new IOException(err.toString(), err);
  }
  return encoding;
} // detectXmlFileEncoding()

/**
 * Reads the given file with the given charset and reports whether its first non-blank line
 * starts with the '&lt;' character. A byte order mark decoded as U+FEFF at the very start of
 * the file is ignored.
 * <p>
 * NOTE(review): {@code String.trim()} also removes NUL characters, so a UTF-16 file read as
 * UTF-8 can still appear to start with '&lt;' here - confirm whether that matters to callers.
 *
 * @param xmlFile
 *          The file to read.
 * @param charset
 *          The charset used for decoding the file.
 * @return <code>true</code>, if the first non-blank line starts with '&lt;',
 *         <code>false</code> otherwise (including when the file has no non-blank line or a
 *         {@link CharConversionException} occurs while reading).
 * @throws IOException
 *           If any other I/O error occurs.
 */
private static boolean firstNonBlankLineStartsWithTag(File xmlFile,
        java.nio.charset.Charset charset) throws IOException {
  try (BufferedReader reader = new BufferedReader(
          new InputStreamReader(new FileInputStream(xmlFile), charset))) {
    boolean firstLine = true;
    String line;
    while ((line = reader.readLine()) != null) {
      if (firstLine) {
        firstLine = false;
        // the UTF-8 decoder passes the BOM through as U+FEFF, which trim() does not remove
        if (!line.isEmpty() && line.charAt(0) == '\uFEFF') {
          line = line.substring(1);
        }
      }
      String xmlLine = line.trim();
      if (xmlLine.length() > 0) {
        return xmlLine.charAt(0) == '<';
      }
    }
    return false;
  } catch (CharConversionException ignored) {
    // the file is not readable in this charset - treat as "no match"
    return false;
  }
}