in johnzon-core/src/main/java/org/apache/johnzon/core/RFC4627AwareInputStreamReader.java [86:155]
/**
 * Detects the charset of a JSON stream by inspecting the leading bytes returned by
 * {@code readAllBytes(inputStream)}, then pushes the inspected bytes back onto the stream.
 *
 * <p>Detection order:
 * <ol>
 *   <li>no bytes: UTF-8, nothing is pushed back;</li>
 *   <li>a single byte: UTF-8 (a lone {@code 0x00} is rejected);</li>
 *   <li>zero-byte patterns of BOM-less UTF-16/UTF-32 ({@code 00 00}, {@code 00 xx},
 *       {@code xx 00 00 00}, {@code xx 00 ...});</li>
 *   <li>byte order marks for UTF-16BE, UTF-16LE, UTF-32LE and UTF-8;</li>
 *   <li>otherwise UTF-8.</li>
 * </ol>
 *
 * <p>A 2- or 3-byte BOM is dropped (only the bytes after it are pushed back). In every other
 * case, including the 4-byte UTF-32LE BOM, all inspected bytes are pushed back unchanged.
 *
 * @param inputStream stream to inspect; the inspected bytes (minus a short BOM) are unread into it
 * @return the detected charset, UTF-8 when nothing more specific is recognised
 * @throws JsonException if the input is a single {@code 0x00} byte, or if reading from or
 *                       pushing back into the stream fails with an {@link IOException}
 */
private static Charset getCharset(final PushbackInputStream inputStream) {
    Charset charset = StandardCharsets.UTF_8;
    int bomLength = 0;
    try {
        final byte[] utfBytes = readAllBytes(inputStream);
        if (utfBytes.length == 0) {
            return StandardCharsets.UTF_8; // empty input -> the charset does not matter
        }
        if (utfBytes.length == 1) {
            if (utfBytes[0] == 0) { // TCK shortcut since behavior is dubious
                throw new JsonException("Unknown encoding");
            }
            inputStream.unread(utfBytes);
            return StandardCharsets.UTF_8; // almost empty input -> the charset does not matter either
        }

        final int first = utfBytes[0] & 0xFF;
        final int second = utfBytes[1] & 0xFF;
        if (first == 0x00) {
            // 00 00 .. -> UTF-32BE (also matches its BOM 00 00 FE FF), 00 xx -> UTF-16BE
            charset = (second == 0x00) ? Charset.forName("UTF-32BE") : Charset.forName("UTF-16BE");
        } else if (utfBytes.length > 2 && second == 0x00) {
            final int third = utfBytes[2] & 0xFF;
            // UTF-32LE needs xx 00 00 00. If a fourth byte is available and is non-zero, the
            // lead xx 00 00 yy would be a code point above U+10FFFF in UTF-32LE, so the input
            // is really UTF-16LE whose second code unit has a zero low byte (e.g. U+4E00).
            final boolean fourthZeroOrAbsent = utfBytes.length < 4 || (utfBytes[3] & 0xFF) == 0x00;
            charset = (third == 0x00 && fourthZeroOrAbsent)
                    ? Charset.forName("UTF-32LE")
                    : Charset.forName("UTF-16LE");
        } else {
            /* check BOM
               Encoding      hex byte order mark
               UTF-8         EF BB BF
               UTF-16 (BE)   FE FF
               UTF-16 (LE)   FF FE
               UTF-32 (BE)   00 00 FE FF
               UTF-32 (LE)   FF FE 00 00
            */
            // UTF-32BE is not checked here: it is already covered by the first == 0x00 branch
            // above, where nothing is stripped before the bytes are unread.
            if (first == 0xFE && second == 0xFF) {
                charset = Charset.forName("UTF-16BE");
                bomLength = 2;
            } else if (first == 0xFF && second == 0xFE) {
                if (utfBytes.length > 3 && (utfBytes[2] & 0xFF) == 0x00 && (utfBytes[3] & 0xFF) == 0x00) {
                    charset = Charset.forName("UTF-32LE");
                    bomLength = 4;
                } else {
                    charset = Charset.forName("UTF-16LE");
                    bomLength = 2;
                }
            } else if (utfBytes.length > 2 && first == 0xEF && second == 0xBB && (utfBytes[2] & 0xFF) == 0xBF) {
                // UTF-8 with BOM; charset keeps its UTF-8 default
                bomLength = 3;
            }
        }

        if (bomLength > 0 && bomLength < 4) {
            // short BOM: do not unread it, only the bytes that follow
            inputStream.unread(utfBytes, bomLength, utfBytes.length - bomLength);
        } else {
            // no BOM, or the 4-byte UTF-32LE BOM: unread everything that was read.
            // NOTE(review): presumably the UTF-32LE decoder consumes its own BOM - confirm.
            inputStream.unread(utfBytes);
        }
    } catch (final IOException e) {
        throw new JsonException("Unable to detect charset due to " + e.getMessage(), e);
    }
    return charset;
}