in core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java [91:126]
public static String getCharset(
        Metadata metadata, byte[] content, int maxLengthCharsetDetection) {
    // Resolution order, first non-null result wins:
    //   1. the charset reported by getCharsetFromBOM for the content
    //   2. the charset declared by both the HTTP headers and the HTML content,
    //      when the two declarations match (case-insensitively)
    //   3. the charset reported by getCharsetFromText, optionally hinted
    //   4. the name of DEFAULT_CHARSET

    // 1. a byte order mark takes precedence over everything else
    final String fromBom = getCharsetFromBOM(content);
    if (fromBom != null) {
        return fromBom;
    }

    // 2. collect both declarations before comparing them
    final String fromHttp = getCharsetFromHTTP(metadata);
    final String fromMeta = getCharsetFromMeta(content, maxLengthCharsetDetection);
    final boolean hasHttp = fromHttp != null;
    final boolean hasMeta = fromMeta != null;

    if (hasHttp && hasMeta && fromHttp.equalsIgnoreCase(fromMeta)) {
        return fromHttp;
    }

    // 3. only a lone declaration is passed on as a hint; if both are present
    //    at this point they disagree, and if neither is present there is
    //    nothing to pass, so in both cases the hint stays null
    final String hint;
    if (hasHttp == hasMeta) {
        hint = null;
    } else {
        hint = hasHttp ? fromHttp : fromMeta;
    }

    final String fromText = getCharsetFromText(content, hint, maxLengthCharsetDetection);

    // 4. fall back to the default charset when the text lookup yields nothing
    return fromText != null ? fromText : DEFAULT_CHARSET.name();
}