in core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java [174:222]
/**
 * Looks for a charset declared in the HTML {@code <meta>} tags of a document.
 *
 * <p>The first {@code maxlength} bytes of {@code buffer} (or the whole buffer when
 * {@code maxlength} is not positive or exceeds the buffer length) are decoded with
 * {@code DEFAULT_CHARSET}. A plain string search for {@code <meta charset="} is tried first;
 * only when that literal is absent is the text parsed as HTML and its
 * {@code meta[http-equiv=content-type]} and {@code meta[charset]} elements inspected.
 *
 * <p>If the literal is found but its closing quote lies beyond the inspected window, the window
 * is enlarged (up to the full buffer) and the search repeated.
 *
 * @param buffer raw bytes of the document; {@code null} or empty yields {@code null}
 * @param maxlength maximum number of leading bytes to inspect; a non-positive value means the
 *     whole buffer
 * @return the charset name found, as returned by {@code validateCharset} or
 *     {@code getCharsetFromContentType}, or {@code null} when no usable declaration exists, the
 *     {@code <meta charset="} attribute is never closed, or parsing fails
 */
private static String getCharsetFromMeta(byte[] buffer, int maxlength) {
    if (buffer == null || buffer.length == 0) {
        return null;
    }

    final String metaPrefix = "<meta charset=\"";

    int len = buffer.length;
    if (maxlength > 0 && maxlength < len) {
        len = maxlength;
    }

    String html;
    while (true) {
        // decode with DEFAULT_CHARSET (presumably UTF-8) -- which hopefully will not mess
        // up the characters we're interested in...
        html = new String(buffer, 0, len, DEFAULT_CHARSET);

        // fast search for e.g. <meta charset="utf-8">
        // might not get it 100% but should be frequent enough
        // and faster than parsing
        int start = html.indexOf(metaPrefix);
        if (start == -1) {
            // no literal match: fall through to the HTML parser below
            break;
        }

        int valueStart = start + metaPrefix.length();
        int end = html.indexOf('"', valueStart);
        if (end != -1) {
            return validateCharset(html.substring(valueStart, end));
        }

        if (len >= buffer.length) {
            // the whole buffer was searched: there is an open meta tag
            // but it is not closed = we have broken content!
            return null;
        }

        // https://github.com/apache/incubator-stormcrawler/issues/870
        // the attribute value was cut off by the length limit: retry on a larger section.
        // The window is doubled (capped at the buffer length) so that the repeated
        // decoding stays linear in the buffer size and no recursion is needed.
        len = (int) Math.min(2L * len, buffer.length);
    }

    try {
        Document doc = Parser.htmlParser().parseInput(html, "dummy");

        // look for <meta http-equiv="Content-Type"
        // content="text/html;charset=gb2312"> or HTML5 <meta
        // charset="gb2312">
        Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
        for (Element meta : metaElements) {
            String foundCharset = null;
            if (meta.hasAttr("http-equiv")) {
                foundCharset = getCharsetFromContentType(meta.attr("content"));
            }
            if (foundCharset == null && meta.hasAttr("charset")) {
                // validated like the fast path above, so that an empty or otherwise
                // unusable attribute value is not handed back to the caller as-is
                foundCharset = validateCharset(meta.attr("charset"));
            }
            if (foundCharset != null) {
                return foundCharset;
            }
        }
    } catch (Exception ignored) {
        // best effort: content that cannot be parsed simply has no detectable meta charset
    }

    return null;
}