private static String getCharsetFromMeta()

in core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java [174:222]


    /**
     * Looks for a charset declaration in the HTML {@code <meta>} elements found at the beginning
     * of a document.
     *
     * <p>The leading bytes are decoded with {@code DEFAULT_CHARSET}. A plain text search for
     * {@code <meta charset="} is tried first because it is cheaper than parsing; if the closing
     * quote of the value was cut off by {@code maxlength}, the inspected window is enlarged until
     * the quote is found or the buffer is exhausted. When the text search finds nothing, the
     * decoded text is parsed as HTML and {@code meta[http-equiv=content-type]} and {@code
     * meta[charset]} elements are examined in document order.
     *
     * @param buffer raw bytes of the document
     * @param maxlength maximum number of leading bytes to inspect; a value that is not positive,
     *     or that exceeds the buffer size, means the whole buffer is inspected
     * @return the first usable charset found (as returned by {@code validateCharset} or {@code
     *     getCharsetFromContentType}), or {@code null} if there is none, if the {@code charset}
     *     attribute value is never closed, or if parsing fails
     */
    private static String getCharsetFromMeta(byte buffer[], int maxlength) {
        final String metaPrefix = "<meta charset=\"";

        // convert to UTF-8 String -- which hopefully will not mess up the
        // characters we're interested in...
        int len = buffer.length;
        if (maxlength > 0 && maxlength < len) {
            len = maxlength;
        }
        String html = new String(buffer, 0, len, DEFAULT_CHARSET);

        // fast search for e.g. <meta charset="utf-8">
        // might not get it 100% but should be frequent enough
        // and faster than parsing
        int start = html.indexOf(metaPrefix);
        if (start != -1) {
            final int valueStart = start + metaPrefix.length();
            int end = html.indexOf('"', valueStart);
            // https://github.com/apache/incubator-stormcrawler/issues/870
            // The value may have been cut by the length limit: enlarge the window and look
            // again. This is done iteratively, growing by at least 10 bytes and otherwise
            // doubling, and is capped at the buffer size so that the last bytes are inspected
            // too and the total decoding work stays linear in the buffer size.
            while (end == -1 && len < buffer.length) {
                long grown = Math.max(len + 10L, len * 2L);
                len = (int) Math.min(grown, (long) buffer.length);
                html = new String(buffer, 0, len, DEFAULT_CHARSET);
                // valueStart is still valid: the already decoded prefix is unchanged
                end = html.indexOf('"', valueStart);
            }
            if (end == -1) {
                // there is an open tag meta but not closed = we have broken content!
                return null;
            }
            return validateCharset(html.substring(valueStart, end));
        }

        try {
            Document doc = Parser.htmlParser().parseInput(html, "dummy");

            // look for <meta http-equiv="Content-Type"
            // content="text/html;charset=gb2312"> or HTML5 <meta
            // charset="gb2312">
            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
            for (Element meta : metaElements) {
                String foundCharset = null;
                if (meta.hasAttr("http-equiv")) {
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                }
                if (foundCharset == null && meta.hasAttr("charset")) {
                    // validated like the fast path above, so that an attribute without a
                    // usable value is not handed back to the caller as a charset name
                    foundCharset = validateCharset(meta.attr("charset"));
                }
                if (foundCharset != null) {
                    return foundCharset;
                }
            }
        } catch (Exception ignored) {
            // best effort: content that cannot be parsed simply yields no charset
        }

        return null;
    }