public static String getCharset()

in core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java [91:126]


    /**
     * Determines the name of the charset to use for a document by consulting several sources
     * in decreasing order of precedence.
     *
     * <ol>
     *   <li>The result of {@code getCharsetFromBOM(content)}, if non-null.
     *   <li>The value reported by both {@code getCharsetFromHTTP(metadata)} and {@code
     *       getCharsetFromMeta(content, maxLengthCharsetDetection)}, when both are non-null
     *       and equal ignoring case.
     *   <li>The result of {@code getCharsetFromText(...)}, which receives a hint only when
     *       exactly one of the two declared values above is available; when both are
     *       missing, or both are present but disagree, no hint is passed.
     *   <li>The name of {@code DEFAULT_CHARSET}.
     * </ol>
     *
     * @param metadata metadata handed to {@code getCharsetFromHTTP}
     * @param content raw bytes of the document
     * @param maxLengthCharsetDetection limit forwarded unchanged to {@code getCharsetFromMeta}
     *     and {@code getCharsetFromText}
     * @return the first charset name found following the order above; falls back to {@code
     *     DEFAULT_CHARSET.name()}
     */
    public static String getCharset(
            Metadata metadata, byte[] content, int maxLengthCharsetDetection) {

        // a byte order mark, when one is found, wins over every other source
        final String fromBom = getCharsetFromBOM(content);
        if (fromBom != null) {
            return fromBom;
        }

        // charsets declared by the HTTP headers and by the HTML content
        final String declaredByHttp = getCharsetFromHTTP(metadata);
        final String declaredByHtml = getCharsetFromMeta(content, maxLengthCharsetDetection);

        final boolean hasHttp = declaredByHttp != null;
        final boolean hasHtml = declaredByHtml != null;

        // two declarations that agree are trusted as they are
        if (hasHttp && hasHtml && declaredByHttp.equalsIgnoreCase(declaredByHtml)) {
            return declaredByHttp;
        }

        // a hint is given to the text-based detection only when exactly one declaration
        // exists; none or two conflicting ones means the detection runs without a hint
        final String hint;
        if (hasHttp == hasHtml) {
            hint = null;
        } else {
            hint = hasHttp ? declaredByHttp : declaredByHtml;
        }

        final String guessedFromText =
                getCharsetFromText(content, hint, maxLengthCharsetDetection);

        // nothing could be determined: use the default charset
        return guessedFromText != null ? guessedFromText : DEFAULT_CHARSET.name();
    }