public void filter()

in core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java [34:60]


    /**
     * Collapses the parsed content type of a document into a coarse label and stores it under
     * the {@code format} metadata key.
     *
     * <p>The first value of {@code parse.Content-Type} is read from the metadata of the parse
     * entry for {@code url}. A blank value yields {@code unknown}. Otherwise the value is
     * lower-cased and checked, in order, for the substrings {@code html}, {@code pdf},
     * {@code word}, {@code excel} and {@code powerpoint}, then for the prefixes {@code video/},
     * {@code image/} and {@code audio/}; the first hit becomes the label. Anything else is
     * labelled {@code other}.
     *
     * @param url key used to look up the parse entry whose metadata is read and updated
     * @param content not used by this filter
     * @param doc not used by this filter
     * @param parse parse result holding the metadata for {@code url}
     */
    public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {
        final Metadata metadata = parse.get(url).getMetadata();
        final String rawType = metadata.getFirstValue("parse.Content-Type");

        String format = null;
        if (StringUtils.isBlank(rawType)) {
            format = "unknown";
        } else {
            // Lower-case once so every comparison below is case-insensitive.
            final String lowered = rawType.toLowerCase(Locale.ROOT);

            // Substring keywords take precedence over the media-family prefixes.
            for (String keyword : new String[] {"html", "pdf", "word", "excel", "powerpoint"}) {
                if (lowered.contains(keyword)) {
                    format = keyword;
                    break;
                }
            }

            if (format == null) {
                for (String family : new String[] {"video", "image", "audio"}) {
                    if (lowered.startsWith(family + "/")) {
                        format = family;
                        break;
                    }
                }
            }

            if (format == null) {
                format = "other";
            }
        }

        metadata.setValue("format", format);
    }