in core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java [34:60]
/**
 * Reads the value stored under the {@code parse.Content-Type} metadata key for {@code url},
 * reduces it to a coarse format label and stores that label under the {@code format} key of
 * the same metadata.
 *
 * @param url key used to look up the parse data whose metadata is read and updated
 * @param content not used by this filter
 * @param doc not used by this filter
 * @param parse parse result holding the metadata for {@code url}
 */
public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {
    Metadata m = parse.get(url).getMetadata();
    m.setValue("format", toFormatLabel(m.getFirstValue("parse.Content-Type")));
}

/**
 * Maps a raw content type to one of the labels {@code unknown}, {@code html}, {@code pdf},
 * {@code word}, {@code excel}, {@code powerpoint}, {@code video}, {@code image},
 * {@code audio} or {@code other}. Matching is case-insensitive and the first rule that
 * matches wins, so the order of the checks below is significant.
 *
 * @param contentType raw content type, may be {@code null} or blank
 * @return the label; {@code unknown} when the input is blank, {@code other} when no rule
 *     matches
 */
private static String toFormatLabel(String contentType) {
    if (StringUtils.isBlank(contentType)) {
        return "unknown";
    }
    // Lower-case once, locale-independently, instead of once per comparison.
    String ct = contentType.toLowerCase(Locale.ROOT);
    if (ct.contains("html")) {
        return "html";
    }
    if (ct.contains("pdf")) {
        return "pdf";
    }
    // Also covers the OOXML type "...officedocument.wordprocessingml.document".
    if (ct.contains("word")) {
        return "word";
    }
    // The OOXML spreadsheet type ("...officedocument.spreadsheetml.sheet") does not contain
    // "excel", so it needs its own match to avoid being labelled "other".
    if (ct.contains("excel") || ct.contains("spreadsheetml")) {
        return "excel";
    }
    // Same for OOXML presentations ("...officedocument.presentationml.presentation").
    if (ct.contains("powerpoint") || ct.contains("presentationml")) {
        return "powerpoint";
    }
    if (ct.startsWith("video/")) {
        return "video";
    }
    if (ct.startsWith("image/")) {
        return "image";
    }
    if (ct.startsWith("audio/")) {
        return "audio";
    }
    return "other";
}