in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java [275:396]
public static MediaType detect(Set<String> anyCaseNames, DirectoryEntry root) {
    // Picks the most specific media type for an OLE2 container from the names of
    // its entries and, where needed, from the CompObj data reachable via the root.
    //
    //   anyCaseNames - entry names in any case; they are upper-cased before matching.
    //                  null or empty yields OLE.
    //   root         - root directory entry of the container; may be null ("legacy
    //                  mode"), in which case checks that need it are skipped.
    //   returns      - the first matching type in the order below, or OLE when
    //                  nothing more specific matches.
    if (anyCaseNames == null || anyCaseNames.isEmpty()) {
        return OLE;
    }
    Set<String> ucNames = upperCase(anyCaseNames);

    MediaType encrypted = checkEncrypted(ucNames, root);
    if (encrypted != null) {
        return encrypted;
    }

    // This check has to run before the workbook entry names are tested:
    // Works 7.0 spreadsheet files contain both, and we want to avoid
    // classifying them as Excel.
    if (ucNames.contains(WKS_SSWORK_BOOK)) {
        return XLR;
    }

    for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
        if (ucNames.contains(workbookEntryName)) {
            // The CompObj can only be consulted when the root is available;
            // without it the workbook is reported as plain Excel.
            if (root != null && MS_GRAPH_CHART.equals(processCompObjFormatType(root))) {
                return MS_GRAPH_CHART;
            }
            return XLS;
        }
    }

    if (ucNames.contains(SW_DOC_CONTENT_MGR) && ucNames.contains(SW_DOC_MGR_TEMP_STORAGE)) {
        return SLDWORKS;
    }
    if (ucNames.contains(STAR_CALC_DOCUMENT)) {
        // Star Office Calc
        return SDC;
    }
    if (ucNames.contains(STAR_WRITER_DOCUMENT)) {
        return SDW;
    }
    if (ucNames.contains(STAR_DRAW_DOCUMENT_3)) {
        /*
         * This is either StarOfficeDraw or StarOfficeImpress; the CompObj has
         * to be consulted to tell them apart. When called in "legacy mode",
         * without the root, just return x-tika-msoffice so that callers of
         * the old one-argument API keep getting the old result.
         */
        return root == null ? OLE : processCompObjFormatType(root);
    }
    if (ucNames.contains("BOOK")) {
        // Excel 95 or older, we won't be able to parse this....
        // NOTE(review): redundant if WORKBOOK_DIR_ENTRY_NAMES already lists "BOOK" -- confirm.
        return XLS;
    }
    if (ucNames.contains(WORD_DOCUMENT)) {
        return DOC;
    }
    if (ucNames.contains(QUILL)) {
        return PUB;
    }
    if (ucNames.contains(POWERPOINT_DOCUMENT)) {
        return PPT;
    }
    if (ucNames.contains(VISIO_DOCUMENT)) {
        return VSD;
    }
    if (ucNames.contains(OLE10_NATIVE_STRING)) {
        return OLE10_NATIVE;
    }
    if (ucNames.contains(MAT_OST)) {
        // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
        return WPS;
    }

    boolean hasContents = ucNames.contains(CONTENTS);
    if (hasContents && ucNames.contains(SPELLING)) {
        // Newer Works files
        return WPS;
    }
    if (ucNames.contains(EQUATION_NATIVE)) {
        return MS_EQUATION;
    }
    if (ucNames.contains(OCX_NAME)) {
        //active x control should be parsed as OLE, not COMP_OBJ -- TIKA-4091
        //TODO -- create a mime for active x
        return OLE;
    }
    if (hasContents && ucNames.contains(OBJ_INFO)) {
        return COMP_OBJ;
    }
    if (hasContents && ucNames.contains(COMP_OBJ_STRING)) {
        // CompObj is a general kind of OLE2 embedding, but this may be an old
        // Works file. If we have the directory, check; otherwise (or when the
        // check does not say Works) assume a general CompObj embedded resource.
        if (root != null && WPS.equals(processCompObjFormatType(root))) {
            return WPS;
        }
        return COMP_OBJ;
    }
    if (hasContents) {
        // CONTENTS without SPELLING nor CompObj normally means some sort
        // of embedded non-office file inside an OLE2 document
        // This is most commonly triggered on nested directories
        return OLE;
    }

    if (ucNames.contains(COMP_OBJ_STRING) &&
            (ucNames.contains(PROPS) || ucNames.contains(PROPS_9) ||
                    ucNames.contains(PROPS_12))) {
        // Could be Project, look for common name patterns
        for (String name : ucNames) {
            if (mppDataMatch.matcher(name).matches()) {
                return MPP;
            }
        }
        // Not Project. As before, the checks further down are not attempted
        // for containers that matched this CompObj + Props combination.
        return OLE;
    }
    if (ucNames.contains(LAYER)) {
        //in one test file, also saw LayerSmallImage and LayerLargeImage
        //maybe add those if we get false positives?
        //in other test files there was a single entry for "Layer"
        return ESRI_LAYER;
    }
    if (ucNames.contains(DGN_MF) && ucNames.contains(DGN_S) && ucNames.contains(DGN_H)) {
        return DGN_8;
    }
    for (String name : ucNames) {
        if (name.startsWith(SUBSTG_1)) {
            return MSG;
        }
    }

    // Couldn't detect a more specific type
    return OLE;
}