public static MediaType detect()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java [275:396]


    /**
     * Picks a media type for an OLE2 container from the names of its
     * top-level entries, consulting the CompObj stream of {@code root} in the
     * few cases where the entry names alone are ambiguous.
     * <p>
     * The checks are order-sensitive: the first matching rule wins, and
     * several later rules are only meaningful because earlier, more specific
     * rules have already been ruled out.
     *
     * @param anyCaseNames names of the container's entries, in any case; they
     *                     are normalized through {@code upperCase(...)} before
     *                     being compared. May be {@code null} or empty, in
     *                     which case {@code OLE} is returned.
     * @param root         the container's root directory, or {@code null}
     *                     when the caller has only the entry names ("legacy
     *                     mode"); with {@code null} the CompObj-based
     *                     refinements are skipped
     * @return the detected type, or {@code OLE} when nothing more specific
     *         could be determined
     */
    public static MediaType detect(Set<String> anyCaseNames, DirectoryEntry root) {
        if (anyCaseNames == null || anyCaseNames.isEmpty()) {
            return OLE;
        }

        Set<String> ucNames = upperCase(anyCaseNames);
        MediaType mediaType = checkEncrypted(ucNames, root);
        if (mediaType != null) {
            return mediaType;
        }

        for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
            if (ucNames.contains(workbookEntryName)) {
                if (ucNames.contains(WKS_SSWORK_BOOK)) {
                    // Works 7.0 spreadsheet files contain both a workbook entry
                    // and the Works-specific one; the Works check has to win
                    // here so that they are not classified as Excel.
                    return XLR;
                }
                // Only consult the CompObj when we actually have the directory;
                // constant-first equals also tolerates a null helper result.
                if (root != null && MS_GRAPH_CHART.equals(processCompObjFormatType(root))) {
                    return MS_GRAPH_CHART;
                }
                return XLS;
            }
        }
        if (ucNames.contains(SW_DOC_CONTENT_MGR) && ucNames.contains(SW_DOC_MGR_TEMP_STORAGE)) {
            return SLDWORKS;
        } else if (ucNames.contains(STAR_CALC_DOCUMENT)) {
            // Star Office Calc
            return SDC;
        } else if (ucNames.contains(STAR_WRITER_DOCUMENT)) {
            return SDW;
        } else if (ucNames.contains(STAR_DRAW_DOCUMENT_3)) {
            if (root == null) {
                /*
                 * This is either StarOfficeDraw or StarOfficeImpress, we have
                 * to consult the CompObj to distinguish them, if this method is
                 * called in "legacy mode", without the root, just return
                 * x-tika-msoffice. The one-argument method is only for backward
                 * compatibility, if someone calls old API he/she can get the
                 * old result.
                 */
                return OLE;
            } else {
                return processCompObjFormatType(root);
            }
        } else if (ucNames.contains(WKS_SSWORK_BOOK)) {
            // Works spreadsheet without any workbook entry (the case where
            // both are present is handled in the workbook loop above)
            return XLR;
        } else if (ucNames.contains("BOOK")) {
            // Excel 95 or older, we won't be able to parse this....
            return XLS;
        } else if (ucNames.contains(WORD_DOCUMENT)) {
            return DOC;
        } else if (ucNames.contains(QUILL)) {
            return PUB;
        } else if (ucNames.contains(POWERPOINT_DOCUMENT)) {
            return PPT;
        } else if (ucNames.contains(VISIO_DOCUMENT)) {
            return VSD;
        } else if (ucNames.contains(OLE10_NATIVE_STRING)) {
            return OLE10_NATIVE;
        } else if (ucNames.contains(MAT_OST)) {
            // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
            return WPS;
        } else if (ucNames.contains(CONTENTS) && ucNames.contains(SPELLING)) {
            // Newer Works files
            return WPS;
        } else if (ucNames.contains(EQUATION_NATIVE)) {
            return MS_EQUATION;
        } else if (ucNames.contains(OCX_NAME)) {
            //active x control should be parsed as OLE, not COMP_OBJ -- TIKA-4091
            //TODO -- create a mime for active x
            return OLE;
        } else if (ucNames.contains(CONTENTS) && ucNames.contains(OBJ_INFO)) {
            return COMP_OBJ;
        } else if (ucNames.contains(CONTENTS) && ucNames.contains(COMP_OBJ_STRING)) {
            // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
            // If we have the Directory, check
            if (root != null && WPS.equals(processCompObjFormatType(root))) {
                return WPS;
            }
            // Assume it's a general CompObj embedded resource
            return COMP_OBJ;
        } else if (ucNames.contains(CONTENTS)) {
            // CONTENTS without SPELLING nor CompObj normally means some sort
            //  of embedded non-office file inside an OLE2 document
            // This is most commonly triggered on nested directories
            return OLE;
        } else if (ucNames.contains(COMP_OBJ_STRING) &&
                (ucNames.contains(PROPS) || ucNames.contains(PROPS_9) ||
                        ucNames.contains(PROPS_12))) {
            // Could be Project, look for common name patterns
            for (String name : ucNames) {
                if (mppDataMatch.matcher(name).matches()) {
                    return MPP;
                }
            }
            // No Project-style entry found: control leaves the chain here and
            // the generic OLE result below is returned; the remaining checks
            // in this chain are not consulted for such containers.
        } else if (ucNames.contains(LAYER)) {
            //in one test file, also saw LayerSmallImage and LayerLargeImage
            //maybe add those if we get false positives?
            //in other test files there was a single entry for "Layer"
            return ESRI_LAYER;
        } else if (ucNames.contains(DGN_MF) && ucNames.contains(DGN_S) &&
                ucNames.contains(DGN_H)) {
            return DGN_8;
        } else {
            for (String name : ucNames) {
                if (name.startsWith(SUBSTG_1)) {
                    return MSG;
                }
            }
        }


        // Couldn't detect a more specific type
        return OLE;
    }