public static void parse()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java [92:272]


    /**
     * Opens the OOXML package behind {@code stream}, picks an extractor for it and
     * emits the document's metadata and XHTML content.
     * <p>
     * The package is obtained in one of three ways:
     * <ul>
     *   <li>if {@code stream} is a {@link TikaInputStream} whose open container is an
     *       {@link OPCPackageWrapper}, the wrapped package is reused;</li>
     *   <li>if it is a {@link TikaInputStream} backed by a file, the file is opened
     *       read-only; on {@link InvalidOperationException} a salvaged copy is written
     *       to a temp file and opened instead, and the resulting package is stored as
     *       the stream's open container;</li>
     *   <li>otherwise the stream is wrapped in a {@link RereadableInputStream} and
     *       opened directly; on a salvageable failure (see
     *       {@link #isSalvageableOpenFailure(IOException)}) the stream is rewound,
     *       salvaged to a temp file and that file is opened. Only in this case is the
     *       package reverted when this method exits.</li>
     * </ul>
     * Any temp file created for a salvaged copy is deleted before returning; a failed
     * delete is logged as a warning.
     * <p>
     * If the content type (taken from {@code metadata} or detected from the package) is
     * missing or listed in {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, parsing is
     * delegated to {@link EmptyParser}.
     *
     * @param stream      the document to parse
     * @param baseHandler handler that receives the extracted XHTML
     * @param metadata    metadata to read the content type from and to populate
     * @param context     parse context; must contain an {@link OfficeParserConfig}
     * @throws IOException   if reading, salvaging or opening the package fails with an
     *                       I/O error that is not salvageable
     * @throws SAXException  if a {@link RuntimeSAXException} was raised during
     *                       extraction (its cause is rethrown)
     * @throws TikaException if an {@link IllegalArgumentException},
     *                       {@link OpenXML4JException} or {@link XmlException} occurs,
     *                       or if the selected extractor has no document
     */
    public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata,
                             ParseContext context) throws IOException, SAXException, TikaException {
        Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());

        //if there's a problem opening the zip file;
        //create a tmp file, and copy what you can read of it.
        //Assigned before salvaging starts so the finally block can always clean it up.
        File tmpRepairedCopy = null;

        OPCPackage pkg = null;
        //if the pkg is in the opencontainer of a TikaInputStream, it will get closed.
        //However, if a regular inputstream has been sent in, we need to revert the pkg.
        boolean mustRevertPackage = false;
        try {
            // Locate or Open the OPCPackage for the file
            TikaInputStream tis = TikaInputStream.cast(stream);
            if (tis != null && tis.getOpenContainer() instanceof OPCPackageWrapper) {
                pkg = ((OPCPackageWrapper) tis.getOpenContainer()).getOPCPackage();
            } else if (tis != null && tis.hasFile()) {
                try {
                    pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
                } catch (InvalidOperationException e) {
                    tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile();
                    ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy);
                    pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
                }
                tis.setOpenContainer(new OPCPackageWrapper(pkg));
            } else {
                //OPCPackage slurps rris into memory so we can close rris
                //without apparent problems
                mustRevertPackage = true;
                try (RereadableInputStream rereadableInputStream = new RereadableInputStream(stream,
                        MAX_BUFFER_LENGTH, false)) {
                    try {
                        pkg = OPCPackage.open(CloseShieldInputStream.wrap(rereadableInputStream));
                    } catch (IOException e) {
                        if (!isSalvageableOpenFailure(e)) {
                            throw e;
                        }
                        rereadableInputStream.rewind();
                        tmpRepairedCopy = Files.createTempFile("tika-ooxml-repair-", "").toFile();
                        ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
                        //if there isn't enough left to be opened as a package
                        //throw an exception -- we may want to fall back to streaming
                        //parsing
                        pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
                    }
                }
            }

            if (pkg != null) {
                PackageRelationshipCollection prc =
                        pkg.getRelationshipsByType(OOXMLParser.SIGNATURE_RELATIONSHIP);
                if (prc != null && prc.size() > 0) {
                    metadata.set(TikaCoreProperties.HAS_SIGNATURE, "true");
                }
            }

            MediaType type = null;
            String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
            if (mediaTypeString != null) {
                type = MediaType.parse(mediaTypeString);
            }
            if (type != null && OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
                // Not a supported type, delegate to Empty Parser
                EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
                return;
            }

            if (type == null || !OOXMLParser.SUPPORTED_TYPES.contains(type)) {
                // Get the type, and ensure it's one we handle
                type = OPCPackageDetector.detectOfficeOpenXML(pkg);
            }

            if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
                // Not a supported type, delegate to Empty Parser
                EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
                return;
            }
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            // Have the appropriate OOXML text extractor picked
            POIXMLTextExtractor poiExtractor = null;
            // This has already been set by OOXMLParser's call to configure()
            // We can rely on this being non-null.
            OfficeParserConfig config = context.get(OfficeParserConfig.class);
            if (config.isUseSAXDocxExtractor()) {
                poiExtractor = trySXWPF(pkg);
            }
            if (poiExtractor == null) {
                poiExtractor = tryXSLF(pkg, config.isUseSAXPptxExtractor());
            }
            // XPS always uses its dedicated extractor, replacing any earlier pick
            if (type.equals(OOXMLParser.XPS)) {
                poiExtractor = new XPSTextExtractor(pkg);
            }

            if (poiExtractor == null) {
                poiExtractor = EXTRACTOR_FACTORY.create(pkg);
            }

            // Wrap the POI extractor in the matching Tika decorator. Event-based
            // extractors are matched by extractor type first; the remaining
            // branches dispatch on the type of the backing document.
            POIXMLDocument document = poiExtractor.getDocument();
            OOXMLExtractor extractor;
            if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
                extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
            } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
                extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
            } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
                extractor = new SXWPFWordExtractorDecorator(metadata, context,
                        (XWPFEventBasedWordExtractor) poiExtractor);
                metadata.add(TikaCoreProperties.TIKA_PARSED_BY,
                        XWPFEventBasedWordExtractor.class.getCanonicalName());
            } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
                extractor = new SXSLFPowerPointExtractorDecorator(metadata, context,
                        (XSLFEventBasedPowerPointExtractor) poiExtractor);
                metadata.add(TikaCoreProperties.TIKA_PARSED_BY,
                        XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
            } else if (poiExtractor instanceof XPSTextExtractor) {
                extractor = new XPSExtractorDecorator(context, poiExtractor);
            } else if (document == null) {
                throw new TikaException(
                        "Expecting UserModel based POI OOXML extractor with a document, but none" +
                                " found. " +
                                "The extractor returned was a " + poiExtractor);
            } else if (document instanceof XMLSlideShow) {
                extractor = new XSLFPowerPointExtractorDecorator(metadata, context,
                        (org.apache.poi.xslf.extractor.XSLFExtractor) poiExtractor);
            } else if (document instanceof XWPFDocument) {
                extractor = new XWPFWordExtractorDecorator(metadata, context,
                        (XWPFWordExtractor) poiExtractor);
            } else {
                extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
            }


            // Get the bulk of the metadata first, so that it's accessible during
            //  parsing if desired by the client (see TIKA-1109)
            extractor.getMetadataExtractor().extract(metadata);

            // Extract the text, along with any in-document metadata
            extractor.getXHTML(baseHandler, metadata, context);
        } catch (IllegalArgumentException e) {
            if (e.getMessage() != null &&
                    e.getMessage().startsWith("No supported documents found")) {
                throw new TikaException("TIKA-418: RuntimeException while getting content" +
                        " for thmx and xps file types", e);
            } else {
                throw new TikaException("Error creating OOXML extractor", e);
            }
        } catch (OpenXML4JException | XmlException e) {
            throw new TikaException("Error creating OOXML extractor", e);
        } catch (RuntimeSAXException e) {
            // Unwrap the SAXException carried by the runtime wrapper
            throw (SAXException) e.getCause();
        } finally {
            if (pkg != null && mustRevertPackage) {
                pkg.revert();
            }
            if (tmpRepairedCopy != null) {
                boolean deleted = tmpRepairedCopy.delete();
                if (!deleted) {
                    LOG.warn("failed to delete tmp (repair) file: " +
                            tmpRepairedCopy.getAbsolutePath());
                }
            }
        }
    }

    /**
     * Decides whether a failure to open a package from a stream should be answered by
     * salvaging what can be read into a temp file and retrying.
     * <p>
     * An {@link UnsupportedZipFeatureException} qualifies only when its feature is
     * {@code DATA_DESCRIPTOR}. Any other {@link IOException} qualifies when it is an
     * {@link EOFException} or its message contains {@code "Truncated"}.
     *
     * @param e the exception raised while opening the package
     * @return {@code true} if a salvage attempt should be made, {@code false} if
     *         {@code e} should be rethrown
     */
    private static boolean isSalvageableOpenFailure(IOException e) {
        if (e instanceof UnsupportedZipFeatureException) {
            // Checked first: other unsupported features are never salvaged,
            // whatever their message says.
            return ((UnsupportedZipFeatureException) e).getFeature() ==
                    UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR;
        }
        if (e instanceof EOFException) {
            return true;
        }
        String message = e.getMessage();
        return message != null && message.contains("Truncated");
    }