private Set bufferedParseZipFile()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java [253:362]


    /**
     * Parses an EPUB held in a random-access {@link ZipFile}: reads the root OPF,
     * parses the (x)html content items in the order the OPF lists them, and then
     * hands the remaining manifest items to the embedded document extractor.
     *
     * @param zipFile     the zip container to read entries from
     * @param bodyHandler handler that receives the parsed body content
     * @param xhtml       handler passed on to the embedded-file handling
     * @param metadata    metadata object populated while parsing
     * @param context     the parse context
     * @param isStrict    if {@code true}, every content item listed in the OPF must
     *                    resolve to a readable zip entry or the parse is abandoned
     * @return the set of encrypted item paths reported by {@code checkForDRM}
     * @throws EpubZipException if the root OPF cannot be located or read, if it lists
     *                          no content items, or (in strict mode) if any listed
     *                          content item is missing or unreadable
     * @throws SAXException     if the write limit is reached, or (after embedded files
     *                          have been handled) if parsing any content item failed
     * @throws IOException      on failure to read from the zip file
     * @throws TikaException    propagated from the delegated parsers/helpers
     */
    private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler,
                                         XHTMLContentHandler xhtml, Metadata metadata,
                                         ParseContext context, boolean isStrict)
            throws IOException, TikaException, SAXException, EpubZipException {

        String rootOPF = getRoot(zipFile, context);
        if (rootOPF == null) {
            throw new EpubZipException();
        }
        ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
        if (zae == null || !zipFile.canReadEntryData(zae)) {
            throw new EpubZipException();
        }
        //close the entry stream once the opf parser is done with it,
        //whether or not the parse succeeds
        try (InputStream is = zipFile.getInputStream(zae)) {
            opf.parse(is, new DefaultHandler(), metadata, context);
        }

        //second pass over the same entry to collect the content order
        ContentOrderScraper contentOrderScraper = new ContentOrderScraper();
        try (InputStream is = zipFile.getInputStream(zae)) {
            XMLReaderUtils.parseSAX(is, contentOrderScraper, context);
        }
        //if no content items, give up on this parse strategy
        if (contentOrderScraper.contentItems.isEmpty()) {
            throw new EpubZipException();
        }
        //hrefs are resolved against the directory that holds the root OPF
        String relativePath = "";
        int lastSlash = rootOPF.lastIndexOf('/');
        if (lastSlash > -1) {
            relativePath = rootOPF.substring(0, lastSlash + 1);
        }

        if (isStrict) {
            int found = 0;
            for (String id : contentOrderScraper.contentItems) {
                HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
                if (hRefMediaPair != null && hRefMediaPair.href != null) {
                    zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
                    if (zae != null && zipFile.canReadEntryData(zae)) {
                        found++;
                    }
                }
            }
            //if not perfect match btwn items and readable items, bail out
            if (found != contentOrderScraper.contentItems.size()) {
                throw new EpubZipException();
            }
        }

        extractMetadata(zipFile, metadata, context);
        Set<String> encryptedItems = checkForDRM(zipFile);
        Set<String> processed = new HashSet<>();
        //SAXExceptions from individual content items are deferred so that the
        //remaining items and the embedded files still get processed
        Set<SAXException> saxExceptions = new HashSet<>();
        for (String id : contentOrderScraper.contentItems) {
            HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
            if (hRefMediaPair != null && hRefMediaPair.href != null) {
                //we need to test for xhtml/xml because the content parser
                //expects that.
                boolean shouldParse = false;
                String href = hRefMediaPair.href.toLowerCase(Locale.US);
                if (hRefMediaPair.media != null) {
                    String mediaType = hRefMediaPair.media.toLowerCase(Locale.US);
                    if (mediaType.contains("html")) {
                        shouldParse = true;
                    }
                } else if (href.endsWith("htm") || href.endsWith("html") || href.endsWith(".xml")) {
                    //no media type declared: fall back to the file name suffix
                    shouldParse = true;
                }
                if (shouldParse) {
                    String path = relativePath + hRefMediaPair.href;
                    //if content is encrypted, do not parse it, throw an exception now
                    if (encryptedItems.contains(path)) {
                        maybeThrowEncryptedException(encryptedItems);
                    }
                    zae = zipFile.getEntry(path);
                    if (zae != null) {
                        try (InputStream is = zipFile.getInputStream(zae)) {
                            content.parse(is, bodyHandler, metadata, context);
                        } catch (SAXException e) {
                            //a write limit must stop the parse immediately
                            if (WriteLimitReachedException.isWriteLimitReached(e)) {
                                throw e;
                            }
                            saxExceptions.add(e);
                        } finally {
                            //mark as processed even on failure so the item is not
                            //handled again as an embedded file below
                            processed.add(id);
                        }
                    }
                }
            }
        }

        //now handle embedded files
        EmbeddedDocumentExtractor embeddedDocumentExtractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        for (String id : contentOrderScraper.locationMap.keySet()) {
            if (!processed.contains(id)) {
                HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
                //same guard as the loops above: without an href there is no
                //entry path to look up
                if (hRefMediaPair == null || hRefMediaPair.href == null) {
                    continue;
                }
                String fullPath = relativePath + hRefMediaPair.href;
                //encrypted items are skipped rather than handed to the extractor
                if (encryptedItems.contains(fullPath)) {
                    continue;
                }
                if (shouldHandleEmbedded(hRefMediaPair.media)) {
                    handleEmbedded(zipFile, relativePath, hRefMediaPair, embeddedDocumentExtractor,
                            xhtml, metadata);
                }
            }
        }
        //throw SAXException if any from the parse of the body contents;
        //only one is thrown, and since the set is unordered which one is unspecified
        for (SAXException e : saxExceptions) {
            throw e;
        }
        return encryptedItems;
    }