in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java [253:362]
/**
 * Parses an EPUB that is available as a random-access zip file.
 * <p>
 * The steps are:
 * <ol>
 *   <li>Locate the root OPF entry via {@code getRoot}; it is parsed once with the
 *       {@code opf} parser (SAX output is discarded, only {@code metadata} is
 *       passed along) and once with a {@code ContentOrderScraper} to collect the
 *       content items and the id-to-href/media-type map.</li>
 *   <li>If {@code isStrict} is set, require that every content item resolves to a
 *       readable zip entry.</li>
 *   <li>Run {@code extractMetadata} and {@code checkForDRM}.</li>
 *   <li>Parse each (X)HTML/XML content item, in scraper order, into
 *       {@code bodyHandler}.</li>
 *   <li>Pass every manifest item that was not parsed as content, and is not
 *       listed as encrypted, to the embedded document extractor.</li>
 * </ol>
 * A {@link SAXException} raised while parsing a content item does not abort the
 * loop (unless it signals that the write limit was reached); the first such
 * exception is rethrown after embedded files have been handled, with any later
 * ones attached as suppressed exceptions.
 *
 * @param zipFile     the EPUB container
 * @param bodyHandler handler that receives the parsed content documents
 * @param xhtml       handler passed to {@code handleEmbedded} for embedded items
 * @param metadata    metadata object passed to the OPF, content and embedded parsing steps
 * @param context     parse context
 * @param isStrict    if {@code true}, throw unless every content item maps to a
 *                    readable zip entry
 * @return the set of encrypted item paths reported by {@code checkForDRM}
 * @throws EpubZipException if the root OPF is missing or unreadable, if no content
 *                          items were found, or if the strict check fails
 * @throws SAXException     if the write limit was reached, or (deferred) if a
 *                          content document failed to parse
 */
private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler,
                                         XHTMLContentHandler xhtml, Metadata metadata,
                                         ParseContext context, boolean isStrict)
        throws IOException, TikaException, SAXException, EpubZipException {
    String rootOPF = getRoot(zipFile, context);
    if (rootOPF == null) {
        throw new EpubZipException();
    }
    ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
    if (zae == null || !zipFile.canReadEntryData(zae)) {
        throw new EpubZipException();
    }
    //first pass over the OPF; close the entry stream when done
    //so that it is not leaked
    try (InputStream is = zipFile.getInputStream(zae)) {
        opf.parse(is, new DefaultHandler(), metadata, context);
    }
    //second pass over the same entry: collect content order and the
    //id -> href/media-type map
    ContentOrderScraper contentOrderScraper = new ContentOrderScraper();
    try (InputStream is = zipFile.getInputStream(zae)) {
        XMLReaderUtils.parseSAX(is, contentOrderScraper, context);
    }
    //if no content items, false
    if (contentOrderScraper.contentItems.size() == 0) {
        throw new EpubZipException();
    }
    //hrefs are resolved against the directory that holds the root OPF
    String relativePath = "";
    int lastSlash = rootOPF.lastIndexOf("/");
    if (lastSlash > -1) {
        relativePath = rootOPF.substring(0, lastSlash + 1);
    }
    if (isStrict) {
        int found = 0;
        for (String id : contentOrderScraper.contentItems) {
            HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
            if (hRefMediaPair != null && hRefMediaPair.href != null) {
                ZipArchiveEntry candidate =
                        zipFile.getEntry(relativePath + hRefMediaPair.href);
                if (candidate != null && zipFile.canReadEntryData(candidate)) {
                    found++;
                }
            }
        }
        //if not perfect match btwn items and readable items
        //return false
        if (found != contentOrderScraper.contentItems.size()) {
            throw new EpubZipException();
        }
    }
    extractMetadata(zipFile, metadata, context);
    Set<String> encryptedItems = checkForDRM(zipFile);
    Set<String> processed = new HashSet<>();
    //first SAXException hit while parsing content; rethrown at the very end
    //so that the remaining content and the embedded files are still handled
    SAXException firstSaxException = null;
    for (String id : contentOrderScraper.contentItems) {
        HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
        if (hRefMediaPair == null || hRefMediaPair.href == null) {
            continue;
        }
        //we need to test for xhtml/xml because the content parser
        //expects that.
        boolean shouldParse = false;
        String href = hRefMediaPair.href.toLowerCase(Locale.US);
        if (hRefMediaPair.media != null) {
            //a declared media type wins; the extension is only consulted
            //when no media type is present
            String mediaType = hRefMediaPair.media.toLowerCase(Locale.US);
            if (mediaType.contains("html")) {
                shouldParse = true;
            }
        } else if (href.endsWith("htm") || href.endsWith("html") || href.endsWith(".xml")) {
            shouldParse = true;
        }
        if (!shouldParse) {
            continue;
        }
        String path = relativePath + hRefMediaPair.href;
        //if content is encrypted, do not parse it, throw an exception now
        if (encryptedItems.contains(path)) {
            maybeThrowEncryptedException(encryptedItems);
        }
        ZipArchiveEntry contentEntry = zipFile.getEntry(path);
        if (contentEntry == null) {
            continue;
        }
        try (InputStream is = zipFile.getInputStream(contentEntry)) {
            content.parse(is, bodyHandler, metadata, context);
        } catch (SAXException e) {
            if (WriteLimitReachedException.isWriteLimitReached(e)) {
                throw e;
            }
            if (firstSaxException == null) {
                firstSaxException = e;
            } else if (e != firstSaxException) {
                //keep later failures visible instead of dropping them
                firstSaxException.addSuppressed(e);
            }
        } finally {
            //mark as processed even on failure so that the item is not
            //offered again as an embedded file below
            processed.add(id);
        }
    }
    //now handle embedded files
    EmbeddedDocumentExtractor embeddedDocumentExtractor =
            EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    for (String id : contentOrderScraper.locationMap.keySet()) {
        if (processed.contains(id)) {
            continue;
        }
        HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
        if (hRefMediaPair == null) {
            continue;
        }
        String fullPath = relativePath + hRefMediaPair.href;
        //encrypted items are skipped silently here
        if (encryptedItems.contains(fullPath)) {
            continue;
        }
        if (shouldHandleEmbedded(hRefMediaPair.media)) {
            handleEmbedded(zipFile, relativePath, hRefMediaPair, embeddedDocumentExtractor,
                    xhtml, metadata);
        }
    }
    //throw SAXException if any from the parse of the body contents
    if (firstSaxException != null) {
        throw firstSaxException;
    }
    return encryptedItems;
}