in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java [264:377]
/**
 * Recursively walks a node of the PDF structure tree and writes the marked content
 * it references to {@code xhtml}.
 * <p>
 * The node is dispatched on its COS type:
 * <ul>
 *     <li>{@code COSArray}: every element is visited at the same depth.</li>
 *     <li>{@code COSObject} wrapping a {@code COSDictionary}: treated as a structure
 *     element; its {@code /S} name is mapped to an {@link HtmlTag} via
 *     {@code getTag(name, roleMap)}, its {@code /Pg} entry (if an indirect object)
 *     replaces the current page reference, and its {@code /K} entry is visited.</li>
 *     <li>{@code COSInteger}: treated as a marked-content id on the current page;
 *     the matching text from {@code paragraphs} is written, or buffered in
 *     {@code state.hrefAnchorBuilder} while inside a link.</li>
 *     <li>direct {@code COSDictionary}: an {@code /A} dictionary supplies
 *     {@code state.uri}; otherwise {@code /K} or {@code /OBJ} is visited.</li>
 * </ul>
 * Anything else is ignored.
 *
 * @param kids           the node (or array of nodes) to process
 * @param currentPageRef reference of the page that integer MCIDs are resolved against
 * @param depth          current recursion depth
 * @param paragraphs     extracted text keyed by marked-content id
 * @param roleMap        role map passed through to {@code getTag}
 * @throws IOException  if {@code depth} exceeds {@code MAX_RECURSION_DEPTH} (wrapping a
 *                      {@code TikaException}) or if writing fails
 * @throws SAXException if the underlying content handler fails
 */
private void recurse(COSBase kids, ObjectRef currentPageRef, int depth,
                     Map<MCID, String> paragraphs, Map<String, HtmlTag> roleMap)
        throws IOException, SAXException {
    if (depth > MAX_RECURSION_DEPTH) {
        throw new IOException(
                new TikaException("Exceeded max recursion depth " + MAX_RECURSION_DEPTH));
    }
    if (kids instanceof COSArray) {
        // array elements are siblings of each other, so the depth is not increased here
        for (COSBase k : ((COSArray) kids)) {
            recurse(k, currentPageRef, depth, paragraphs, roleMap);
        }
    } else if (kids instanceof COSObject &&
            ((COSObject) kids).getObject() instanceof COSDictionary) {
        //TODO should be merged with COSDictionary segment below?
        // and maybe dereference COSObject first, i.e. before the first "if"?
        // No, because we're using the object key for a map
        // However, we could replace ObjectRef with COSBase for currentPageRef.
        // This way we could also get rid of findPages because that logic is in the
        // iterator of PageTree which we get by calling PDDocument.getPages()
        COSDictionary dict = (COSDictionary) ((COSObject) kids).getObject();
        COSName type = dict.getCOSName(COSName.TYPE);
        if (COSName.OBJR.equals(type)) {
            recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1,
                    paragraphs, roleMap);
        }
        COSName n = dict.getCOSName(COSName.S);
        String name = "";
        if (n != null) {
            name = n.getName();
        }
        // getItem (not getDictionaryObject) so that indirect children stay wrapped
        // in their COSObject and are dispatched to this branch on the next call
        COSBase grandkids = dict.getItem(COSName.K);
        if (grandkids == null) {
            return;
        }
        COSBase pageBase = dict.getItem(COSName.PG);
        if (pageBase instanceof COSObject) {
            currentPageRef = new ObjectRef(((COSObject) pageBase).getKey().getNumber(),
                    ((COSObject) pageBase).getKey().getGeneration());
        }
        HtmlTag tag = getTag(name, roleMap);
        boolean startedLink = false;
        // remembers whether startElement was actually emitted for this node, so that
        // the matching endElement does not depend on state.inLink, which the
        // recursive call below may change
        boolean startedElement = false;
        if ("link".equals(tag.clazz)) {
            state.inLink = true;
            startedLink = true;
        }
        if (!state.inLink) {
            //TODO: currently suppressing span and lbody...
            // is this what we want to do? What else should we suppress?
            boolean ignoreTag = "span".equals(tag.tag) || "lbody".equals(tag.clazz);
            if (!ignoreTag) {
                if (tag.clazz != null && !tag.clazz.isBlank()) {
                    xhtml.startElement(tag.tag, "class", tag.clazz);
                } else {
                    xhtml.startElement(tag.tag);
                }
                startedElement = true;
            }
        }
        recurse(grandkids, currentPageRef, depth + 1, paragraphs, roleMap);
        if (startedLink) {
            // NOTE(review): writeLink() presumably flushes the buffered anchor text
            // and resets state.inLink -- confirm against its implementation
            writeLink();
        }
        if (startedElement) {
            xhtml.endElement(tag.tag);
        }
    } else if (kids instanceof COSInteger) {
        int mcidInt = ((COSInteger) kids).intValue();
        MCID mcid = new MCID(currentPageRef, mcidInt);
        if (paragraphs.containsKey(mcid)) {
            String text = paragraphs.get(mcid);
            if (state.inLink) {
                // inside a link the text is buffered rather than written directly
                state.hrefAnchorBuilder.append(text);
            } else {
                try {
                    //if it isn't a uri, output this anyhow
                    writeString(text);
                } catch (IOException e) {
                    handleCatchableIOE(e);
                }
            }
            state.processedMCIDs.add(mcid);
        } else {
            //TODO: log can't find mcid
        }
    } else if (kids instanceof COSDictionary) {
        //TODO: check for other types of dictionary?
        COSDictionary dict = (COSDictionary) kids;
        COSDictionary anchor = dict.getCOSDictionary(COSName.A);
        //check for subtype /Link ?
        //COSName subtype = obj.getCOSName(COSName.SUBTYPE);
        if (anchor != null) {
            state.uri = anchor.getString(COSName.URI);
        } else {
            if (dict.containsKey(COSName.K)) {
                recurse(dict.getDictionaryObject(COSName.K), currentPageRef, depth + 1,
                        paragraphs, roleMap);
            } else if (dict.containsKey(COSName.OBJ)) {
                recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1,
                        paragraphs, roleMap);
            }
        }
    } else {
        //TODO: handle a different object?
    }
}