private void recurse()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java [264:377]


    /**
     * Recursively processes {@code kids}, dispatching on its runtime type:
     * <ul>
     *   <li>{@code COSArray}: every element is processed with the same depth and page
     *       reference as the array itself.</li>
     *   <li>{@code COSObject} that resolves to a {@code COSDictionary}: an {@code OBJR}
     *       dictionary has its {@code /OBJ} entry processed first; then, if a {@code /K}
     *       entry exists, the tag looked up for {@code /S} is opened (unless suppressed or
     *       inside a link), {@code /K} is processed, and the tag is closed again.</li>
     *   <li>{@code COSInteger}: used as an MCID on the current page reference; when
     *       {@code paragraphs} has an entry for it, the text goes to the link anchor buffer
     *       (inside a link) or is written out, and the MCID is recorded as processed.</li>
     *   <li>direct {@code COSDictionary}: an {@code /A} dictionary supplies
     *       {@code state.uri}; otherwise {@code /K}, or failing that {@code /OBJ}, is
     *       processed.</li>
     *   <li>anything else is ignored.</li>
     * </ul>
     *
     * @param kids           the object to process
     * @param currentPageRef page reference used to build MCID keys; replaced for the
     *                       subtree when a dictionary carries an indirect {@code /PG} entry
     * @param depth          current recursion depth
     * @param paragraphs     text keyed by MCID
     * @param roleMap        passed through to {@code getTag} when resolving a tag
     * @throws IOException  if {@code depth} exceeds {@code MAX_RECURSION_DEPTH} (wrapping a
     *                      {@code TikaException}), or propagated from nested processing
     * @throws SAXException propagated from the calls made while emitting output
     */
    private void recurse(COSBase kids, ObjectRef currentPageRef, int depth,
                         Map<MCID, String> paragraphs, Map<String, HtmlTag> roleMap)
            throws IOException, SAXException {

        if (depth > MAX_RECURSION_DEPTH) {
            throw new IOException(
                    new TikaException("Exceeded max recursion depth " + MAX_RECURSION_DEPTH));
        }

        if (kids instanceof COSArray) {
            // Elements are visited at the array's own depth; only dictionaries add a level.
            for (COSBase element : (COSArray) kids) {
                recurse(element, currentPageRef, depth, paragraphs, roleMap);
            }
            return;
        }

        // Only indirect objects are resolved here; everything else stays null so that the
        // type checks further down still see the original object.
        COSBase referenced = kids instanceof COSObject ? ((COSObject) kids).getObject() : null;
        if (referenced instanceof COSDictionary) {
            //TODO should this be merged with the direct COSDictionary handling below,
            // and the COSObject dereferenced up front, before the array check?
            // No, because the object key is used for a map.
            // However, ObjectRef could be replaced with COSBase for currentPageRef, which
            // would also make findPages unnecessary: that logic lives in the iterator of
            // the PageTree returned by PDDocument.getPages().
            COSDictionary element = (COSDictionary) referenced;
            if (COSName.OBJR.equals(element.getCOSName(COSName.TYPE))) {
                recurse(element.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1,
                        paragraphs, roleMap);
            }

            COSName structType = element.getCOSName(COSName.S);
            String structName = structType == null ? "" : structType.getName();

            COSBase children = element.getItem(COSName.K);
            if (children == null) {
                return;
            }

            // An indirect /PG entry overrides the page reference for this subtree only.
            ObjectRef pageRef = currentPageRef;
            COSBase pageEntry = element.getItem(COSName.PG);
            if (pageEntry instanceof COSObject) {
                COSObject pageObject = (COSObject) pageEntry;
                pageRef = new ObjectRef(pageObject.getKey().getNumber(),
                        pageObject.getKey().getGeneration());
            }

            HtmlTag tag = getTag(structName, roleMap);
            final boolean opensLink = "link".equals(tag.clazz);
            if (opensLink) {
                state.inLink = true;
            }

            boolean suppressed = false;
            if (!state.inLink) {
                //TODO: span and lbody are currently suppressed...
                // is that what we want?  What else should be suppressed?
                suppressed = "span".equals(tag.tag) || "lbody".equals(tag.clazz);
                if (!suppressed) {
                    if (tag.clazz == null || tag.clazz.isBlank()) {
                        xhtml.startElement(tag.tag);
                    } else {
                        xhtml.startElement(tag.tag, "class", tag.clazz);
                    }
                }
            }

            recurse(children, pageRef, depth + 1, paragraphs, roleMap);

            if (opensLink) {
                writeLink();
            }
            // The link state is re-read here, after the subtree has been processed.
            if (!(state.inLink || opensLink || suppressed)) {
                xhtml.endElement(tag.tag);
            }
            return;
        }

        if (kids instanceof COSInteger) {
            MCID mcid = new MCID(currentPageRef, ((COSInteger) kids).intValue());
            if (!paragraphs.containsKey(mcid)) {
                //TODO: log can't find mcid
                return;
            }
            String text = paragraphs.get(mcid);
            if (state.inLink) {
                state.hrefAnchorBuilder.append(text);
            } else {
                try {
                    // not inside a link, so emit the text directly
                    writeString(text);
                } catch (IOException e) {
                    handleCatchableIOE(e);
                }
            }
            state.processedMCIDs.add(mcid);
            return;
        }

        if (kids instanceof COSDictionary) {
            //TODO: check for other types of dictionary?
            COSDictionary direct = (COSDictionary) kids;
            COSDictionary action = direct.getCOSDictionary(COSName.A);
            //check for subtype /Link ?
            if (action != null) {
                state.uri = action.getString(COSName.URI);
                return;
            }
            // /K takes precedence; /OBJ is only consulted when /K is absent.
            COSName childKey = null;
            if (direct.containsKey(COSName.K)) {
                childKey = COSName.K;
            } else if (direct.containsKey(COSName.OBJ)) {
                childKey = COSName.OBJ;
            }
            if (childKey != null) {
                recurse(direct.getDictionaryObject(childKey), currentPageRef, depth + 1,
                        paragraphs, roleMap);
            }
            return;
        }

        //TODO: handle a different object?
    }