private void extractContent()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java [177:260]


    private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders,
                                XHTMLContentHandler xhtml, String slideDesc) throws SAXException {
        for (XSLFShape sh : shapes) {

            if (sh instanceof XSLFTextShape) {
                XSLFTextShape txt = (XSLFTextShape) sh;
                Placeholder ph = txt.getTextType();
                if (skipPlaceholders && ph != null) {
                    continue;
                }
                boolean inHyperlink = false;
                for (XSLFTextParagraph p : txt.getTextParagraphs()) {
                    xhtml.startElement("p");
                    if (!config.isIncludeHeadersAndFooters() && p.isHeaderOrFooter()) {
                        continue;
                    }
                    for (XSLFTextRun run : p.getTextRuns()) {
                        //TODO: add check for targetmode=external into POI
                        //then check to confirm that the urls are actually
                        //external and not footnote refs via the current hack
                        Hyperlink hyperlink = run.getHyperlink();

                        if (hyperlink != null && hyperlink.getAddress() != null &&
                                !hyperlink.getAddress().contains("#_ftn")) {
                            xhtml.startElement("a", "href", hyperlink.getAddress());
                            inHyperlink = true;
                        }
                        xhtml.characters(run.getRawText());
                        if (inHyperlink == true) {
                            xhtml.endElement("a");
                        }
                        inHyperlink = false;
                    }
                    xhtml.endElement("p");
                }
            } else if (sh instanceof XSLFGroupShape) {
                // recurse into groups of shapes
                XSLFGroupShape group = (XSLFGroupShape) sh;
                extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc);
            } else if (sh instanceof XSLFTable) {
                //unlike tables in Word, ppt/x can't have recursive tables...I don't think
                extractTable((XSLFTable) sh, xhtml);
            } else if (sh instanceof XSLFGraphicFrame) {
                XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
                XmlObject[] sp = frame.getXmlObject().selectPath(
                        "declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
                if (sp != null) {
                    for (XmlObject emb : sp) {
                        XmlObject relIDAtt = emb.selectAttribute(new QName(
                                "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
                                "id"));
                        if (relIDAtt != null) {
                            String relID = relIDAtt.getDomNode().getNodeValue();
                            if (slideDesc != null) {
                                relID = slideDesc + relID;
                            }
                            AttributesImpl attributes = new AttributesImpl();
                            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                            attributes.addAttribute("", "id", "id", "CDATA", relID);
                            xhtml.startElement("div", attributes);
                            xhtml.endElement("div");
                        }
                    }
                }
            } else if (sh instanceof XSLFPictureShape) {
                if (!skipPlaceholders && (sh.getXmlObject() instanceof CTPicture)) {
                    CTPicture ctPic = ((CTPicture) sh.getXmlObject());
                    if (ctPic.getBlipFill() != null && ctPic.getBlipFill().getBlip() != null) {
                        String relID = ctPic.getBlipFill().getBlip().getEmbed();
                        if (relID != null) {
                            if (slideDesc != null) {
                                relID = slideDesc + relID;
                            }
                            AttributesImpl attributes = new AttributesImpl();
                            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                            attributes.addAttribute("", "id", "id", "CDATA", relID);
                            xhtml.startElement("div", attributes);
                            xhtml.endElement("div");
                        }
                    }
                }
            }
        }
    }