private void extractParagraph()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java [181:312]


    /**
     * Writes one Word paragraph to the XHTML output.
     * <p>
     * The steps run in this order: section headers (only when the paragraph carries
     * section properties and headers/footers are enabled), the opening element (a
     * {@code p} unless the paragraph style maps to something else), the paragraph
     * number, embedded-object placeholders, one empty {@code <a name>} per bookmark
     * start, the runs (hyperlink runs are wrapped in {@code <a href>}), comment text,
     * footnote text, paragraphs nested inside text boxes (only when shape-based
     * content is enabled), the closing element, and finally the section footers.
     *
     * @param paragraph   the paragraph to extract
     * @param listManager handed on to the numbering, header and footer helpers
     * @param xhtml       receiver of the generated XHTML events
     * @throws SAXException propagated from the content handler and the helpers called here
     * @throws XmlException propagated from parsing embedded text box paragraphs and
     *                      from the helpers called here
     * @throws IOException  propagated from the helpers called here
     */
    private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager,
                                  XHTMLContentHandler xhtml)
            throws SAXException, XmlException, IOException {
        CTP ctp = paragraph.getCTP();

        // A paragraph may really be the start of a whole new section, in which
        // case it can bring its own headers and footers along; handle that first.
        XWPFHeaderFooterPolicy sectionPolicy = null;
        if (ctp.getPPr() != null) {
            CTSectPr sectionProperties = ctp.getPPr().getSectPr();
            if (sectionProperties != null && config.isIncludeHeadersAndFooters()) {
                sectionPolicy = new XWPFHeaderFooterPolicy(document, sectionProperties);
                extractHeaders(xhtml, sectionPolicy, listManager);
            }
        }

        // Plain paragraph by default; the style name may turn it into a heading
        // or attach a class.
        String elementName = "p";
        String cssClass = null;
        String styleId = paragraph.getStyleID();
        //TIKA-2144 styles can be null, so guard the lookup
        if (styleId != null && styles != null) {
            XWPFStyle paragraphStyle = styles.getStyle(styleId);
            String styleName = paragraphStyle == null ? null : paragraphStyle.getName();
            if (styleName != null) {
                boolean inTableCell = paragraph.getPartType() == BodyType.TABLECELL;
                TagAndStyle tagAndStyle =
                        WordExtractor.buildParagraphTagAndStyle(styleName, inTableCell);
                elementName = tagAndStyle.getTag();
                cssClass = tagAndStyle.getStyleClass();
            }
        }

        if (cssClass != null) {
            xhtml.startElement(elementName, "class", cssClass);
        } else {
            xhtml.startElement(elementName);
        }

        writeParagraphNumber(paragraph, listManager, xhtml);

        // Placeholders for any embedded documents
        processEmbeddedObjects(paragraph.getRuns(), xhtml);

        // Bookmarks are emitted at the start of their paragraph rather than at
        // their exact position inside it.
        int bookmarkCount = ctp.sizeOfBookmarkStartArray();
        for (int idx = 0; idx < bookmarkCount; idx++) {
            CTBookmark bookmarkStart = ctp.getBookmarkStartArray(idx);
            xhtml.startElement("a", "name", bookmarkStart.getName());
            xhtml.endElement("a");
        }

        Deque<FormattingUtils.Tag> openStyleTags = new ArrayDeque<>();

        // Hyperlink runs do not always carry an id; a run without one is never
        // treated as a continuation of the previous link.
        String currentLinkId = null;
        boolean linkOpen = false;
        for (IRunElement element : paragraph.getIRuns()) {
            if (element instanceof XWPFHyperlinkRun) {
                XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) element;
                String linkId = linkRun.getHyperlinkId();
                boolean continuesCurrentLink = linkId != null && linkId.equals(currentLinkId);
                if (!continuesCurrentLink) {
                    if (linkOpen) {
                        // finish the previous link before starting the next
                        FormattingUtils.closeStyleTags(xhtml, openStyleTags);
                        xhtml.endElement("a");
                        linkOpen = false;
                    }
                    currentLinkId = linkId;
                    FormattingUtils.closeStyleTags(xhtml, openStyleTags);
                    XWPFHyperlink target = linkRun.getHyperlink(document);
                    String url = target == null ? null : target.getURL();
                    if (url != null) {
                        xhtml.startElement("a", "href", url);
                        linkOpen = true;
                    } else {
                        // no external target: fall back to an in-document anchor
                        String anchor = linkRun.getAnchor();
                        if (anchor != null && !anchor.isEmpty()) {
                            xhtml.startElement("a", "href", "#" + anchor);
                            linkOpen = true;
                        }
                    }
                }
            } else if (linkOpen) {
                // an ordinary run directly after a hyperlink ends that link
                FormattingUtils.closeStyleTags(xhtml, openStyleTags);
                xhtml.endElement("a");
                currentLinkId = null;
                linkOpen = false;
            }

            if (element instanceof XWPFSDT) {
                // formatting inside an sdt is ignored for now, so drop whatever
                // style tags are open before handing it off
                FormattingUtils.closeStyleTags(xhtml, openStyleTags);
                processSDTRun((XWPFSDT) element, xhtml);
            } else {
                processRun((XWPFRun) element, paragraph, xhtml, openStyleTags);
            }
        }
        FormattingUtils.closeStyleTags(xhtml, openStyleTags);
        if (linkOpen) {
            xhtml.endElement("a");
        }

        // Comments attached to the paragraph
        XWPFCommentsDecorator commentsDecorator = new XWPFCommentsDecorator(paragraph, null);
        String commentText = commentsDecorator.getCommentText();
        if (commentText != null && !commentText.isEmpty()) {
            xhtml.characters(commentText);
        }

        String footnoteText = paragraph.getFootnoteText();
        if (footnoteText != null && !footnoteText.isEmpty()) {
            xhtml.characters(footnoteText + "\n");
        }

        // Paragraphs living inside text boxes. The "w:txbxContent//" step picks up
        // every descendant paragraph, not only the direct children -- TIKA-2807
        if (config.isIncludeShapeBasedContent()) {
            String textBoxParagraphPath =
                    "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent//w:p";
            for (XmlObject match : ctp.selectPath(textBoxParagraphPath)) {
                CTP nestedCtp = CTP.Factory.parse(match.xmlText());
                XWPFParagraph nestedParagraph = new XWPFParagraph(nestedCtp, paragraph.getBody());
                extractParagraph(nestedParagraph, listManager, xhtml);
            }
        }

        // Close the element opened above
        xhtml.endElement(elementName);

        if (sectionPolicy != null && config.isIncludeHeadersAndFooters()) {
            extractFooters(xhtml, sectionPolicy, listManager);
        }
    }