in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java [181:312]
/**
 * Writes one paragraph to the XHTML handler.
 * <p>
 * In order, this emits: section headers (when the paragraph carries section
 * properties and headers/footers are enabled), the opening paragraph element,
 * the list number, embedded-object placeholders, bookmark anchors, every run
 * (with hyperlinks wrapped in {@code <a>}), comment text, footnote text, any
 * paragraphs found inside text boxes (via a recursive call), the closing
 * paragraph element, and finally the section footers.
 *
 * @param paragraph   the paragraph to extract
 * @param listManager passed through to the header/footer, paragraph-number
 *                    and nested-paragraph handling
 * @param xhtml       the handler that receives the generated elements and text
 * @throws SAXException if the handler fails
 * @throws XmlException if an embedded text-box paragraph cannot be parsed
 * @throws IOException  declared for the helpers called from here
 */
private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager,
                              XHTMLContentHandler xhtml)
        throws SAXException, XmlException, IOException {
    final CTP ctp = paragraph.getCTP();

    // A paragraph that carries section properties closes a section, which
    // may have headers and footers of its own.
    XWPFHeaderFooterPolicy sectionPolicy = null;
    if (ctp.getPPr() != null) {
        CTSectPr sectionProps = ctp.getPPr().getSectPr();
        if (sectionProps != null && config.isIncludeHeadersAndFooters()) {
            sectionPolicy = new XWPFHeaderFooterPolicy(document, sectionProps);
            extractHeaders(xhtml, sectionPolicy, listManager);
        }
    }

    // Default to a plain paragraph; the named style may select another
    // element (e.g. a heading) and/or a class attribute.
    String elementName = "p";
    String cssClass = null;
    final String styleId = paragraph.getStyleID();
    //TIKA-2144 styles may be null
    if (styleId != null && styles != null) {
        XWPFStyle style = styles.getStyle(styleId);
        if (style != null) {
            String styleName = style.getName();
            if (styleName != null) {
                TagAndStyle tagAndStyle = WordExtractor.buildParagraphTagAndStyle(styleName,
                        paragraph.getPartType() == BodyType.TABLECELL);
                elementName = tagAndStyle.getTag();
                cssClass = tagAndStyle.getStyleClass();
            }
        }
    }

    if (cssClass != null) {
        xhtml.startElement(elementName, "class", cssClass);
    } else {
        xhtml.startElement(elementName);
    }

    writeParagraphNumber(paragraph, listManager, xhtml);

    // Placeholders for embedded documents come before the text
    processEmbeddedObjects(paragraph.getRuns(), xhtml);

    // Bookmarks are emitted as empty named anchors at the start of the
    // paragraph rather than at their exact position within it.
    final int bookmarkCount = ctp.sizeOfBookmarkStartArray();
    for (int idx = 0; idx < bookmarkCount; idx++) {
        CTBookmark mark = ctp.getBookmarkStartArray(idx);
        xhtml.startElement("a", "name", mark.getName());
        xhtml.endElement("a");
    }

    Deque<FormattingUtils.Tag> openStyles = new ArrayDeque<>();
    // Hyperlink runs do not always carry an id, so track both the last id
    // seen and whether an <a> element is currently open.
    String previousLinkId = null;
    boolean linkOpen = false;

    for (IRunElement element : paragraph.getIRuns()) {
        if (element instanceof XWPFHyperlinkRun) {
            XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) element;
            String linkId = linkRun.getHyperlinkId();
            boolean continuesPreviousLink = linkId != null && linkId.equals(previousLinkId);
            if (!continuesPreviousLink) {
                if (linkOpen) {
                    // finish the link that was in progress
                    FormattingUtils.closeStyleTags(xhtml, openStyles);
                    xhtml.endElement("a");
                    linkOpen = false;
                }
                previousLinkId = linkId;
                FormattingUtils.closeStyleTags(xhtml, openStyles);

                XWPFHyperlink target = linkRun.getHyperlink(document);
                String url = (target == null) ? null : target.getURL();
                if (url != null) {
                    xhtml.startElement("a", "href", url);
                    linkOpen = true;
                } else {
                    // no external target: fall back to an in-document anchor
                    String anchor = linkRun.getAnchor();
                    if (anchor != null && !anchor.isEmpty()) {
                        xhtml.startElement("a", "href", "#" + anchor);
                        linkOpen = true;
                    }
                }
            }
        } else if (linkOpen) {
            // a non-hyperlink run ends whatever link was open
            FormattingUtils.closeStyleTags(xhtml, openStyles);
            xhtml.endElement("a");
            previousLinkId = null;
            linkOpen = false;
        }

        if (!(element instanceof XWPFSDT)) {
            processRun((XWPFRun) element, paragraph, xhtml, openStyles);
        } else {
            // formatting inside an sdt is ignored for now, so close any
            // open style tags before handing it off
            FormattingUtils.closeStyleTags(xhtml, openStyles);
            processSDTRun((XWPFSDT) element, xhtml);
        }
    }

    FormattingUtils.closeStyleTags(xhtml, openStyles);
    if (linkOpen) {
        xhtml.endElement("a");
    }

    // Comment text attached to this paragraph
    String commentary = new XWPFCommentsDecorator(paragraph, null).getCommentText();
    if (commentary != null && !commentary.isEmpty()) {
        xhtml.characters(commentary);
    }

    // Footnote text, followed by a newline
    String footnotes = paragraph.getFootnoteText();
    if (footnotes != null && !footnotes.isEmpty()) {
        xhtml.characters(footnotes + "\n");
    }

    // Paragraphs inside text boxes. The "w:txbxContent//" step matches all
    // descendant paragraphs, not only direct children -- TIKA-2807
    if (config.isIncludeShapeBasedContent()) {
        final String textBoxParagraphs =
                "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent//w:p";
        for (XmlObject boxed : ctp.selectPath(textBoxParagraphs)) {
            CTP boxedCtp = CTP.Factory.parse(boxed.xmlText());
            XWPFParagraph boxedParagraph = new XWPFParagraph(boxedCtp, paragraph.getBody());
            extractParagraph(boxedParagraph, listManager, xhtml);
        }
    }

    // Close the paragraph, then emit the section footers if any
    xhtml.endElement(elementName);
    if (sectionPolicy != null && config.isIncludeHeadersAndFooters()) {
        extractFooters(xhtml, sectionPolicy, listManager);
    }
}