private int handleParagraph()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java [272:391]


    /**
     * Writes a single paragraph to the XHTML output, or, when the paragraph opens a
     * top-level table, writes that whole table.
     * <p>
     * Table case: if {@code p} is inside a table that is deeper than
     * {@code parentTableLevel} and {@code parentTableLevel} is 0, the table is looked up
     * via {@code r.getTable(p)} and emitted as {@code table/tbody/tr/td}, recursing into
     * this method for every paragraph of every cell (with the cell as the range).
     * <p>
     * Plain case: whitespace-only paragraphs are skipped. Otherwise the paragraph is
     * wrapped in the tag (and optional {@code class}) chosen from its style name, any
     * list numbering is written first, and then each character run is dispatched to the
     * matching handler (field / floating picture / inline picture / plain text).
     *
     * @param p                the paragraph to process
     * @param parentTableLevel table level of the caller; 0 when not inside a table
     * @param r                range containing {@code p}; used to resolve the table and
     *                         the field separator run
     * @param document         document providing the style sheet and the fields
     * @param docPart          document part passed to the field lookup
     * @param pictures         source of floating (unclaimed) and inline pictures
     * @param pictureTable     used to test whether a character run carries a picture
     * @param listManager      supplies the formatted list number for list paragraphs
     * @param xhtml            handler receiving the generated elements and text
     * @return {@code t.numParagraphs() - 1} when a table was emitted, otherwise 0
     *         (presumably the number of following paragraphs the caller should skip
     *         because they were already consumed here - verify against the caller)
     * @throws SAXException  if the XHTML handler fails
     * @throws IOException   declared for the delegated handlers
     * @throws TikaException declared for the delegated handlers
     */
    private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
                                FieldsDocumentPart docPart, PicturesSource pictures,
                                PicturesTable pictureTable, ListManager listManager,
                                XHTMLContentHandler xhtml)
            throws SAXException, IOException, TikaException {
        // Note - a poi bug means we can't currently properly recurse
        //  into nested tables, so currently we don't
        // (hence the parentTableLevel == 0 condition: only top-level tables are expanded)
        if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
            Table t = r.getTable(p);
            xhtml.startElement("table");
            xhtml.startElement("tbody");
            for (int rn = 0; rn < t.numRows(); rn++) {
                TableRow row = t.getRow(rn);
                xhtml.startElement("tr");
                for (int cn = 0; cn < row.numCells(); cn++) {
                    TableCell cell = row.getCell(cn);
                    xhtml.startElement("td");

                    // Cell paragraphs are handled with the table's level as parent, so
                    // they take the plain-paragraph path below instead of re-entering
                    // this branch.
                    for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                        Paragraph cellP = cell.getParagraph(pn);
                        handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures,
                                pictureTable, listManager, xhtml);
                    }
                    xhtml.endElement("td");
                }
                xhtml.endElement("tr");
            }
            xhtml.endElement("tbody");
            xhtml.endElement("table");
            // All of the table's paragraphs were written above; report the count
            // beyond the current one.
            return (t.numParagraphs() - 1);
        }

        String text = p.text();
        if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
            // Skip empty paragraphs
            return 0;
        }

        TagAndStyle tas;
        String numbering = null;

        // Only consult the style sheet when the paragraph's style index is in range.
        if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
            StyleDescription style =
                    document.getStyleSheet().getStyleDescription(p.getStyleIndex());
            if (style != null && style.getName() != null && style.getName().length() > 0) {
                // List numbering is only resolved for paragraphs that have a named style.
                if (p.isInList()) {
                    numbering = listManager.getFormattedNumber(p);
                }
                tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
            } else {
                tas = new TagAndStyle("p", null);
            }
        } else {
            tas = new TagAndStyle("p", null);
        }

        if (tas.getStyleClass() != null) {
            xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
        } else {
            xhtml.startElement(tas.getTag());
        }

        if (numbering != null) {
            xhtml.characters(numbering);
        }

        for (int j = 0; j < p.numCharacterRuns(); j++) {
            CharacterRun cr = p.getCharacterRun(j);
            // Fetch the run text once; it is inspected several times below.
            String runText = cr.text();

            // FIELD_BEGIN_MARK:
            // A zero-length run encodes to an empty byte array, so check for that
            // before indexing the first byte.
            if (!runText.isEmpty() && runText.getBytes(UTF_8)[0] == 0x13) {
                Field field =
                        document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
                // 58 is an embedded document
                // 56 is a document link
                if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                    // Embedded Object: add a <div
                    // class="embedded" id="_X"/> so consumer can see where
                    // in the main text each embedded document
                    // occurred:
                    String id = "_unknown_id";
                    //this can return null (TIKA-1956)
                    CharacterRun mscr = field.getMarkSeparatorCharacterRun(r);
                    if (mscr != null) {
                        id = "_" + mscr.getPicOffset();
                    }
                    AttributesImpl attributes = new AttributesImpl();
                    attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                    attributes.addAttribute("", "id", "id", "CDATA", id);
                    xhtml.startElement("div", attributes);
                    xhtml.endElement("div");
                }
            }

            if (runText.equals("\u0013")) {
                // The helper's return value is added to j so the runs it already
                // handled are not visited again.
                j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
            } else if (runText.startsWith("\b")) { //\u0008"
                // Floating Picture(s): one picture is taken per character of the run
                for (int pn = 0; pn < runText.length(); pn++) {
                    // Assume they're in the order from the unclaimed list...
                    Picture picture = pictures.nextUnclaimed();

                    // Output
                    handlePictureCharacterRun(cr, picture, pictures, xhtml);
                }
            } else if (pictureTable.hasPicture(cr)) {
                // Inline Picture
                Picture picture = pictures.getFor(cr);
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            } else {
                handleCharacterRun(cr, tas.isHeading(), xhtml);
            }
        }

        closeStyleElements(false, xhtml);

        xhtml.endElement(tas.getTag());

        return 0;
    }