public void startElement()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java [182:338]


    /**
     * SAX start-element callback: updates this handler's parse state and forwards
     * structural events (paragraphs, tables, bookmarks, hyperlinks, notes, embedded
     * object references) to {@code bodyContentsHandler}, dispatching on the element's
     * local name.
     * <p>
     * Processing order:
     * <ol>
     *   <li>If the previous start element was {@code P} and this one is not
     *       {@code PPR}, {@code startParagraph(currPProperties)} is emitted.</li>
     *   <li>Elements in the {@code MC_NS} namespace bump the Choice/Fallback depth
     *       counters; while the Choice depth is positive the element is otherwise
     *       ignored.</li>
     *   <li>When text boxes are excluded, a {@code TEXTBOX} element only sets
     *       {@code inTextBox} and is otherwise ignored.</li>
     *   <li>Everything else goes through the local-name dispatch chain below.</li>
     * </ol>
     *
     * @param uri       namespace URI of the element; may be {@code null}
     * @param localName local name of the element; all dispatch is based on this value
     * @param qName     qualified name of the element; not used by this method
     * @param atts      attributes attached to the element
     * @throws SAXException declared by the signature; presumably propagated from the
     *                      {@code bodyContentsHandler} callbacks or
     *                      {@code startEditedSection} -- TODO confirm
     */
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd

        // The paragraph start is deferred by one element: it is only emitted here
        // when the element following <p> is not its properties element.
        // NOTE(review): presumably the PPR case is started elsewhere once the
        // paragraph properties have been collected -- confirm against endElement.
        if (lastStartElementWasP && !PPR.equals(localName)) {
            bodyContentsHandler.startParagraph(currPProperties);
        }

        lastStartElementWasP = false;

        if (uri != null && uri.equals(MC_NS)) {
            if (CHOICE.equals(localName)) {
                inACChoiceDepth++;
            } else if (FALLBACK.equals(localName)) {
                inACFallbackDepth++;
            }
        }

        // Nothing inside a Choice element is processed (this includes the
        // Choice element itself, whose depth was just incremented).
        if (inACChoiceDepth > 0) {
            return;
        }

        // Constant-first comparison keeps this check null-safe and consistent
        // with every other local-name comparison in this method.
        if (!includeTextBox && TEXTBOX.equals(localName)) {
            inTextBox = true;
            return;
        }
        //these are sorted descending by frequency within docx files
        //in our regression corpus.
        //yes, I know, likely premature optimization...
        if (RPR.equals(localName)) {
            inRPr = true;
        } else if (R.equals(localName)) {
            inR = true;
        } else if (T.equals(localName)) {
            inT = true;
        } else if (TAB.equals(localName)) {
            runBuffer.append(TAB_CHAR);
        } else if (P.equals(localName)) {
            lastStartElementWasP = true;
        } else if (B.equals(localName)) { //TODO: add bCs
            if (inR && inRPr) {
                currRunProperties.setBold(true);
            }
        } else if (TC.equals(localName)) {
            bodyContentsHandler.startTableCell();
        } else if (P_STYLE.equals(localName)) {
            String styleId = atts.getValue(W_NS, "val");
            currPProperties.setStyleID(styleId);
        } else if (I.equals(localName)) { //TODO: add iCs
            //rprs don't have to be inR; ignore those that aren't
            if (inR && inRPr) {
                currRunProperties.setItalics(true);
            }
        } else if (STRIKE.equals(localName)) {
            if (inR && inRPr) {
                currRunProperties.setStrike(true);
            }
        } else if (U.equals(localName)) {
            if (inR && inRPr) {
                currRunProperties.setUnderline(getStringVal(atts));
            }
        } else if (TR.equals(localName)) {
            bodyContentsHandler.startTableRow();
        } else if (NUM_PR.equals(localName)) {
            inNumPr = true;
        } else if (ILVL.equals(localName)) {
            // list level / numbering id are only honoured inside numPr
            if (inNumPr) {
                currPProperties.setIlvl(getIntVal(atts));
            }
        } else if (NUM_ID.equals(localName)) {
            if (inNumPr) {
                currPProperties.setNumId(getIntVal(atts));
            }
        } else if (BR.equals(localName)) {
            runBuffer.append(NEWLINE);
        } else if (BOOKMARK_START.equals(localName)) {
            String name = atts.getValue(W_NS, "name");
            String id = atts.getValue(W_NS, "id");
            bodyContentsHandler.startBookmark(id, name);
        } else if (BOOKMARK_END.equals(localName)) {
            String id = atts.getValue(W_NS, "id");
            bodyContentsHandler.endBookmark(id);
        } else if (HYPERLINK.equals(localName)) { //docx hyperlink
            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
            if (hyperlinkId != null) {
                // external link: resolve the relationship id to its target
                // (the lookup result is passed on as-is, even if null)
                bodyContentsHandler.hyperlinkStart(linkedRelationships.get(hyperlinkId));
            } else {
                // no relationship id: fall back to the anchor attribute, prefixed
                // with '#'; a missing anchor is passed on as null
                String anchor = atts.getValue(W_NS, "anchor");
                if (anchor != null) {
                    anchor = "#" + anchor;
                }
                bodyContentsHandler.hyperlinkStart(anchor);
            }
        } else if (HLINK_CLICK.equals(localName)) { //pptx hyperlink
            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
            // unlike the docx case, a hyperlink is only started (and flagged via
            // inHlinkClick) when a relationship id is present
            if (hyperlinkId != null) {
                bodyContentsHandler.hyperlinkStart(linkedRelationships.get(hyperlinkId));
                inHlinkClick = true;
            }
        } else if (TBL.equals(localName)) {
            bodyContentsHandler.startTable();
        } else if (BLIP.equals(localName)) { //check for DRAWING_NS
            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
        } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
            picDescription = atts.getValue("", "descr");
        } else if (PIC.equals(localName)) {
            inPic = true; //check for PIC_NS?
        } //TODO: add sdt, sdtPr, sdtContent goes here statistically
        else if (FOOTNOTE_REFERENCE.equals(localName)) {
            String id = atts.getValue(W_NS, "id");
            bodyContentsHandler.footnoteReference(id);
        } else if (IMAGEDATA.equals(localName)) {
            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
            picDescription = atts.getValue(O_NS, "title");
        } else if (INS.equals(localName)) {
            // enum constants are referenced through the type, not through the
            // editType instance, matching the MOVE_TO branch below
            startEditedSection(EditType.INSERT, atts);
        } else if (DEL_TEXT.equals(localName)) {
            inDelText = true;
        } else if (DEL.equals(localName)) {
            startEditedSection(EditType.DELETE, atts);
        } else if (MOVE_TO.equals(localName)) {
            startEditedSection(EditType.MOVE_TO, atts);
        } else if (MOVE_FROM.equals(localName)) {
            startEditedSection(EditType.MOVE_FROM, atts);
        } else if (OLE_OBJECT.equals(localName)) { //check for O_NS?
            String type = null;
            String refId = null;
            //TODO: clean this up and ...want to get ProgID?
            for (int i = 0; i < atts.getLength(); i++) {
                String attLocalName = atts.getLocalName(i);
                String attValue = atts.getValue(i);
                // "Type" is matched in any namespace; "id" only in the
                // relationships namespace
                if ("Type".equals(attLocalName)) {
                    type = attValue;
                } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) &&
                        "id".equals(attLocalName)) {
                    refId = attValue;
                }
            }
            // only objects whose Type attribute is "Embed" are reported
            if ("Embed".equals(type)) {
                bodyContentsHandler.embeddedOLERef(refId);
            }
        } else if (CR.equals(localName)) {
            runBuffer.append(NEWLINE);
        } else if (ENDNOTE_REFERENCE.equals(localName)) {
            String id = atts.getValue(W_NS, "id");
            bodyContentsHandler.endnoteReference(id);
        } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
            inV = true;
        } else if (RT.equals(localName)) {
            inRt = true;
        }

    }