in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java [182:338]
public void startElement(String uri, String localName, String qName, Attributes atts)
        throws SAXException {
    // SAX start-tag callback. Depending on the element's local name it either
    // flips one of the handler's state flags, appends a control character to
    // runBuffer, records paragraph/run properties, or forwards a structural
    // event (table, bookmark, hyperlink, note reference, edit section, OLE ref)
    // to bodyContentsHandler.
    //
    //   uri       - namespace URI of the element; may be null
    //   localName - local name used for all dispatching below
    //   qName     - qualified name; not used by this method
    //   atts      - attributes of the element
    //
    //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd

    // The paragraph start is reported one element late: only when the element
    // that follows a P is something other than PPR.
    // NOTE(review): presumably this lets the PPR children fill currPProperties
    // first; the matching call for the PPR case is not in this method -- confirm.
    if (lastStartElementWasP && !PPR.equals(localName)) {
        bodyContentsHandler.startParagraph(currPProperties);
    }
    lastStartElementWasP = false;

    // Track nesting depth of CHOICE / FALLBACK elements in the MC_NS namespace.
    if (uri != null && uri.equals(MC_NS)) {
        if (CHOICE.equals(localName)) {
            inACChoiceDepth++;
        } else if (FALLBACK.equals(localName)) {
            inACFallbackDepth++;
        }
    }

    // Nothing inside a CHOICE (including the CHOICE element itself) is processed.
    if (inACChoiceDepth > 0) {
        return;
    }

    // Constant-first comparison: consistent with the rest of the method and
    // safe if the parser hands us a null local name.
    if (!includeTextBox && TEXTBOX.equals(localName)) {
        inTextBox = true;
        return;
    }

    //these are sorted descending by frequency within docx files
    //in our regression corpus.
    //yes, I know, likely premature optimization...
    if (RPR.equals(localName)) {
        inRPr = true;
    } else if (R.equals(localName)) {
        inR = true;
    } else if (T.equals(localName)) {
        inT = true;
    } else if (TAB.equals(localName)) {
        runBuffer.append(TAB_CHAR);
    } else if (P.equals(localName)) {
        // startParagraph is issued on the next startElement call (see top).
        lastStartElementWasP = true;
    } else if (B.equals(localName)) { //TODO: add bCs
        //rprs don't have to be inR; ignore those that aren't
        if (inR && inRPr) {
            currRunProperties.setBold(true);
        }
    } else if (TC.equals(localName)) {
        bodyContentsHandler.startTableCell();
    } else if (P_STYLE.equals(localName)) {
        String styleId = atts.getValue(W_NS, "val");
        currPProperties.setStyleID(styleId);
    } else if (I.equals(localName)) { //TODO: add iCs
        //rprs don't have to be inR; ignore those that aren't
        if (inR && inRPr) {
            currRunProperties.setItalics(true);
        }
    } else if (STRIKE.equals(localName)) {
        if (inR && inRPr) {
            currRunProperties.setStrike(true);
        }
    } else if (U.equals(localName)) {
        if (inR && inRPr) {
            currRunProperties.setUnderline(getStringVal(atts));
        }
    } else if (TR.equals(localName)) {
        bodyContentsHandler.startTableRow();
    } else if (NUM_PR.equals(localName)) {
        inNumPr = true;
    } else if (ILVL.equals(localName)) {
        // list level is only recorded while inside a NUM_PR element
        if (inNumPr) {
            currPProperties.setIlvl(getIntVal(atts));
        }
    } else if (NUM_ID.equals(localName)) {
        if (inNumPr) {
            currPProperties.setNumId(getIntVal(atts));
        }
    } else if (BR.equals(localName)) {
        runBuffer.append(NEWLINE);
    } else if (BOOKMARK_START.equals(localName)) {
        String name = atts.getValue(W_NS, "name");
        String id = atts.getValue(W_NS, "id");
        bodyContentsHandler.startBookmark(id, name);
    } else if (BOOKMARK_END.equals(localName)) {
        String id = atts.getValue(W_NS, "id");
        bodyContentsHandler.endBookmark(id);
    } else if (HYPERLINK.equals(localName)) { //docx hyperlink
        String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
        if (hyperlinkId != null) {
            // relationship id present: look the target up in linkedRelationships
            // (the lookup result is passed on as-is, even if it is null)
            String hyperlink = linkedRelationships.get(hyperlinkId);
            bodyContentsHandler.hyperlinkStart(hyperlink);
        } else {
            // no relationship id: fall back to the "anchor" attribute,
            // prefixed with '#'; a missing anchor is passed on as null
            String anchor = atts.getValue(W_NS, "anchor");
            if (anchor != null) {
                anchor = "#" + anchor;
            }
            bodyContentsHandler.hyperlinkStart(anchor);
        }
    } else if (HLINK_CLICK.equals(localName)) { //pptx hyperlink
        String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
        // unlike the docx case, nothing is emitted without a relationship id
        if (hyperlinkId != null) {
            String hyperlink = linkedRelationships.get(hyperlinkId);
            bodyContentsHandler.hyperlinkStart(hyperlink);
            inHlinkClick = true;
        }
    } else if (TBL.equals(localName)) {
        bodyContentsHandler.startTable();
    } else if (BLIP.equals(localName)) { //check for DRAWING_NS
        picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
    } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
        picDescription = atts.getValue("", "descr");
    } else if (PIC.equals(localName)) {
        inPic = true; //check for PIC_NS?
    } //TODO: add sdt, sdtPr, sdtContent goes here statistically
    else if (FOOTNOTE_REFERENCE.equals(localName)) {
        String id = atts.getValue(W_NS, "id");
        bodyContentsHandler.footnoteReference(id);
    } else if (IMAGEDATA.equals(localName)) {
        picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
        picDescription = atts.getValue(O_NS, "title");
    } else if (INS.equals(localName)) {
        // enum constants are referenced through the EditType type rather than
        // through an instance field
        startEditedSection(EditType.INSERT, atts);
    } else if (DEL_TEXT.equals(localName)) {
        inDelText = true;
    } else if (DEL.equals(localName)) {
        startEditedSection(EditType.DELETE, atts);
    } else if (MOVE_TO.equals(localName)) {
        startEditedSection(EditType.MOVE_TO, atts);
    } else if (MOVE_FROM.equals(localName)) {
        startEditedSection(EditType.MOVE_FROM, atts);
    } else if (OLE_OBJECT.equals(localName)) { //check for O_NS?
        String type = null;
        String refId = null;
        //TODO: clean this up and ...want to get ProgID?
        // Scan all attributes: "Type" in any namespace, and "id" in the
        // office-document relationship namespace.
        for (int i = 0; i < atts.getLength(); i++) {
            String attLocalName = atts.getLocalName(i);
            String attValue = atts.getValue(i);
            if ("Type".equals(attLocalName)) {
                type = attValue;
            } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) &&
                    "id".equals(attLocalName)) {
                refId = attValue;
            }
        }
        // only objects whose Type is exactly "Embed" are reported
        if ("Embed".equals(type)) {
            bodyContentsHandler.embeddedOLERef(refId);
        }
    } else if (CR.equals(localName)) {
        runBuffer.append(NEWLINE);
    } else if (ENDNOTE_REFERENCE.equals(localName)) {
        String id = atts.getValue(W_NS, "id");
        bodyContentsHandler.endnoteReference(id);
    } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
        inV = true;
    } else if (RT.equals(localName)) {
        inRt = true;
    }
}