in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java [1102:1378]
/**
 * Applies the effect of the current control word, i.e. the word that
 * {@code equals(String)} is matched against.
 *
 * <p>The work is done in two steps:
 * <ol>
 *   <li>While {@code inHeader} is set, header bookkeeping is performed
 *       (charset, table tracking, metadata target, end-of-header detection).
 *       Otherwise, and only when {@code fieldState == 0}, bold/italic
 *       switches are handled.</li>
 *   <li>The word is then dispatched to the action it triggers: paragraph and
 *       style resets, un-ignoring of selected groups, embedded object
 *       bookkeeping, field/hyperlink state changes, or emission of a literal
 *       character.</li>
 * </ol>
 *
 * <p>The dispatch relies on {@code equals(String)} being free of side
 * effects, since every word matches at most one branch.
 *
 * @throws IOException   propagated from the helpers invoked here
 * @throws SAXException  propagated from the helpers invoked here
 * @throws TikaException propagated from the helpers invoked here
 */
private void processControlWord() throws IOException, SAXException, TikaException {
    if (inHeader) {
        handleHeaderControlWord();
    } else if (fieldState == 0) {
        //only modify styles if we're not in a hyperlink
        handleInlineStyleControlWord();
    }

    // Snapshot taken after the header step, which may have switched ignore on.
    final boolean ignored = groupState.ignore;

    if (equals("pard")) {
        // Reset styles
        pushText();
        endStyles(groupState);
        if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
            pendingListEnd();
        }
    } else if (equals("plain")) {
        if (groupState.bold || groupState.italic) {
            // Reset styles
            pushText();
            endStyles(groupState);
        }
    } else if (equals("par")) {
        if (!ignored) {
            endParagraph(true);
            if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
                pendingListEnd();
            }
        }
    } else if (equals("shptxt")) {
        // Text inside a shape: keep it
        pushText();
        groupState.ignore = false;
    } else if (equals("chatn") || equals("atnid") || equals("atnauthor")) {
        // Annotation-related words (among them annotation id and author):
        // emit a separating space, flush, and keep what follows
        addOutputChar(SPACE);
        pushText();
        groupState.ignore = false;
    } else if (equals("annotation")) {
        // Annotation body: flag the group and keep its text
        groupState.annotation = true;
        pushText();
        groupState.ignore = false;
    } else if (equals("listtext")) {
        groupState.ignore = true;
    } else if (equals("cell")) {
        // TODO: we should produce a table output here?
        //addOutputChar(' ');
        endParagraph(true);
    } else if (equals("sp")) {
        groupState.sp = true;
    } else if (equals("sn")) {
        embObjHandler.startSN();
        groupState.sn = true;
    } else if (equals("sv")) {
        embObjHandler.startSV();
        groupState.sv = true;
    } else if (equals("object")) {
        pushText();
        embObjHandler.setInObject(true);
        groupState.object = true;
    } else if (equals("objdata")) {
        groupState.objdata = true;
        embObjHandler.startObjData();
    } else if (equals("pict")) {
        pushText();
        // TODO: create img tag? but can that support
        // embedded image data?
        groupState.pictDepth = 1;
        embObjHandler.startPict();
    } else if (equals("upr")) {
        uprState = 0;
    } else if (equals("ud") && uprState == 1) {
        uprState = -1;
        // 2nd group inside the upr destination, which
        // contains the unicode encoding of the text, so
        // we want to keep that:
        groupState.ignore = false;
    } else if (equals("fldinst")) {
        fieldState = 1;
        groupState.ignore = false;
    } else if (equals("fldrslt") && fieldState == 2) {
        startPendingHyperlink();
    } else if (!ignored) {
        // Remaining candidates only ever emit a single character, and only
        // when the group was not ignored on entry to the dispatch.
        final int literal = literalCharForControlWord();
        if (literal >= 0) {
            addOutputChar((char) literal);
        }
    }
}

/**
 * Header-only bookkeeping for the current control word: global charset,
 * ignored/list tables, metadata target, font table tracking, list table
 * tracking, and detection of the first word that ends the header.
 */
private void handleHeaderControlWord() throws IOException, SAXException, TikaException {
    // Global character set declared by the document
    if (equals("ansi")) {
        globalCharset = WINDOWS_1252;
    } else if (equals("pca")) {
        globalCharset = CP850;
    } else if (equals("pc")) {
        globalCharset = CP437;
    } else if (equals("mac")) {
        globalCharset = MAC_ROMAN;
    }

    // Color/style/font tables are marked ignored; list tables are remembered
    if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
        groupState.ignore = true;
    } else if (equals("listtable")) {
        currentListTable = listTable;
    } else if (equals("listoverridetable")) {
        currentListTable = listOverrideTable;
    }

    // Metadata targets are only selected while uprState is -1
    if (uprState == -1) {
        // TODO: we can also parse \creatim, \revtim,
        // \printim, \version, etc.
        if (equals("author")) {
            nextMetaData = TikaCoreProperties.CREATOR;
        } else if (equals("title")) {
            nextMetaData = TikaCoreProperties.TITLE;
        } else if (equals("subject")) {
            nextMetaData = DublinCore.SUBJECT;
        } else if (equals("keywords")) {
            nextMetaData = Office.KEYWORDS;
        } else if (equals("category")) {
            nextMetaData = OfficeOpenXMLCore.CATEGORY;
        } else if (equals("comment")) {
            nextMetaData = TikaCoreProperties.COMMENTS;
        } else if (equals("company")) {
            nextMetaData = OfficeOpenXMLExtended.COMPANY;
        } else if (equals("manager")) {
            nextMetaData = OfficeOpenXMLExtended.MANAGER;
        } else if (equals("template")) {
            nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
        } else if (equals("creatim")) {
            nextMetaData = TikaCoreProperties.CREATED;
        }
    }

    // Font table tracking: 0 = not seen yet, 1 = inside, 2 = left
    if (fontTableState == 0 && equals("fonttbl")) {
        fontTableState = 1;
        fontTableDepth = groupState.depth;
    } else if (fontTableState == 1 && groupState.depth < fontTableDepth) {
        fontTableState = 2;
    }

    // List table handling
    if (currentListTable != null) {
        if (equals("list") || equals("listoverride")) {
            currentList = new ListDescriptor();
            listTableLevel = -1;
        } else if (currentList != null && equals("liststylename")) {
            currentList.isStyle = true;
        } else if (currentList != null && equals("listlevel")) {
            listTableLevel++;
        }
    }

    // Inside an ignored group nothing can end the header
    if (groupState.ignore) {
        return;
    }
    if (equals("par") || equals("pard") || equals("sect") || equals("sectd") ||
            equals("plain") || equals("ltrch") || equals("rtlch") ||
            equals("htmlrtf") || equals("line")) {
        inHeader = false;
    }
}

/**
 * Turns bold or italic on for the current group if the control word asks for
 * it and the style is not already active.
 */
private void handleInlineStyleControlWord() throws IOException, SAXException, TikaException {
    if (equals("b")) {
        if (groupState.bold) {
            return;
        }
        pushText();
        lazyStartParagraph();
        if (groupState.italic) {
            // Make sure nesting is always <b><i>
            end("i");
        }
        groupState.bold = true;
        startStyles(groupState);
    } else if (equals("i")) {
        if (groupState.italic) {
            return;
        }
        pushText();
        lazyStartParagraph();
        groupState.italic = true;
        start("i");
    }
}

/**
 * Opens the anchor element for the pending hyperlink and advances the field
 * state from 2 to 3.
 *
 * <p>NOTE(review): the {@code href} attribute is registered under the
 * {@code XHTML} namespace while the element itself is started with an empty
 * namespace URI -- confirm that this combination is intended.
 */
private void startPendingHyperlink() throws IOException, SAXException, TikaException {
    // Only checked when assertions are enabled
    assert pendingURL != null;
    lazyStartParagraph();
    final AttributesImpl linkAttributes = new AttributesImpl();
    linkAttributes.addAttribute(XHTML, "href", "href", "CDATA", pendingURL);
    out.startElement("", "a", "a", linkAttributes);
    pendingURL = null;
    fieldState = 3;
    groupState.ignore = false;
}

/**
 * Maps the current control word to the single character it stands for.
 *
 * @return the character as a non-negative int, or -1 when the control word
 *         does not stand for a literal character
 */
private int literalCharForControlWord() {
    if (equals("line") || equals("page") || equals("softline") || equals("softpage")) {
        return '\n';
    }
    if (equals("column") || equals("softcolumn")) {
        return ' ';
    }
    if (equals("tab")) {
        return '\t';
    }
    if (equals("bullet")) {
        // unicode BULLET
        return '\u2022';
    }
    if (equals("endash")) {
        // unicode EN DASH
        return '\u2013';
    }
    if (equals("emdash")) {
        // unicode EM DASH
        return '\u2014';
    }
    if (equals("enspace")) {
        // unicode EN SPACE
        return '\u2002';
    }
    if (equals("qmspace")) {
        // quarter em space -> unicode FOUR-PER-EM SPACE
        return '\u2005';
    }
    if (equals("emspace")) {
        // unicode EM SPACE
        return '\u2003';
    }
    if (equals("lquote")) {
        // unicode LEFT SINGLE QUOTATION MARK
        return '\u2018';
    }
    if (equals("rquote")) {
        // unicode RIGHT SINGLE QUOTATION MARK
        return '\u2019';
    }
    if (equals("ldblquote")) {
        // unicode LEFT DOUBLE QUOTATION MARK
        return '\u201C';
    }
    if (equals("rdblquote")) {
        // unicode RIGHT DOUBLE QUOTATION MARK
        return '\u201D';
    }
    return -1;
}