in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java [850:1029]
/**
 * Handles a control word that was followed by a numeric parameter.
 *
 * <p>The control word name is matched with {@code equals(String)}. While
 * {@code inHeader} is set, the header words are handled: code page, default
 * font, page/word/character counts, date parts, font table entries and list
 * table entries. Otherwise the in-document words are handled: bold/italic
 * off, font change, list override and level, and bitmap pictures. The words
 * {@code u}, {@code uc} and {@code bin} are handled in both states.
 *
 * <p>{@code b} and {@code i} are RTF toggles: only a parameter of {@code 0}
 * switches the style off. A non-zero parameter is left alone by this method.
 *
 * @param param numeric parameter that followed the control word
 * @param in    stream being parsed; only used for {@code bin}, where
 *              {@code param} bytes are either handed to the embedded object
 *              handler or skipped
 * @throws IOException   if skipping a {@code bin} payload fails, or if one of
 *                       the output helpers called here raises it
 * @throws SAXException  may be propagated by the output helpers called here
 * @throws TikaException may be propagated by the output helpers called here
 */
private void processControlWord(int param, PushbackInputStream in)
        throws IOException, SAXException, TikaException {
    // TODO: afN? (associated font number)
    // TODO: do these alter text output...? Each takes a font table entry as N:
    //   stshfdbch - font to be used by default in style sheet for East Asian chars
    //   stshfloch - font to be used by default in style sheet for ASCII chars
    //   stshfhich - font to be used by default in style sheet for High Ansi chars
    //   stshfbi   - style sheet for Complex Scripts (BIDI) chars

    // TODO: inefficient that we check equals N times;
    // we'd get better perf w/ real lexer (eg
    // JFlex), which uses single-pass FSM to do cmp:
    if (inHeader) {
        if (equals("ansicpg")) {
            // ANSI codepage; unknown code pages leave the current charset alone
            Charset cs = ANSICPG_MAP.get(param);
            if (cs != null) {
                globalCharset = cs;
            }
        } else if (equals("deff")) {
            // Default font
            globalDefaultFont = param;
        } else if (equals("nofpages")) {
            metadata.add(Office.PAGE_COUNT, Integer.toString(param));
        } else if (equals("nofwords")) {
            metadata.add(Office.WORD_COUNT, Integer.toString(param));
        } else if (equals("nofchars")) {
            metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
        } else if (equals("yr")) {
            year = param;
        } else if (equals("mo")) {
            month = param;
        } else if (equals("dy")) {
            day = param;
        } else if (equals("hr")) {
            hour = param;
        } else if (equals("min")) {
            minute = param;
        }

        if (fontTableState == 1) {
            // Still inside font table -- record the
            // mappings of fN to the fcharset:
            if (groupState.depth < fontTableDepth) {
                // Group depth dropped below the font table's: the table is finished
                fontTableState = 2;
            } else if (equals("f")) {
                // Start new font definition
                curFontID = param;
            } else if (equals("fcharset")) {
                Charset cs = FCHARSET_MAP.get(param);
                if (cs != null) {
                    fontToCharset.put(curFontID, cs);
                }
            }
        }

        // If you've already seen the font table,
        // you aren't in another header item (e.g. styles)
        // and you see an fX, you're out of the header
        if (fontTableState == 2 && !groupState.ignore && equals("f")) {
            inHeader = false;
        }

        if (currentList != null) {
            if (equals("listid")) {
                currentList.id = param;
                currentListTable.put(currentList.id, currentList);
            } else if (equals("listtemplateid")) {
                currentList.templateID = param;
            } else if (equals("levelnfc") || equals("levelnfcn")) {
                // Check to make sure list information isn't corrupt
                if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) {
                    currentList.numberType[listTableLevel] = param;
                }
            }
        }
    } else {
        // In document
        if (equals("b")) {
            // RTF toggle: only b0 switches bold off; a non-zero N means "on"
            // and must not close the element.
            // NOTE(review): a non-zero N does not switch bold on here --
            // confirm whether that should be handled as well.
            // Styles are only modified if we're not in a hyperlink.
            if (param == 0 && fieldState == 0 && groupState.bold) {
                pushText();
                // Close and reopen "i" around the closing "b" so the
                // elements stay properly nested
                if (groupState.italic) {
                    end("i");
                }
                end("b");
                if (groupState.italic) {
                    start("i");
                }
                groupState.bold = false;
            }
        } else if (equals("i")) {
            // RTF toggle: only i0 switches italic off (see "b" above).
            // Styles are only modified if we're not in a hyperlink.
            if (param == 0 && fieldState == 0 && groupState.italic) {
                pushText();
                end("i");
                groupState.italic = false;
            }
        } else if (equals("f")) {
            // Change current font
            Charset fontCharset = fontToCharset.get(param);
            // Push any buffered text before changing
            // font:
            pushText();
            // DOC ERROR when null: font change referenced a
            // non-table'd font number, so the group's font charset is cleared
            // TODO: log a warning? Throw an exc?
            groupState.fontCharset = fontCharset;
        } else if (equals("ls")) {
            groupState.list = param;
        } else if (equals("lslvl")) {
            groupState.listLevel = param;
        } else if (equals("wbitmap")) {
            embObjHandler.setPictBitmap(true);
        }
    }

    // Process unicode escape. This can appear in doc
    // or in header, since the metadata (info) fields
    // in the header can be unicode escaped as well:
    if (equals("u")) {
        // Unicode escape; N may be negative, so keep only the low 16 bits
        if (!groupState.ignore || groupState.sv || groupState.sn) {
            final char utf16CodeUnit = (char) (param & 0xffff);
            addOutputChar(utf16CodeUnit);
        }
        // After seeing a unicode escape we must
        // skip the next ucSkip ansi chars (the
        // "unicode shadow")
        ansiSkip = groupState.ucSkip;
    } else if (equals("uc")) {
        // Change unicode shadow length
        // NOTE(review): a negative N is stored as-is -- confirm that the
        // code consuming ucSkip/ansiSkip tolerates it.
        groupState.ucSkip = param;
    } else if (equals("bin")) {
        if (param >= 0) {
            if (groupState.pictDepth == 1) {
                // Raw bytes belong to a picture: hand them to the embedded
                // object handler; a failure is recorded in the metadata and
                // the handler is reset instead of aborting the parse
                try {
                    embObjHandler.writeBytes(in, param);
                } catch (IOException | TikaException e) {
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                    embObjHandler.reset();
                }
            } else {
                IOUtils.skipFully(in, param);
            }
        } else {
            // Negative length: nothing is read or skipped.
            // log some warning?
        }
    }
}