private void processControlWord()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java [850:1029]


    /**
     * Handles a control word that carries a numeric parameter.
     * <p>
     * The control word itself is not an argument: it is tested with this class's
     * {@code equals(String)} helper (presumably against the most recently parsed
     * control word -- confirm against the caller). Handling happens in two stages:
     * <ol>
     *   <li>Depending on {@code inHeader}: either header words (code page, default
     *       font, page/word/character counts, date fields, font table entries, list
     *       table entries) or document words ({@code b0}/{@code i0}, font change,
     *       list reference, bitmap flag).</li>
     *   <li>Regardless of {@code inHeader}: {@code u}, {@code uc} and {@code bin}.</li>
     * </ol>
     *
     * @param param the numeric parameter that followed the control word
     * @param in    the input stream; read only for {@code bin}, where {@code param}
     *              bytes are either handed to {@code embObjHandler} or skipped
     * @throws IOException   if skipping a {@code bin} payload fails, or if a called
     *                       helper throws it
     * @throws SAXException  propagated from the helpers this method calls
     * @throws TikaException propagated from the helpers this method calls
     */
    private void processControlWord(int param, PushbackInputStream in)
            throws IOException, SAXException, TikaException {
        // TODO: afN?  (associated font number)

        // TODO: do these alter text output...?  For each, arg N is a font table entry:
        //   stshfdbch - default style sheet font for East Asian chars
        //   stshfloch - default style sheet font for ASCII chars
        //   stshfhich - default style sheet font for High Ansi chars
        //   stshfbi   - default style sheet font for Complex Scripts (BIDI) chars

        // TODO: inefficient that we check equals N times;
        // we'd get better perf w/ real lexer (eg
        // JFlex), which uses single-pass FSM to do cmp:
        if (inHeader) {
            if (equals("ansicpg")) {
                // ANSI codepage; unknown code pages leave the current charset alone
                Charset cs = ANSICPG_MAP.get(param);
                if (cs != null) {
                    globalCharset = cs;
                }
            } else if (equals("deff")) {
                // Default font
                globalDefaultFont = param;
            } else if (equals("nofpages")) {
                metadata.add(Office.PAGE_COUNT, Integer.toString(param));
            } else if (equals("nofwords")) {
                metadata.add(Office.WORD_COUNT, Integer.toString(param));
            } else if (equals("nofchars")) {
                metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
            } else if (equals("yr")) {
                year = param;
            } else if (equals("mo")) {
                month = param;
            } else if (equals("dy")) {
                day = param;
            } else if (equals("hr")) {
                hour = param;
            } else if (equals("min")) {
                minute = param;
            }

            if (fontTableState == 1) {
                // Still inside font table -- record the
                // mappings of fN to the fcharset:
                if (groupState.depth < fontTableDepth) {
                    // Group depth dropped below the font table's depth: table is done
                    fontTableState = 2;
                } else {
                    if (equals("f")) {
                        // Start new font definition
                        curFontID = param;
                    } else if (equals("fcharset")) {
                        // Only charsets we can map are recorded for the current font
                        Charset cs = FCHARSET_MAP.get(param);
                        if (cs != null) {
                            fontToCharset.put(curFontID, cs);
                        }
                    }
                }
            }
            //if you've already seen the font table,
            //you aren't in another header item (e.g. styles)
            //and you see an fX, you're out of the header
            if (fontTableState == 2 && !groupState.ignore && equals("f")) {
                inHeader = false;
            }

            if (currentList != null) {
                if (equals("listid")) {
                    // The list is registered in the table as soon as its id is known
                    currentList.id = param;
                    currentListTable.put(currentList.id, currentList);
                } else if (equals("listtemplateid")) {
                    currentList.templateID = param;
                } else if (equals("levelnfc") || equals("levelnfcn")) {
                    //check to make sure list information isn't corrupt
                    if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) {
                        currentList.numberType[listTableLevel] = param;
                    }
                }
            }
        } else {
            // In document
            if (equals("b")) {
                // Only b0 (bold off) is handled here. The parameter comes from the
                // document, so it is checked explicitly instead of asserted: per the
                // RTF spec a non-zero parameter means "on" and must not close <b>.
                // NOTE(review): switching bold on for bN (N != 0) is not implemented
                // here; such a word currently leaves the style state unchanged.
                //only modify styles if we're not in a hyperlink
                if (param == 0 && fieldState == 0 && groupState.bold) {
                    pushText();
                    // Keep nesting as <b><i>: close an open <i> before </b>
                    // and reopen it afterwards.
                    if (groupState.italic) {
                        end("i");
                    }
                    end("b");
                    if (groupState.italic) {
                        start("i");
                    }
                    groupState.bold = false;
                }
            } else if (equals("i")) {
                // Only i0 (italic off) is handled here; see the note on "b" above
                // for why the parameter is checked rather than asserted.
                //only modify styles if we're not in a hyperlink
                if (param == 0 && fieldState == 0 && groupState.italic) {
                    pushText();
                    end("i");
                    groupState.italic = false;
                }
            } else if (equals("f")) {
                // Change current font
                Charset fontCharset = fontToCharset.get(param);

                // Push any buffered text before changing
                // font:
                pushText();

                // A null result means DOC ERROR: the font change referenced a
                // non-table'd font number; the group's font charset is then cleared.
                // TODO: log a warning?  Throw an exc?
                groupState.fontCharset = fontCharset;
            } else if (equals("ls")) {
                groupState.list = param;
            } else if (equals("lslvl")) {
                groupState.listLevel = param;
            } else if (equals("wbitmap")) {
                embObjHandler.setPictBitmap(true);
            }
        }

        // Process unicode escape. This can appear in doc
        // or in header, since the metadata (info) fields
        // in the header can be unicode escaped as well:
        if (equals("u")) {
            // Unicode escape: emitted unless the group is ignored, except that
            // groups flagged sv or sn still receive the character.
            if (!groupState.ignore || groupState.sv || groupState.sn) {
                // Keep only the low 16 bits as a UTF-16 code unit
                final char utf16CodeUnit = (char) (param & 0xffff);
                addOutputChar(utf16CodeUnit);
            }

            // After seeing a unicode escape we must
            // skip the next ucSkip ansi chars (the
            // "unicode shadow")
            ansiSkip = groupState.ucSkip;
        } else if (equals("uc")) {
            // Change unicode shadow length
            // NOTE(review): a negative value is stored unchanged -- confirm how the
            // consumers of ucSkip/ansiSkip treat it.
            groupState.ucSkip = param;
        } else if (equals("bin")) {
            if (param >= 0) {
                if (groupState.pictDepth == 1) {
                    // Binary payload of a picture: hand it to the embedded object
                    // handler; a failure is recorded in metadata and the handler
                    // is reset rather than aborting the parse.
                    try {
                        embObjHandler.writeBytes(in, param);
                    } catch (IOException | TikaException e) {
                        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                        embObjHandler.reset();
                    }
                } else {
                    // Not picture data we extract: skip over the payload
                    IOUtils.skipFully(in, param);
                }
            } else {
                // Negative length: nothing is read or skipped.
                // log some warning?
            }
        }
    }