protected void extract()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java [69:171]


    /**
     * Handles one document-area code and consumes any further bytes that belong to it.
     * <p>
     * Text is appended to {@code out}; paragraph breaks are delegated to
     * {@code endParagraph(out, xhtml)}. Code 241 (undo/insert marker) toggles the
     * {@code inUndo} state. While {@code inUndo} is set and {@code includeDeletedContent}
     * is {@code false}, nothing is emitted, but multi-byte constructs are still consumed
     * as a unit so that their payload bytes are never handed back to this method and
     * misread as codes (for example a payload byte of 241 being mistaken for an undo
     * marker).
     *
     * @param c     the code that has already been read from {@code in}
     * @param in    stream positioned immediately after {@code c}
     * @param out   buffer receiving extracted text
     * @param xhtml handler passed on to {@code endParagraph}
     * @throws IOException  if reading from or skipping in {@code in} fails
     * @throws SAXException if ending a paragraph fails
     */
    protected void extract(int c, WPInputStream in, StringBuilder out, XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        //special handling for undo must come first

        //241 is the fixed length multi-byte marker for
        //undo/insert.  The second byte determines
        //what type of undo this is.  I don't understand
        //what the third byte signifies.
        if (c == 241) {
            byte b = in.readWPByte();
            if (b == START_UNDO) {
                inUndo = true;
            } else if (b == END_UNDO) {
                inUndo = false;
            }
            // removing 2 from function length since first two chars already read
            in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 2);
            return;
        }

        if (!includeDeletedContent && inUndo) {
            // Deleted content: emit nothing, but keep the stream aligned on code boundaries.
            skipSuppressedCode(c, in);
            return;
        }

        if (c > 0 && c <= 32) {
            out.append(WP6Charsets.DEFAULT_EXTENDED_INTL_CHARS[c]);
        } else if (c >= 33 && c <= 126) {
            out.append((char) c);
        } else if (c == 128) {
            out.append(' ');      // Soft space
        } else if (c == 129) {
            out.append('\u00A0'); // Hard space
        } else if (c == 132) {
            out.append('-');      // Hard hyphen
        } else if (c == 135 || c == 137) {
            endParagraph(out, xhtml); // Dormant Hard return
        } else if (c == 138) {
            // skip to closing pair surrounding page number
            skipUntilChar(in, 139);
        } else if (c == 198) {
            // end of cell; must be tested before the 180-207 range below
            out.append('\t');
        } else if (c >= 180 && c <= 207) {
            endParagraph(out, xhtml);
        } else if (c >= 208 && c <= 239) {
            // 208-239: variable-length multi-byte function
            int subgroup = readSubgroupAndSkipBody(in);

            // End-of-Line group
            if (c == 208) {
                if (subgroup >= 1 && subgroup <= 3) {
                    out.append(' ');
                } else if (subgroup == 10) {
                    // end of cell; must be tested before the 4-19 range below
                    out.append('\t');
                } else if (subgroup >= 4 && subgroup <= 19) {
                    endParagraph(out, xhtml);
                } else if (subgroup >= 20 && subgroup <= 22) {
                    out.append(' ');
                } else if (subgroup >= 23 && subgroup <= 28) {
                    endParagraph(out, xhtml);
                }
            } else if (c == 213) {
                out.append(' ');
            } else if (c == 224) {
                out.append('\t');
            }
            //TODO Are there functions containing data? Like footnotes?

        } else if (c == 240) {
            // extended char
            int charval = in.readWP();
            int charset = in.readWP();
            in.readWP(); // closing character
            WP6Charsets.append(out, charset, charval);
        } else if (c >= 242 && c <= 254) {
            // 242-254: fixed-length multi-byte function (241 is handled above)
            // removing 1 from function length since first char already read
            in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 1);
        } else if (c == 255) {
            // Should not be used so this line should not be called.
            // We still have this code in case a future version uses it.
            skipUntilChar(in, c);
        }

        // Codes that fall through every branch above are ignored.
        // Ignored codes above 127:

        // 130,131,133: soft hyphens
        // 134: invisible return in line
        // 136: soft end of center/align
        // 140: style separator mark
        // 141,142: start/end of text to skip
        // 143: exited hyphenation
        // 144: cancel hyphenation
        // 145-151: match functions
        // 152-179: unknown/ignored
    }

    /**
     * Consumes the bytes belonging to a multi-byte code without emitting anything.
     * Used while deleted (undo) content is being suppressed. All other codes,
     * including 138 and 255, consume nothing here and are simply dropped.
     *
     * @param c  the code that has already been read from {@code in}; never 241
     * @param in stream positioned immediately after {@code c}
     * @throws IOException if reading from or skipping in {@code in} fails
     */
    private void skipSuppressedCode(int c, WPInputStream in) throws IOException {
        if (c >= 208 && c <= 239) {
            // variable-length function: header plus body
            readSubgroupAndSkipBody(in);
        } else if (c == 240) {
            // extended char: character value, character set, closing character
            in.readWP();
            in.readWP();
            in.readWP();
        } else if (c >= 242 && c <= 254) {
            // fixed-length function; first char already read
            in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 1);
        }
    }

    /**
     * Reads the subgroup and size of a variable-length function (codes 208-239) and
     * consumes the rest of the function.
     *
     * @param in stream positioned immediately after the function's leading code
     * @return the subgroup value read from the stream
     * @throws IOException if reading from {@code in} fails
     */
    private int readSubgroupAndSkipBody(WPInputStream in) throws IOException {
        int subgroup = in.readWP();
        int functionSize = in.readWPShort();
        // The size counts the leading code, the subgroup and the two size bytes,
        // all of which have been read by now.
        for (int i = 0; i < functionSize - 4; i++) {
            in.readWP();
        }
        return subgroup;
    }