in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java [69:171]
protected void extract(int c, WPInputStream in, StringBuilder out, XHTMLContentHandler xhtml)
        throws IOException, SAXException {
    // Handles one WordPerfect 6 document-area code.
    //   c     - the code to interpret
    //   in    - stream from which the remaining bytes of a multi-byte
    //           function are read or skipped
    //   out   - buffer that receives the extracted text
    //   xhtml - handler passed on to endParagraph when a code ends a paragraph
    // Codes not matched by any case below are silently ignored.

    // Undo handling must come first: unless deleted content was requested,
    // everything seen while inside an undo section is dropped, except the
    // undo marker itself (241), which is needed to detect the section's end.
    if (!includeDeletedContent && inUndo && c != 241) {
        return;
    }

    // 241 is the fixed-length multi-byte marker for undo/insert. Its second
    // byte tells which kind of undo this is; the meaning of the third byte
    // is not understood.
    if (c == 241) {
        byte undoType = in.readWPByte();
        if (undoType == START_UNDO) {
            inUndo = true;
        } else if (undoType == END_UNDO) {
            inUndo = false;
        }
        // Two bytes of the function (marker and type) are already consumed.
        in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 2);
        return;
    }

    // 1-32: looked up in the default extended international character table.
    if (c > 0 && c <= 32) {
        out.append(WP6Charsets.DEFAULT_EXTENDED_INTL_CHARS[c]);
        return;
    }

    // 33-126: emitted as the character with the same code.
    if (c >= 33 && c <= 126) {
        out.append((char) c);
        return;
    }

    // Codes that are matched by exact value. 198 is handled here so that it
    // takes precedence over the 180-207 range tested further down.
    switch (c) {
        case 128: // Soft space
            out.append(' ');
            return;
        case 129: // Hard space
            out.append('\u00A0');
            return;
        case 132: // Hard hyphen
            out.append('-');
            return;
        case 135:
        case 137: // Dormant Hard return
            endParagraph(out, xhtml);
            return;
        case 138:
            // Skip to the closing code of the pair surrounding a page number.
            skipUntilChar(in, 139);
            return;
        case 198: // End of cell
            out.append('\t');
            return;
        case 240: {
            // Extended character: value, character set, then a closing byte.
            int extendedValue = in.readWP();
            int extendedSet = in.readWP();
            in.readWP(); // closing character
            WP6Charsets.append(out, extendedSet, extendedValue);
            return;
        }
        case 255:
            // Should not be used so this should never be reached; kept in
            // case a future version of the format uses it.
            skipUntilChar(in, c);
            return;
        default:
            break;
    }

    // 180-207 (other than 198, handled above): end of paragraph.
    if (c >= 180 && c <= 207) {
        endParagraph(out, xhtml);
        return;
    }

    // 208-239: variable-length multi-byte function.
    if (c >= 208 && c <= 239) {
        int group = in.readWP();
        int declaredSize = in.readWPShort();
        // Consume the rest of the function before interpreting it.
        // NOTE(review): presumably the declared size counts the four bytes
        // already consumed (code, subgroup, two size bytes) - confirm.
        for (int remaining = declaredSize - 4; remaining > 0; remaining--) {
            in.readWP();
        }

        if (c == 208) {
            // End-of-Line group: the subgroup selects tab, space or paragraph end.
            if (group == 10) {
                // end of cell
                out.append('\t');
            } else if ((group >= 1 && group <= 3) || (group >= 20 && group <= 22)) {
                out.append(' ');
            } else if ((group >= 4 && group <= 19) || (group >= 23 && group <= 28)) {
                endParagraph(out, xhtml);
            }
        } else if (c == 213) {
            out.append(' ');
        } else if (c == 224) {
            out.append('\t');
        }
        //TODO Are there functions containing data? Like footnotes?
        return;
    }

    // 242-254 (241 was handled above): fixed-length multi-byte function.
    if (c >= 241 && c <= 254) {
        // One byte of the function (the code itself) is already consumed.
        in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 1);
    }

    // Ignored codes above 127:
    // 130,131,133: soft hyphens
    // 134: invisible return in line
    // 136: soft end of center/align
    // 140: style separator mark
    // 141,142: start/end of text to skip
    // 143: exited hyphenation
    // 144: cancel hyphenation
    // 145-151: match functions
    // 152-179: unknown/ignored
    // 255: reserved, cannot be used
}