in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java [220:321]
/**
 * Scans the stream byte by byte, collects runs of bytes accepted by
 * {@code isChar} into the {@code output} buffer and writes them to the
 * handler as text decoded with the windows-1252 charset.
 * <p>
 * The stream is consumed in chunks of up to {@code BUF_SIZE} bytes. A run is
 * kept, and terminated with a line feed, only when it holds at least
 * {@code minSize} bytes; shorter runs are dropped. The two-byte sequences
 * {@code 0xC3 0x80..0xBF} and {@code 0xC2 0xA0..0xBF} are collapsed into a
 * single output byte. Reading stops at end of stream or as soon as the
 * current thread is found to be interrupted.
 *
 * @param stream   the bytes to scan
 * @param handler  receives the XHTML events
 * @param metadata passed to the {@link XHTMLContentHandler}
 * @param context  not used by this method
 * @throws IOException  if reading from the stream fails
 * @throws SAXException if the handler rejects an event
 */
private void doParse(InputStream stream, ContentHandler handler, Metadata metadata,
        ParseContext context) throws IOException, SAXException {
    tmpPos = 0;
    outPos = 0;
    xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    int bytesRead = 0;
    do {
        // Fill the input buffer until it is full or the stream stops delivering.
        inSize = 0;
        for (;;) {
            bytesRead = stream.read(input, inSize, BUF_SIZE - inSize);
            if (bytesRead <= 0) {
                break;
            }
            inSize += bytesRead;
        }

        for (inPos = 0; inPos < inSize; ) {
            byte current = input[inPos++];
            boolean decodedUtf8 = false;

            // 0xC3 and 0xC2 may introduce a two-byte UTF-8 sequence.
            boolean leadC3 = current == (byte) 0xC3;
            boolean leadC2 = current == (byte) 0xC2;
            if (leadC3 || leadC2) {
                // Take the follow-up byte from the buffer, or straight from
                // the stream when the buffer is exhausted.
                byte next;
                if (inPos < inSize) {
                    next = input[inPos++];
                } else {
                    next = (byte) stream.read();
                }

                // Accepted follow-up range: 0x80..0xBF after 0xC3, 0xA0..0xBF after 0xC2.
                byte lowest = leadC3 ? (byte) 0x80 : (byte) 0xA0;
                if (next >= lowest && next <= (byte) 0xBF) {
                    decodedUtf8 = true;
                    // After 0xC3 the single-byte value is the follow-up byte
                    // shifted by 0x40; after 0xC2 it is the follow-up byte itself.
                    output[tmpPos++] = leadC3 ? (byte) (next + 0x40) : next;
                } else {
                    // Not a sequence: keep the lead byte as is and let the
                    // follow-up byte go through the regular test below.
                    output[tmpPos++] = current;
                    current = next;
                }
                if (tmpPos == BUF_SIZE) {
                    flushBuffer();
                }
            }

            if (decodedUtf8) {
                continue;
            }

            // Accepted byte: extend the current run.
            if (isChar(current)) {
                output[tmpPos++] = current;
                if (tmpPos == BUF_SIZE) {
                    flushBuffer();
                }
                continue;
            }

            // A rejected byte ends the run. A zero byte only does so when an
            // accepted byte sits two positions before or ahead in the buffer,
            // i.e. at a transition between ISO-8859-1 and UTF16 sequences.
            boolean endsRun = current != 0
                    || (inPos >= 3 && isChar(input[inPos - 3]))
                    || (inPos + 1 < inSize && isChar(input[inPos + 1]));
            if (!endsRun) {
                continue;
            }

            if (tmpPos - outPos < minSize) {
                // Run too short: rewind and discard it.
                tmpPos = outPos;
                continue;
            }

            // Run long enough: close it with a line feed and commit it.
            output[tmpPos++] = 0x0A;
            outPos = tmpPos;
            if (tmpPos == BUF_SIZE) {
                flushBuffer();
            }
        }
    } while (bytesRead != -1 && !Thread.currentThread().isInterrupted());

    // Commit a trailing run that was still open when reading stopped.
    if (tmpPos - outPos >= minSize) {
        output[tmpPos++] = 0x0A;
        outPos = tmpPos;
    }
    xhtml.characters(new String(output, 0, outPos, "windows-1252"));
    xhtml.endDocument();
}