private void doParse()

in tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java [220:321]


    /**
     * Scans the stream for runs of printable single-byte characters and emits them,
     * one per line, as the character content of an XHTML document.
     *
     * <p>The stream is consumed in chunks of up to {@code BUF_SIZE} bytes. Each byte is
     * either appended to the pending run in {@code output} or, when it is not accepted
     * by {@code isChar}, treated as a run terminator. A terminated run is kept (and
     * followed by a line feed) only if it holds at least {@code minSize} bytes;
     * shorter runs are dropped. The two-byte sequences {@code C3 80..BF} and
     * {@code C2 A0..BF} are collapsed into the single byte carrying the same code
     * point before being appended. Collected bytes are finally decoded as
     * windows-1252.
     *
     * <p>Reading stops at end of stream, or after the current chunk if the current
     * thread has been interrupted; in the latter case the output is silently truncated.
     *
     * @param stream   source of the raw bytes; it is read but not closed here
     * @param handler  SAX handler wrapped by the {@code XHTMLContentHandler} that
     *                 receives the extracted text
     * @param metadata metadata passed to the {@code XHTMLContentHandler}
     * @param context  parse context; not used by this method
     * @throws IOException  if reading from {@code stream} fails
     * @throws SAXException if the wrapped handler rejects an event
     */
    private void doParse(InputStream stream, ContentHandler handler, Metadata metadata,
                         ParseContext context) throws IOException, SAXException {

        tmpPos = 0;
        outPos = 0;

        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        int i = 0;
        do {
            /*
             * Fill the input buffer. The inner loop ends with i == 0 when the
             * buffer is full (a zero-length read) and with i == -1 at end of
             * stream.
             */
            inSize = 0;
            while ((i = stream.read(input, inSize, BUF_SIZE - inSize)) > 0) {
                inSize += i;
            }
            inPos = 0;
            while (inPos < inSize) {
                byte c = input[inPos++];
                boolean utf8 = false;
                /*
                 * Test for a possible two-byte UTF8 encoded char whose code
                 * point fits in a single byte (lead byte 0xC2 or 0xC3).
                 */
                if (c == (byte) 0xC3 || c == (byte) 0xC2) {
                    /*
                     * Fetch the trail byte as an unsigned int. If the lead byte
                     * was the last one in the buffer, pull one more byte directly
                     * from the stream; that read yields -1 at end of stream, which
                     * must not be confused with the data byte 0xFF.
                     */
                    int next = inPos < inSize ? (input[inPos++] & 0xFF) : stream.read();
                    if (next == -1) {
                        /*
                         * The stream ended right after the lead byte: keep the
                         * lead byte as an ordinary char and stop reading.
                         */
                        i = -1;
                        output[tmpPos++] = c;
                        if (tmpPos == BUF_SIZE) {
                            flushBuffer();
                        }
                        // inPos == inSize here, so this leaves the scan loop.
                        continue;
                    }

                    boolean leadC3 = c == (byte) 0xC3;
                    /*
                     * Lowest accepted trail byte: 0x80 after 0xC3 (code points
                     * 0xC0..0xFF), 0xA0 after 0xC2 (code points 0xA0..0xBF).
                     */
                    int minTrail = leadC3 ? 0x80 : 0xA0;
                    if (next >= minTrail && next <= 0xBF) {
                        utf8 = true;
                        // C3 xx decodes to xx + 0x40; C2 xx decodes to xx itself.
                        output[tmpPos++] = (byte) (leadC3 ? next + 0x40 : next);
                    } else {
                        /*
                         * Not a valid sequence: emit the lead byte as is and let
                         * the second byte go through the normal test below.
                         */
                        output[tmpPos++] = c;
                        c = (byte) next;
                    }
                    if (tmpPos == BUF_SIZE) {
                        flushBuffer();
                    }
                }
                if (!utf8) {
                    /*
                     * Test if the byte is a valid char.
                     */
                    if (isChar(c)) {
                        output[tmpPos++] = c;
                        if (tmpPos == BUF_SIZE) {
                            flushBuffer();
                        }
                    } else {
                        /*
                         * Test if the byte is an invalid char, marking a string
                         * end. If it is a zero, test 2 positions before or
                         * ahead for a valid char, meaning it marks the
                         * transition between ISO-8859-1 and UTF16 sequences.
                         */
                        if (c != 0 || (inPos >= 3 && isChar(input[inPos - 3])) ||
                                (inPos + 1 < inSize && isChar(input[inPos + 1]))) {

                            if (tmpPos - outPos >= minSize) {
                                // Long enough: terminate with LF and commit it.
                                output[tmpPos++] = 0x0A;
                                outPos = tmpPos;

                                if (tmpPos == BUF_SIZE) {
                                    flushBuffer();
                                }
                            } else {
                                // Too short: discard the pending run.
                                tmpPos = outPos;
                            }

                        }
                    }
                }
            }
        } while (i != -1 && !Thread.currentThread().isInterrupted());

        // Commit a trailing run that was not closed by a terminator byte.
        if (tmpPos - outPos >= minSize) {
            output[tmpPos++] = 0x0A;
            outPos = tmpPos;
        }
        xhtml.characters(new String(output, 0, outPos, "windows-1252"));

        xhtml.endDocument();

    }