public Result nextField()

in hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java [273:427]


    /**
     * Advances the cursor to the next field of the current record.
     * <p>
     * On {@link Result#OK} the field occupies {@code buffer[fStart, fEnd)}; for a quoted field the
     * enclosing quote characters are excluded from that range. {@code start} is moved past the
     * delimiter or line terminator that ended the field, and {@code state} is updated when the
     * field was terminated by a line break or by the end of the input.
     *
     * @return {@link Result#END} when the cursor is not inside a record (states {@code INIT},
     *         {@code EOR}, {@code EOF}, {@code CR}); {@link Result#ERROR} when the cursor is already
     *         in the {@code FAILED} state or when the field has malformed quoting (the state then
     *         becomes {@code FAILED}); {@link Result#OK} when a field was delimited successfully
     * @throws IOException declared by this method; presumably propagated from {@code readMore()}
     * @throws IllegalStateException if {@code state} is none of the states handled here
     */
    public Result nextField() throws IOException {
        switch (state) {
            case INIT:
            case EOR:
            case EOF:
            case CR:
                return Result.END;
            case FAILED:
                return Result.ERROR;

            case IN_RECORD:
                fieldCount++;
                // reset quote related values
                startedQuote = false;
                containsEscapedQuotes = false;
                lastQuotePosition = -1;
                lastEscapedQuotePosition = -1;
                lastEscapePosition = -1;
                quoteCount = 0;
                escapedQuoteCount = 0;

                char lastChar = '\0';
                int p = start;
                while (true) {
                    if (p >= end) {
                        int s = start;
                        boolean eof = !readMore();
                        // readMore() may change 'start' (the unread data can be relocated inside the
                        // buffer), so every position remembered for this field has to be shifted by
                        // the same amount. Positions still at -1 mean "not seen" and are left alone.
                        final int shift = s - start;
                        p -= shift;
                        lastQuotePosition -= (lastQuotePosition > -1) ? shift : 0;
                        lastEscapedQuotePosition -= (lastEscapedQuotePosition > -1) ? shift : 0;
                        // The escape position must be rebased as well: it is compared against p - 1
                        // below, and a stale value would make an escaped quote that straddles a
                        // buffer refill go undetected (or match a wrong position by coincidence).
                        lastEscapePosition -= (lastEscapePosition > -1) ? shift : 0;
                        lastDelimiterPosition -= (lastDelimiterPosition > -1) ? shift : 0;
                        if (eof) {
                            state = State.EOF;
                            if (!startedQuote) {
                                fStart = start;
                                fEnd = p;
                            } else {
                                // A quoted field may end at EOF only if the last character is a
                                // non-escaped quote and the quotes are balanced (opening + closing
                                // quote plus the quotes consumed by escaped quotes).
                                if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
                                        && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
                                    // set the position of fStart to +1, fEnd to -1 to remove quote character
                                    fStart = start + 1;
                                    fEnd = p - 1;
                                } else {
                                    state = State.FAILED;
                                    if (warnings.shouldWarn()) {
                                        warn(CLOSING_Q);
                                    }
                                    return Result.ERROR;
                                }
                            }
                            return Result.OK;
                        }
                    }
                    char ch = buffer[p];
                    if (ch == quote) {
                        // If this is first quote in the field, then it needs to be placed in the beginning.
                        if (!startedQuote) {
                            if (p == start) {
                                startedQuote = true;
                            } else {
                                // In this case, we don't have a quote in the beginning of a field.
                                state = State.FAILED;
                                if (warnings.shouldWarn()) {
                                    warn(OPENING_Q);
                                }
                                return Result.ERROR;
                            }
                        }
                        // Check escaped quotes - ESC". A quote is escaped when the character right
                        // before it is the escape character and that character is not itself an
                        // already-escaped quote. When the escape character is the quote itself, we
                        // additionally require that the previous quote is not the opening quote
                        // (lastQuotePosition != start) to avoid a false positive where there is no
                        // value in a field, since it looks like an escaped quote.
                        // (e.g. if field2 has no value:
                        //       field1,"",field3 ... )
                        boolean couldBeEscaped = lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1;
                        boolean isEscapedQuote =
                                quote == escape ? couldBeEscaped && lastQuotePosition != start : couldBeEscaped;
                        if (isEscapedQuote) {
                            containsEscapedQuotes = true;
                            escapedQuoteCount++;
                            lastEscapedQuotePosition = p;
                        }
                        lastQuotePosition = p;
                        quoteCount++;
                    } else if (ch == fieldDelimiter) {
                        // If there was no quote in the field,
                        // then we assume that the field contains a valid string.
                        if (!startedQuote) {
                            fStart = start;
                            fEnd = p;
                            start = p + 1;
                            lastDelimiterPosition = p;
                            return Result.OK;
                        }

                        if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
                                && lastQuotePosition != start) {
                            // There is a quote right before the delimiter (e.g. ",)  and it is not an escaped quote,
                            // then the field contains a valid string.
                            // We set the position of fStart to +1, fEnd to -1 to remove quote character
                            fStart = start + 1;
                            fEnd = p - 1;
                            start = p + 1;
                            lastDelimiterPosition = p;
                            startedQuote = false;
                            return Result.OK;
                        } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastEscapedQuotePosition
                                && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
                            // There is a closing quote before the delimiter, however it is not directly placed
                            // before the delimiter. In this case, we report an error.
                            // quoteCount == escapedQuoteCount * (1 or 2) + 2 : only true when, apart from the
                            // quotes that belong to escaped quotes, exactly two quotes (opening and closing)
                            // have been seen.
                            state = State.FAILED;
                            if (warnings.shouldWarn()) {
                                warn(DELIMITER_AFTER_Q);
                            }
                            return Result.ERROR;
                        }
                        // If the control flow reaches here: we have a delimiter in this field and
                        // there should be a quote in the beginning and the end of
                        // this field. So, just continue reading next character
                    } else if (ch == '\n' || ch == '\r') {
                        if (!startedQuote) {
                            fStart = start;
                            fEnd = p;
                            start = p + 1;
                            // '\r' moves to the CR state rather than EOR
                            state = ch == '\n' ? State.EOR : State.CR;
                            lastDelimiterPosition = p;
                            return Result.OK;
                        } else if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
                                && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
                            // set the position of fStart to +1, fEnd to -1 to remove quote character
                            fStart = start + 1;
                            fEnd = p - 1;
                            lastDelimiterPosition = p;
                            start = p + 1;
                            state = ch == '\n' ? State.EOR : State.CR;
                            startedQuote = false;
                            return Result.OK;
                        }
                        // Otherwise the line break is inside an open quoted field: keep scanning.
                    }
                    if (ch == escape) {
                        //RFC4180 defines the escape character for quotes as quotes. however CSV is not a well-defined
                        //format, and so frequently nonstandard escaping such as C-style \ escaping is used.
                        //Therefore, we need to track potential escapes separately to support these cases.
                        lastEscapePosition = p;
                    }
                    // count lines inside quotes; a "\r\n" pair is counted once (on the '\r')
                    if (ch == '\r' || (ch == '\n' && lastChar != '\r')) {
                        lineCount++;
                    }
                    lastChar = ch;
                    ++p;
                }
        }
        throw new IllegalStateException();
    }