public boolean nextField()

in hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java [189:331]


    /**
     * Scans the buffer for the next field of the current record and publishes its bounds
     * in {@code fStart} (inclusive) and {@code fEnd} (exclusive).
     *
     * <p>A field ends at the field delimiter, at {@code '\n'} (state becomes {@code EOR}),
     * at {@code '\r'} (state becomes {@code CR}) or when {@code readMore()} reports the end
     * of the input (state becomes {@code EOF}). When the field is enclosed in quotes, the
     * enclosing quote characters are excluded from the published bounds; delimiters and
     * line breaks inside the quotes do not terminate the field. Two consecutive quotes
     * inside a quoted field are counted as one double quote and recorded in
     * {@code isDoubleQuoteIncludedInThisField}; they are not removed here.
     *
     * <p>{@code fieldCount} is incremented on every call, including calls that return
     * {@code false}.
     *
     * @return {@code true} if a field was found; {@code false} if the cursor is not inside
     *         a record (state {@code INIT}, {@code EOR}, {@code EOF} or {@code CR})
     * @throws IOException if reading more input fails, if the first quote of a field is not
     *         placed at the beginning of that field, or if the closing quote of a field is
     *         not directly followed by the delimiter
     * @throws IllegalStateException if the current state is not handled by this method
     */
    public boolean nextField() throws IOException {
        fieldCount++;
        switch (state) {
            case INIT:
            case EOR:
            case EOF:
            case CR:
                return false;

            case IN_RECORD:
                // reset quote related values
                startedQuote = false;
                isDoubleQuoteIncludedInThisField = false;
                lastQuotePosition = -99;
                lastDoubleQuotePosition = -99;
                quoteCount = 0;
                doubleQuoteCount = 0;

                int p = start;
                while (true) {
                    if (p >= end) {
                        // readMore() may change 'start'; every position that was remembered
                        // relative to the old 'start' has to be moved by the same amount.
                        int startBeforeRead = start;
                        boolean eof = !readMore();
                        int shift = startBeforeRead - start;
                        p -= shift;
                        lastQuotePosition -= shift;
                        lastDoubleQuotePosition -= shift;
                        lastDelimiterPosition -= shift;
                        if (eof) {
                            state = State.EOF;
                            if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
                                    && quoteCount == doubleQuoteCount * 2 + 2) {
                                // set the position of fStart to +1, fEnd to -1 to remove quote character
                                fStart = start + 1;
                                fEnd = p - 1;
                            } else {
                                fStart = start;
                                fEnd = p;
                            }
                            return true;
                        }
                    }
                    char ch = buffer[p];
                    if (ch == quote) {
                        // If this is first quote in the field, then it needs to be placed in the beginning.
                        if (!startedQuote) {
                            if (lastDelimiterPosition == p - 1 || lastDelimiterPosition == -99) {
                                startedQuote = true;
                            } else {
                                // In this case, we don't have a quote in the beginning of a field.
                                throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
                                        + " - a quote enclosing a field needs to be placed in the beginning of that field.");
                            }
                        }
                        // Check double quotes - "". The previous quote must not be the opening
                        // quote at 'start': otherwise this is a field without a value
                        // (e.g. field2 in: field1,"",field3 ...), which only looks like a
                        // double quote. Comparing against 'start' instead of
                        // lastDelimiterPosition keeps this check working while
                        // lastDelimiterPosition still holds the -99 sentinel accepted above.
                        if (lastQuotePosition == p - 1 && lastQuotePosition != start
                                && lastDoubleQuotePosition != p - 1) {
                            isDoubleQuoteIncludedInThisField = true;
                            doubleQuoteCount++;
                            lastDoubleQuotePosition = p;
                        }
                        lastQuotePosition = p;
                        quoteCount++;
                    } else if (ch == fieldDelimiter) {
                        if (!startedQuote) {
                            // If there was no quote in the field,
                            // then we assume that the field contains a valid string.
                            fStart = start;
                            fEnd = p;
                            start = p + 1;
                            lastDelimiterPosition = p;
                            return true;
                        } else if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1) {
                            // There is a quote right before the delimiter (e.g. ",)  and it is not two quote,
                            // then the field contains a valid string.
                            // We set the position of fStart to +1, fEnd to -1 to remove quote character
                            fStart = start + 1;
                            fEnd = p - 1;
                            start = p + 1;
                            lastDelimiterPosition = p;
                            startedQuote = false;
                            return true;
                        } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition
                                && quoteCount == doubleQuoteCount * 2 + 2) {
                            // There is a quote before the delimiter, however it is not directly placed before the delimiter.
                            // In this case, we throw an exception.
                            // quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes.
                            throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
                                    + " -  A quote enclosing a field needs to be followed by the delimiter.");
                        }
                        // If the control flow reaches here: we have a delimiter in this field and
                        // there should be a quote in the beginning and the end of
                        // this field. So, just continue reading next character
                    } else if (ch == '\n' || ch == '\r') {
                        // A line break ends the field unless it sits inside an open quote.
                        boolean closedQuote = startedQuote && lastQuotePosition == p - 1
                                && lastDoubleQuotePosition != p - 1 && quoteCount == doubleQuoteCount * 2 + 2;
                        if (!startedQuote || closedQuote) {
                            // For a quoted field, move fStart by +1 and fEnd by -1 to remove the quote characters.
                            fStart = closedQuote ? start + 1 : start;
                            fEnd = closedQuote ? p - 1 : p;
                            start = p + 1;
                            lastDelimiterPosition = p;
                            state = (ch == '\n') ? State.EOR : State.CR;
                            startedQuote = false;
                            return true;
                        }
                    }
                    ++p;
                }
        }
        throw new IllegalStateException();
    }