public boolean nextRecord()

in hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java [153:271]


    /**
     * Advances the cursor to the beginning of the next record.
     *
     * <p>Resets {@code fieldCount} to zero and then runs the cursor state machine:
     * <ul>
     *   <li>{@code INIT}: fills the buffer; returns {@code false} on end of input, otherwise switches to
     *       {@code IN_RECORD} and returns {@code true}.</li>
     *   <li>{@code IN_RECORD}: scans forward from {@code start} for a {@code '\n'} or {@code '\r'} that is not
     *       inside a quoted section, moves {@code start} just past it and continues in {@code EOR} or
     *       {@code CR}. If the input ends first, returns whether unread characters remain.</li>
     *   <li>{@code CR}: looks at the character following a {@code '\r'}; a {@code '\n'} is consumed as part of
     *       the same line break and processing continues in {@code EOR}; otherwise a new record starts
     *       right here.</li>
     *   <li>{@code EOR}: counts the line, refills the buffer if it is exhausted and reports whether another
     *       record is available.</li>
     *   <li>{@code EOF} / {@code FAILED}: always {@code false}.</li>
     * </ul>
     *
     * @return {@code true} if the cursor is positioned on a record, {@code false} otherwise
     * @throws IOException declared by this method; the only call visible here that could raise it is
     *         {@code readMore()}
     */
    public boolean nextRecord() throws IOException {
        fieldCount = 0;
        while (true) {
            switch (state) {
                case INIT:
                    boolean eof = !readMore();
                    if (eof) {
                        state = State.EOF;
                        return false;
                    } else {
                        state = State.IN_RECORD;
                        return true;
                    }

                case IN_RECORD:
                    int p = start;
                    char lastChar = '\0';
                    while (true) {
                        if (p >= end) {
                            int startBeforeRead = start;
                            eof = !readMore();
                            if (eof) {
                                state = State.EOF;
                                // whatever is left in the buffer forms the final record
                                return start < end;
                            }
                            // readMore() may have changed 'start'; every remembered buffer position
                            // has to be rebased by the same amount to stay comparable with 'p'.
                            int shift = startBeforeRead - start;
                            p -= shift;
                            lastQuotePosition -= shift;
                            lastEscapedQuotePosition -= shift;
                            lastDelimiterPosition -= shift;
                            // previously left un-rebased, which broke the "escape directly before quote"
                            // check below whenever the two characters straddled a buffer refill
                            lastEscapePosition -= shift;
                        }
                        char ch = buffer[p];
                        // We perform rough format correctness (delimiter, quote) check here
                        // to set the starting position of a record.
                        // In the field level, more checking will be conducted.
                        if (ch == escape) {
                            // this may or may not be an escape. the next character must be a quote for it to be.
                            lastEscapePosition = p;
                        }
                        if (ch == quote) {
                            // NOTE(review): when quote == escape, lastEscapePosition was just set to p above,
                            // so this evaluates to false for the current character - confirm this is intended.
                            boolean couldBeEscapedQuote =
                                    lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1;
                            if (quote == escape) {
                                startedQuote = true;
                                // check two quotes in a row that aren't at the start of a field if quote is escape, e.g. ""
                                if (couldBeEscapedQuote && start != p - 1) {
                                    lastEscapedQuotePosition = p;
                                }
                            } else {
                                if (couldBeEscapedQuote) {
                                    lastEscapedQuotePosition = p;
                                }
                            }
                            lastQuotePosition = p;
                        } else if (ch == fieldDelimiter) {
                            // a delimiter right after a non-escaped quote closes the quoted section
                            if (startedQuote && lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1) {
                                startedQuote = false;
                                lastDelimiterPosition = p;
                            }
                        } else if (ch == '\n' && !startedQuote) {
                            // unquoted line feed: the record ends here
                            start = p + 1;
                            state = State.EOR;
                            lastDelimiterPosition = p;
                            break;
                        } else if (ch == '\r' && !startedQuote) {
                            // unquoted carriage return: the CR state decides whether a '\n' follows
                            start = p + 1;
                            state = State.CR;
                            lastDelimiterPosition = p;
                            break;
                        }
                        // count lines inside quotes
                        if (ch == '\r' || (ch == '\n' && lastChar != '\r')) {
                            lineCount++;
                        }
                        lastChar = ch;
                        ++p;
                    }
                    break;

                case CR:
                    if (start >= end) {
                        eof = !readMore();
                        if (eof) {
                            state = State.EOF;
                            return false;
                        }
                    }
                    char ch = buffer[start];
                    // if the next char "ch" is not \n, then count the \r
                    if (ch != '\n') {
                        lineCount++;
                    }
                    if (ch == '\n' && !startedQuote) {
                        // "\r\n" is a single line break: consume the '\n' as well
                        ++start;
                        state = State.EOR;
                    } else {
                        state = State.IN_RECORD;
                        return true;
                    }
                    // fall through: the state is now EOR, handle it right away

                case EOR:
                    lineCount++;
                    if (start >= end) {
                        eof = !readMore();
                        if (eof) {
                            state = State.EOF;
                            return false;
                        }
                    }
                    state = State.IN_RECORD;
                    lastDelimiterPosition = start;
                    return start < end;

                case EOF:
                    return false;
                case FAILED:
                    return false;
            }
        }
    }