in hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java [153:271]
/**
 * Moves the cursor to the beginning of the next record and resets {@code fieldCount} to 0.
 *
 * <p>The method is a state machine driven by {@code state}:
 * <ul>
 * <li>{@code INIT}: performs the first {@code readMore()}; reports a record iff data was read.</li>
 * <li>{@code IN_RECORD}: scans forward from {@code start} for the next line terminator that is not
 * inside a quoted section, using only a rough quote/escape/delimiter check. On finding one it sets
 * {@code start} just past it and switches to {@code EOR} (for {@code '\n'}) or {@code CR}
 * (for {@code '\r'}); the outer loop then handles that state.</li>
 * <li>{@code CR}: inspects the character after a {@code '\r'}. An unquoted {@code '\n'} is consumed
 * and control falls through to {@code EOR}; otherwise the record starts at {@code start}.</li>
 * <li>{@code EOR}: counts the finished line, refills the buffer if it is exhausted and reports
 * whether any characters remain.</li>
 * <li>{@code EOF} / {@code FAILED}: always return {@code false}.</li>
 * </ul>
 *
 * @return {@code true} if the cursor is positioned on a record, {@code false} if no record is available
 * @throws IOException declared by this method; the only call made here that can raise it is
 *         {@code readMore()}
 */
public boolean nextRecord() throws IOException {
    fieldCount = 0;
    while (true) {
        switch (state) {
            case INIT:
                boolean eof = !readMore();
                if (eof) {
                    state = State.EOF;
                    return false;
                } else {
                    state = State.IN_RECORD;
                    return true;
                }
            case IN_RECORD:
                int p = start;
                char lastChar = '\0';
                while (true) {
                    if (p >= end) {
                        // Remember where the record started so that we can tell how far
                        // readMore() moved the buffer contents.
                        int s = start;
                        eof = !readMore();
                        if (eof) {
                            state = State.EOF;
                            // report a record only if unconsumed characters remain
                            return start < end;
                        }
                        // Every position that indexes into the buffer has to be rebased by the same
                        // amount. lastEscapePosition is compared against (p - 1) below, so it must be
                        // rebased too; otherwise an escape character that was the last character
                        // before the refill is not matched with the quote that follows it.
                        int shift = s - start;
                        p -= shift;
                        lastQuotePosition -= shift;
                        lastEscapedQuotePosition -= shift;
                        lastEscapePosition -= shift;
                        lastDelimiterPosition -= shift;
                    }
                    char ch = buffer[p];
                    // We perform rough format correctness (delimiter, quote) check here
                    // to set the starting position of a record.
                    // In the field level, more checking will be conducted.
                    if (ch == escape) {
                        // this may or may not be an escape. the next character must be a quote for it to be.
                        lastEscapePosition = p;
                    }
                    if (ch == quote) {
                        // NOTE(review): when quote == escape, lastEscapePosition was set to p just above,
                        // so this expression is always false in that configuration and the
                        // "two quotes in a row" branch below never fires. Confirm whether that is intended.
                        boolean couldBeEscapedQuote =
                                lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1;
                        if (quote == escape) {
                            startedQuote = true;
                            // check two quotes in a row that aren't at the start of a field if quote is escape, e.g. ""
                            if (couldBeEscapedQuote && start != p - 1) {
                                lastEscapedQuotePosition = p;
                            }
                        } else {
                            if (couldBeEscapedQuote) {
                                lastEscapedQuotePosition = p;
                            }
                        }
                        lastQuotePosition = p;
                    } else if (ch == fieldDelimiter) {
                        // A delimiter directly after a non-escaped quote closes the quoted section.
                        if (startedQuote && lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1) {
                            startedQuote = false;
                            lastDelimiterPosition = p;
                        }
                    } else if (ch == '\n' && !startedQuote) {
                        // unquoted line feed: the record ends here
                        start = p + 1;
                        state = State.EOR;
                        lastDelimiterPosition = p;
                        break;
                    } else if (ch == '\r' && !startedQuote) {
                        // unquoted carriage return: the CR state decides whether a '\n' follows
                        start = p + 1;
                        state = State.CR;
                        lastDelimiterPosition = p;
                        break;
                    }
                    // count lines inside quotes ("\r\n" is counted once, on the '\r')
                    if (ch == '\r' || (ch == '\n' && lastChar != '\r')) {
                        lineCount++;
                    }
                    lastChar = ch;
                    ++p;
                }
                // leave the switch; the outer loop re-dispatches on the new state (EOR or CR)
                break;
            case CR:
                if (start >= end) {
                    eof = !readMore();
                    if (eof) {
                        state = State.EOF;
                        return false;
                    }
                }
                char ch = buffer[start];
                // if the next char "ch" is not \n, then count the \r
                if (ch != '\n') {
                    lineCount++;
                }
                if (ch == '\n' && !startedQuote) {
                    ++start;
                    state = State.EOR;
                } else {
                    state = State.IN_RECORD;
                    return true;
                }
                // intentional fall through: "\r\n" has been consumed, finish it as an end of record
            case EOR:
                lineCount++;
                if (start >= end) {
                    eof = !readMore();
                    if (eof) {
                        state = State.EOF;
                        return false;
                    }
                }
                state = State.IN_RECORD;
                lastDelimiterPosition = start;
                return start < end;
            case EOF:
                return false;
            case FAILED:
                return false;
        }
    }
}