in hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java [93:187]
/**
 * Advances the cursor to the next record.
 *
 * <p>Each call increments {@code recordCount}, resets {@code fieldCount} to zero and then
 * runs a small state machine driven by {@code state}:
 * <ul>
 *   <li>{@code INIT} - performs the first {@code readMore()} and reports whether any input
 *       was obtained.</li>
 *   <li>{@code IN_RECORD} - scans forward from {@code start} until an unquoted line
 *       terminator is found, tracking quotes and delimiters only roughly.</li>
 *   <li>{@code CR} - a {@code '\r'} was just consumed; swallows an immediately following
 *       {@code '\n'} if there is one.</li>
 *   <li>{@code EOR} - the previous record has been terminated; positions on the next
 *       record if more input is available.</li>
 *   <li>{@code EOF} - nothing is left.</li>
 * </ul>
 *
 * @return {@code true} if the cursor is positioned on a record, {@code false} once the
 *         input is exhausted
 * @throws IOException declared by this method; presumably propagated from
 *         {@code readMore()} (its body is not visible here)
 */
public boolean nextRecord() throws IOException {
    recordCount++;
    fieldCount = 0;
    for (;;) {
        switch (state) {
            case INIT: {
                // Very first call: everything depends on whether the initial read yields data.
                if (readMore()) {
                    state = State.IN_RECORD;
                    return true;
                }
                state = State.EOF;
                return false;
            }
            case IN_RECORD: {
                int pos = start;
                while (true) {
                    if (pos >= end) {
                        // Ran off the buffered data. readMore() may move 'start', so every
                        // remembered position is re-based by the amount 'start' moved.
                        final int startBeforeRead = start;
                        if (!readMore()) {
                            state = State.EOF;
                            // A trailing record without a line terminator still counts.
                            return start < end;
                        }
                        final int shift = startBeforeRead - start;
                        pos -= shift;
                        lastQuotePosition -= shift;
                        lastDoubleQuotePosition -= shift;
                        lastDelimiterPosition -= shift;
                    }
                    final char current = buffer[pos];
                    final int previous = pos - 1;
                    // Only a coarse check of quotes and delimiters is done here, just enough
                    // to locate where the next record starts. Stricter validation is left
                    // to field-level parsing.
                    if (current == quote) {
                        startedQuote = true;
                        // Two quotes in a row ("") form an escaped quote.
                        if (lastQuotePosition == previous && start != previous
                                && lastDoubleQuotePosition != previous) {
                            lastDoubleQuotePosition = pos;
                        }
                        lastQuotePosition = pos;
                    } else if (current == fieldDelimiter) {
                        // A delimiter directly after a non-escaped quote closes the quoted field.
                        if (startedQuote && lastQuotePosition == previous
                                && lastDoubleQuotePosition != previous) {
                            startedQuote = false;
                            lastDelimiterPosition = pos;
                        }
                    } else if (!startedQuote && (current == '\n' || current == '\r')) {
                        // Unquoted line terminator: the record ends here. After '\r' the CR
                        // state checks for a following '\n'.
                        start = pos + 1;
                        state = (current == '\n') ? State.EOR : State.CR;
                        lastDelimiterPosition = pos;
                        break;
                    }
                    pos++;
                }
                // Re-dispatch on the state chosen by the scan (EOR or CR).
                break;
            }
            case CR: {
                if (start >= end && !readMore()) {
                    state = State.EOF;
                    return false;
                }
                if (buffer[start] != '\n' || startedQuote) {
                    // A lone '\r' terminated the record; the next one starts right here.
                    state = State.IN_RECORD;
                    return true;
                }
                // "\r\n" pair: consume the '\n' and continue as a regular end of record.
                ++start;
                state = State.EOR;
            }
            // intentional fall-through from CR into EOR
            case EOR: {
                if (start >= end && !readMore()) {
                    state = State.EOF;
                    return false;
                }
                state = State.IN_RECORD;
                lastDelimiterPosition = start;
                return start < end;
            }
            case EOF:
                return false;
        }
    }
}