in hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java [189:331]
/**
 * Scans the buffer for the next field of the current record.
 *
 * <p>On success {@code fStart} is the index of the first character of the field and {@code fEnd}
 * is the index just past its last character. For a field enclosed in quotes the enclosing quotes
 * are excluded from that range. Escaped quotes (two consecutive quote characters inside a quoted
 * field) are left in place; they are only counted in {@code doubleQuoteCount} and flagged through
 * {@code isDoubleQuoteIncludedInThisField}.
 *
 * <p>The scan stops at the field delimiter (state unchanged), at {@code '\n'} (state becomes
 * {@code EOR}), at {@code '\r'} (state becomes {@code CR}) or when no more input can be read
 * (state becomes {@code EOF}). Delimiters and line terminators inside a quoted field do not end
 * the field.
 *
 * @return {@code true} if a field was found; {@code false} if the cursor is not inside a record
 *         (state is {@code INIT}, {@code EOR}, {@code EOF} or {@code CR})
 * @throws IOException if a quote opens somewhere other than the beginning of the field, or if the
 *         closing quote of a quoted field is not directly followed by the delimiter
 */
public boolean nextField() throws IOException {
    fieldCount++;
    switch (state) {
        case INIT:
        case EOR:
        case EOF:
        case CR:
            // No record is currently open, so there is no field to return.
            return false;
        case IN_RECORD:
            // reset quote related values
            startedQuote = false;
            isDoubleQuoteIncludedInThisField = false;
            lastQuotePosition = -99;
            lastDoubleQuotePosition = -99;
            quoteCount = 0;
            doubleQuoteCount = 0;
            int p = start;
            while (true) {
                if (p >= end) {
                    int s = start;
                    boolean eof = !readMore();
                    // 'start' may have changed during readMore(); move every remembered
                    // position by the same amount so they stay relative to 'start'.
                    int shift = s - start;
                    p -= shift;
                    lastQuotePosition -= shift;
                    lastDoubleQuotePosition -= shift;
                    lastDelimiterPosition -= shift;
                    if (eof) {
                        // End of input terminates the last field.
                        state = State.EOF;
                        if (isEnclosingQuoteClosedBefore(p)) {
                            // set the position of fStart to +1, fEnd to -1 to remove quote character
                            fStart = start + 1;
                            fEnd = p - 1;
                        } else {
                            fStart = start;
                            fEnd = p;
                        }
                        return true;
                    }
                }
                char ch = buffer[p];
                if (ch == quote) {
                    // If this is first quote in the field, then it needs to be placed in the beginning.
                    if (!startedQuote) {
                        if (lastDelimiterPosition == p - 1 || lastDelimiterPosition == -99) {
                            startedQuote = true;
                        } else {
                            // In this case, we don't have a quote in the beginning of a field.
                            throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
                                    + " - a quote enclosing a field needs to be placed in the beginning of that field.");
                        }
                    }
                    // Check double quotes - "". The previous character must be a quote that is
                    // neither the opening quote of the field nor the second half of an already
                    // counted double quote. Requiring quoteCount > 1 excludes the opening quote,
                    // so an empty quoted field (e.g. field1,"",field3) is not mistaken for a
                    // double quote. Unlike a comparison against lastDelimiterPosition, this also
                    // holds while lastDelimiterPosition is still the -99 sentinel.
                    if (lastQuotePosition == p - 1 && quoteCount > 1 && lastDoubleQuotePosition != p - 1) {
                        isDoubleQuoteIncludedInThisField = true;
                        doubleQuoteCount++;
                        lastDoubleQuotePosition = p;
                    }
                    lastQuotePosition = p;
                    quoteCount++;
                } else if (ch == fieldDelimiter) {
                    if (!startedQuote) {
                        // If there was no quote in the field,
                        // then we assume that the field contains a valid string.
                        fStart = start;
                        fEnd = p;
                        start = p + 1;
                        lastDelimiterPosition = p;
                        return true;
                    }
                    if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1) {
                        // There is a quote right before the delimiter (e.g. ",) and it is not two quote,
                        // then the field contains a valid string.
                        // We set the position of fStart to +1, fEnd to -1 to remove quote character
                        fStart = start + 1;
                        fEnd = p - 1;
                        start = p + 1;
                        lastDelimiterPosition = p;
                        startedQuote = false;
                        return true;
                    }
                    if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition
                            && quoteCount == doubleQuoteCount * 2 + 2) {
                        // There is a quote before the delimiter, however it is not directly placed before the delimiter.
                        // In this case, we throw an exception.
                        // quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes.
                        throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
                                + " - A quote enclosing a field needs to be followed by the delimiter.");
                    }
                    // If the control flow reaches here: we have a delimiter in this field and
                    // there should be a quote in the beginning and the end of
                    // this field. So, just continue reading next character
                } else if (ch == '\n' || ch == '\r') {
                    boolean closedQuote = isEnclosingQuoteClosedBefore(p);
                    if (!startedQuote || closedQuote) {
                        // For a quoted field, move fStart by +1 and fEnd by -1 to remove quote character
                        fStart = closedQuote ? start + 1 : start;
                        fEnd = closedQuote ? p - 1 : p;
                        start = p + 1;
                        lastDelimiterPosition = p;
                        state = (ch == '\n') ? State.EOR : State.CR;
                        startedQuote = false;
                        return true;
                    }
                    // Otherwise the line terminator is inside a quoted field: keep scanning.
                }
                ++p;
            }
    }
    throw new IllegalStateException();
}

/**
 * Tells whether the field being scanned is enclosed in quotes and its closing quote is the
 * character directly before {@code p}.
 *
 * <p>{@code quoteCount == doubleQuoteCount * 2 + 2} holds only when, apart from double quotes,
 * exactly two quotes (the opening and the closing one) have been seen.
 */
private boolean isEnclosingQuoteClosedBefore(int p) {
    return startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
            && quoteCount == doubleQuoteCount * 2 + 2;
}