in hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java [273:427]
/**
 * Scans the next field of the current record and records its boundaries in {@code fStart}
 * (inclusive) and {@code fEnd} (exclusive) as indexes into {@code buffer}.
 *
 * <p>A field ends at a field delimiter, at {@code '\n'} / {@code '\r'}, or at end of input. A field
 * may be enclosed in quotes, in which case the quote must be the very first character of the field
 * and the closing quote must immediately precede the terminator; the enclosing quotes are excluded
 * from {@code [fStart, fEnd)}. Delimiters and line breaks inside a quoted field are treated as data
 * (line breaks there increment {@code lineCount}). Quotes preceded by the escape character are
 * counted as escaped quotes and flagged through {@code containsEscapedQuotes}.
 *
 * @return {@link Result#END} when the cursor is not inside a record (states INIT, EOR, EOF, CR);
 *         {@link Result#ERROR} when the cursor is already FAILED or when a quoting error is detected
 *         in this field (the state then becomes FAILED and a warning is issued if
 *         {@code warnings.shouldWarn()}); {@link Result#OK} when a field was delimited.
 * @throws IOException declared for the refill performed by {@code readMore()}
 */
public Result nextField() throws IOException {
    switch (state) {
        case INIT:
        case EOR:
        case EOF:
        case CR:
            return Result.END;
        case FAILED:
            return Result.ERROR;
        case IN_RECORD:
            fieldCount++;
            // reset quote related values
            startedQuote = false;
            containsEscapedQuotes = false;
            lastQuotePosition = -1;
            lastEscapedQuotePosition = -1;
            lastEscapePosition = -1;
            quoteCount = 0;
            escapedQuoteCount = 0;
            char lastChar = '\0';
            int p = start;
            while (true) {
                if (p >= end) {
                    int s = start;
                    boolean eof = !readMore();
                    // 'start' may differ after readMore(); every index tracked into the buffer
                    // must be rebased by the same amount so the "== p - 1" adjacency checks below
                    // keep working across a refill. Unset positions (-1) are left untouched.
                    int shift = s - start;
                    p -= shift;
                    lastQuotePosition -= (lastQuotePosition > -1) ? shift : 0;
                    lastEscapedQuotePosition -= (lastEscapedQuotePosition > -1) ? shift : 0;
                    // The escape position is a buffer index as well (it is assigned from p below);
                    // leaving it stale would break escaped-quote detection when the escape
                    // character and the quote it escapes straddle a refill.
                    lastEscapePosition -= (lastEscapePosition > -1) ? shift : 0;
                    lastDelimiterPosition -= (lastDelimiterPosition > -1) ? shift : 0;
                    if (eof) {
                        state = State.EOF;
                        if (!startedQuote) {
                            fStart = start;
                            fEnd = p;
                        } else {
                            if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
                                    && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
                                // set the position of fStart to +1, fEnd to -1 to remove quote character
                                fStart = start + 1;
                                fEnd = p - 1;
                            } else {
                                // quoted field was never properly closed before the input ended
                                state = State.FAILED;
                                if (warnings.shouldWarn()) {
                                    warn(CLOSING_Q);
                                }
                                return Result.ERROR;
                            }
                        }
                        return Result.OK;
                    }
                }
                char ch = buffer[p];
                if (ch == quote) {
                    // If this is first quote in the field, then it needs to be placed in the beginning.
                    if (!startedQuote) {
                        if (p == start) {
                            startedQuote = true;
                        } else {
                            // In this case, we don't have a quote in the beginning of a field.
                            state = State.FAILED;
                            if (warnings.shouldWarn()) {
                                warn(OPENING_Q);
                            }
                            return Result.ERROR;
                        }
                    }
                    // Check escaped quotes - ESC". A quote is escaped when the character right before
                    // it was the escape character and that character was not itself an escaped quote.
                    // When the escape character is the quote itself, we additionally require that the
                    // previous quote is not the opening quote (lastQuotePosition != start) to avoid a
                    // false positive where there is no value in a field, since it looks like an
                    // escaped quote. However, it's not an escaped quote.
                    // (e.g. if field2 has no value:
                    //       field1,"",field3 ... )
                    boolean couldBeEscaped = lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1;
                    boolean isEscapedQuote =
                            quote == escape ? couldBeEscaped && lastQuotePosition != start : couldBeEscaped;
                    if (isEscapedQuote) {
                        containsEscapedQuotes = true;
                        escapedQuoteCount++;
                        lastEscapedQuotePosition = p;
                    }
                    lastQuotePosition = p;
                    quoteCount++;
                } else if (ch == fieldDelimiter) {
                    // If there was no quote in the field,
                    // then we assume that the field contains a valid string.
                    if (!startedQuote) {
                        fStart = start;
                        fEnd = p;
                        start = p + 1;
                        lastDelimiterPosition = p;
                        return Result.OK;
                    }
                    if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
                            && lastQuotePosition != start) {
                        // There is a quote right before the delimiter (e.g. ",) and it is not an escaped quote,
                        // then the field contains a valid string.
                        // We set the position of fStart to +1, fEnd to -1 to remove quote character
                        fStart = start + 1;
                        fEnd = p - 1;
                        start = p + 1;
                        lastDelimiterPosition = p;
                        startedQuote = false;
                        return Result.OK;
                    } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastEscapedQuotePosition
                            && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
                        // There is a quote before the delimiter, however it is not directly placed before the delimiter.
                        // In this case, we report an error.
                        // quoteCount == escapedQuoteCount * k + 2 (k = 2 when escape == quote, else 1):
                        // only true when we have exactly two quotes besides the escaped ones.
                        state = State.FAILED;
                        if (warnings.shouldWarn()) {
                            warn(DELIMITER_AFTER_Q);
                        }
                        return Result.ERROR;
                    }
                    // If the control flow reaches here: we have a delimiter in this field and
                    // there should be a quote in the beginning and the end of
                    // this field. So, just continue reading next character
                } else if (ch == '\n' || ch == '\r') {
                    if (!startedQuote) {
                        fStart = start;
                        fEnd = p;
                        start = p + 1;
                        state = ch == '\n' ? State.EOR : State.CR;
                        lastDelimiterPosition = p;
                        return Result.OK;
                    } else if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
                            && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
                        // set the position of fStart to +1, fEnd to -1 to remove quote character
                        fStart = start + 1;
                        fEnd = p - 1;
                        lastDelimiterPosition = p;
                        start = p + 1;
                        state = ch == '\n' ? State.EOR : State.CR;
                        startedQuote = false;
                        return Result.OK;
                    }
                    // otherwise the line break is inside a still-open quoted field: treat it as data
                }
                if (ch == escape) {
                    //RFC4180 defines the escape character for quotes as quotes. however CSV is not a well-defined
                    //format, and so frequently nonstandard escaping such as C-style \ escaping is used.
                    //Therefore, we need to track potential escapes separately to support these cases.
                    lastEscapePosition = p;
                }
                // count lines inside quotes ("\r\n" is counted once, on the '\r')
                if (ch == '\r' || (ch == '\n' && lastChar != '\r')) {
                    lineCount++;
                }
                lastChar = ch;
                ++p;
            }
    }
    // Only reachable if 'state' holds a value not handled by the switch above.
    throw new IllegalStateException("Unexpected parser state: " + state);
}