in cpp/src/arrow/csv/parser.cc [276:465]
Status ParseLine(ValueDescWriter* values_writer, DataWriter* parsed_writer,
const char* data, const char* data_end, bool is_final,
const char** out_data, const BulkFilter& bulk_filter) {
int32_t num_cols = 0;
char c;
const auto start = data;
DCHECK_GT(data_end, data);
auto FinishField = [&]() { values_writer->FinishField(parsed_writer); };
values_writer->BeginLine();
parsed_writer->BeginLine();
// The parsing state machine
// Special case empty lines: do we start with a newline separator?
c = *data;
if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
if (c == '\r') {
data++;
if (data < data_end && *data == '\n') {
data++;
}
goto EmptyLine;
}
if (c == '\n') {
data++;
goto EmptyLine;
}
}
FieldStart:
// At the start of a field
if (*data == options_.delimiter) {
// Empty cells are very common in some files, shortcut them
values_writer->StartField(false /* quoted */);
FinishField();
++data;
++num_cols;
if (ARROW_PREDICT_FALSE(data == data_end)) {
goto AbortLine;
}
goto FieldStart;
}
// Quoting is only recognized at start of field
if (SpecializedOptions::quoting &&
ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
++data;
values_writer->StartField(true /* quoted */);
goto InQuotedField;
} else {
values_writer->StartField(false /* quoted */);
goto InField;
}
InField:
// Inside a non-quoted part of a field
if (UseBulkFilter) {
const char* bulk_end = RunBulkFilter(parsed_writer, data, data_end, bulk_filter);
if (ARROW_PREDICT_FALSE(bulk_end == nullptr)) {
if (is_final) {
data = data_end;
}
goto AbortLine;
}
data = bulk_end;
} else {
if (ARROW_PREDICT_FALSE(data == data_end)) {
goto AbortLine;
}
}
c = *data++;
if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
if (ARROW_PREDICT_FALSE(data == data_end)) {
goto AbortLine;
}
c = *data++;
parsed_writer->PushFieldChar(c);
goto InField;
}
if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
goto FieldEnd;
}
if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
if (c == '\r') {
// In the middle of a newline separator?
if (ARROW_PREDICT_TRUE(data < data_end) && *data == '\n') {
data++;
}
goto LineEnd;
}
if (c == '\n') {
goto LineEnd;
}
}
parsed_writer->PushFieldChar(c);
goto InField;
InQuotedField:
// Inside a quoted part of a field
if (UseBulkFilter) {
const char* bulk_end = RunBulkFilter(parsed_writer, data, data_end, bulk_filter);
if (ARROW_PREDICT_FALSE(bulk_end == nullptr)) {
if (is_final) {
data = data_end;
}
goto AbortLine;
}
data = bulk_end;
} else {
if (ARROW_PREDICT_FALSE(data == data_end)) {
goto AbortLine;
}
}
c = *data++;
if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
if (ARROW_PREDICT_FALSE(data == data_end)) {
goto AbortLine;
}
c = *data++;
parsed_writer->PushFieldChar(c);
goto InQuotedField;
}
if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
if (options_.double_quote && ARROW_PREDICT_TRUE(data < data_end) &&
ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
// Double-quoting
++data;
} else {
// End of single-quoting
goto InField;
}
}
parsed_writer->PushFieldChar(c);
goto InQuotedField;
FieldEnd:
// At the end of a field
FinishField();
++num_cols;
if (ARROW_PREDICT_FALSE(data == data_end)) {
goto AbortLine;
}
goto FieldStart;
LineEnd:
// At the end of line
FinishField();
++num_cols;
if (ARROW_PREDICT_FALSE(num_cols != batch_.num_cols_)) {
if (batch_.num_cols_ == -1) {
batch_.num_cols_ = num_cols;
} else {
return HandleInvalidRow(values_writer, parsed_writer, start, data, num_cols,
out_data);
}
}
++batch_.num_rows_;
*out_data = data;
return Status::OK();
AbortLine:
// Not a full line except perhaps if in final block
if (is_final) {
goto LineEnd;
}
// Truncated line at end of block, rewind parsed state
values_writer->RollbackLine();
parsed_writer->RollbackLine();
return Status::OK();
EmptyLine:
if (!options_.ignore_empty_lines) {
if (batch_.num_cols_ == -1) {
// Consider as single value
batch_.num_cols_ = 1;
}
// Record as row of empty (null?) values
while (num_cols++ < batch_.num_cols_) {
values_writer->StartField(false /* quoted */);
FinishField();
}
++batch_.num_rows_;
}
*out_data = data;
return Status::OK();
}