in src/core/storage/sframe_data/csv_line_tokenizer.cpp [461:638]
bool csv_line_tokenizer::tokenize_line_impl(char* str,
size_t len,
Fn add_token,
Fn2 lookahead,
Fn3 canceltoken) {
ASSERT_MSG(parser, "Uninitialized tokenizer.");
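// This is a small three-state machine (START_FIELD, IN_FIELD,
// IN_QUOTED_FIELD). It walks the line once, accumulating the current field
// in field_buffer through the PUSH_CHAR / BEGIN_FIELD / END_FIELD /
// END_LINE / DELIMITER_TEST helper macros (presumably defined earlier in
// this file) and handing each completed field to the caller via add_token.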
const char* buf = &(str[0]);
const char* bufend = buf + len;
const char* raw_field_begin = buf;
const char* delimiter_begin = delimiter.c_str();
const char* delimiter_end = delimiter_begin + delimiter.length();
bool good = true;
bool keep_parsing = true;
// set to true when we enter START_FIELD because a delimiter was just
// consumed; used at the end of the line to emit a trailing empty field
bool start_field_with_delimiter_encountered = false;
// this is set to true for the character immediately after an escape character
// and false all other times
bool escape_sequence = false;
tokenizer_impl_fail_pos = -1;
tokenizer_state state = tokenizer_state::START_FIELD;
field_buffer_len = 0;
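// fast path: when the delimiter is the newline itself, the entire line is
// one token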
if (delimiter_is_new_line) {
add_token(str, len, str, len);
return true;
}
// the quote character is adaptive; it can be either " or ' as we encounter it
while(keep_parsing && buf != bufend) {
// does the buffer currently point at the (possibly multi-character) delimiter?
bool is_delimiter = DELIMITER_TEST();
// escape_sequence may stay true only for the single character that follows
// the escape character, so remember its current value here; when
// reset_escape_sequence is true, escape_sequence is cleared at the end of
// this loop iteration
bool reset_escape_sequence = escape_sequence;
// skip to the last character of the delimiter
if (is_delimiter) buf += delimiter.length() - 1;
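// consume the current character; for a multi-character delimiter, c becomes
// its last character and buf ends up just past the delimiter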
char c = *buf++;
switch(state) {
case tokenizer_state::START_FIELD:
/* expecting field */
// remember where this field begins in the raw input
raw_field_begin = buf - 1;
if (c == quote_char) {
// start quoted field
start_field_with_delimiter_encountered = false;
if (preserve_quoting == false) {
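// quoting is honored: keep the quote in the field buffer and switch to
// IN_QUOTED_FIELD so that delimiters between the quotes do not end the field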
BEGIN_FIELD();
PUSH_CHAR(c);
state = tokenizer_state::IN_QUOTED_FIELD;
} else {
BEGIN_FIELD();
PUSH_CHAR(c);
state = tokenizer_state::IN_FIELD;
}
} else if (is_space_but_not_tab(c) && skip_initial_space) {
// do nothing
} else if (is_delimiter) {
/* save an empty field; the delimiter itself was already consumed above */
start_field_with_delimiter_encountered = true;
BEGIN_FIELD();
END_FIELD();
// otherwise, if consecutive delimiters are being joined, do nothing
} else if (has_comment_char && c == comment_char) {
// comment line
start_field_with_delimiter_encountered = false;
END_LINE();
} else if (c == '[' || c == '{') {
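// a bracket may start a list or dictionary value; remember the position
// just past it so we can rewind if the caller-supplied lookahead parser
// fails or its result is not followed by a delimiter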
const char* prev = buf;
start_field_with_delimiter_encountered = false;
buf--; // shift back so we are on top of the bracketing character
if (lookahead(&buf, bufend)) {
// ok we have successfully parsed a field.
// drop whitespace
while(buf < bufend && std::isspace(*buf)) ++buf;
if (buf == bufend) {
continue;
} else if (DELIMITER_TEST()) {
start_field_with_delimiter_encountered = true;
// skip past the delimiter
buf += delimiter.length();
continue;
} else if(delimiter_is_space_but_not_tab) {
// the lookahead parser may absorb whitespace
// so if the delimiter is a whitespace, we immediately
// advance to the next field
continue;
} else {
// bad: the lookahead consumed a whole field, but no delimiter follows;
// cancel the lookahead token, rewind, and reparse as regular characters
canceltoken();
buf = prev;
goto REGULAR_CHARACTER;
}
} else {
buf = prev;
// interpret as a regular character
goto REGULAR_CHARACTER;
}
} else {
REGULAR_CHARACTER:
start_field_with_delimiter_encountered = false;
/* begin new unquoted field */
PUSH_CHAR(c);
state = tokenizer_state::IN_FIELD;
}
break;
case tokenizer_state::IN_FIELD:
/* in unquoted field */
if (is_delimiter) {
// End of field. End of line not reached yet
END_FIELD();
// the delimiter itself was already consumed above
start_field_with_delimiter_encountered = true;
state = tokenizer_state::START_FIELD;
} else if (has_comment_char && c == comment_char) {
// terminate this field
END_FIELD();
state = tokenizer_state::START_FIELD;
END_LINE();
} else {
/* normal character - save in field */
PUSH_CHAR(c);
}
break;
case tokenizer_state::IN_QUOTED_FIELD:
/* in quoted field */
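// inside quotes only an unescaped quote character is special: it either
// closes the quoted section or, when double_quote is set, a doubled quote
// stands for a literal quote; delimiters and comment characters are copied
// through verbatim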
if (c == quote_char && !escape_sequence) {
if (double_quote) {
/* doublequote; " represented by "" */
// look ahead one character
// we are committed to preserving the buffer *exactly* here
// so push two quotes
if (buf + 1 < bufend && *buf == quote_char) {
PUSH_CHAR(c);
PUSH_CHAR(c);
++buf;
break;
}
}
/* end of quote part of field */
PUSH_CHAR(c);
state = tokenizer_state::IN_FIELD;
}
else {
/* normal character - save in field */
PUSH_CHAR(c);
}
break;
}
if (reset_escape_sequence) escape_sequence = false;
}
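// good and keep_parsing are presumably cleared by the END_FIELD()/END_LINE()
// macros when add_token rejects a field or the line terminates early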
if (!good) {
tokenizer_impl_fail_pos = (ssize_t)(buf - str);
return false;
}
// end of line: flush any pending final field
if (state != tokenizer_state::START_FIELD) {
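// the line ended while still inside a field: flush the accumulated
// field_buffer contents together with the raw bytes they came from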
if (!add_token(&(field_buffer[0]), field_buffer_len, raw_field_begin, buf - raw_field_begin)) {
tokenizer_impl_fail_pos = (ssize_t)(buf - str);
return false;
}
} else {
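// the line ended immediately after a delimiter, so emit one trailing
// empty field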
if (start_field_with_delimiter_encountered) {
if (!add_token(nullptr, 0, nullptr, 0)) {
tokenizer_impl_fail_pos = (ssize_t)(buf - str);
return false;
}
}
}
return true;
}
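
// Illustrative walk-through (comment only, not part of the build), assuming
// the delimiter is "," and the quote character is '"': for the input line
//     a,"b,c",[1,2],d
// the states visited are roughly
//     START_FIELD -> IN_FIELD                      field  a
//     START_FIELD -> IN_QUOTED_FIELD -> IN_FIELD   field  "b,c"  (quotes kept in field_buffer)
//     START_FIELD -> lookahead consumes [1,2] and the delimiter after it
//     START_FIELD -> IN_FIELD                      field  d
// producing one token per field.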