bool csv_line_tokenizer::tokenize_line_impl()

in src/core/storage/sframe_data/csv_line_tokenizer.cpp [461:638]


/**
 * Core CSV line tokenizer state machine.
 *
 * Scans the `len` bytes at `str` and splits them into fields, invoking the
 * supplied callbacks for each field found.  The template parameters Fn / Fn2 /
 * Fn3 are declared in the template header above this chunk (not visible here).
 *
 * \param str         Start of the line buffer (not required to be
 *                    NUL-terminated; only `len` bytes are read).
 * \param len         Number of bytes in the line.
 * \param add_token   Callback add_token(parsed_begin, parsed_len,
 *                    raw_begin, raw_len); returning false aborts tokenization.
 * \param lookahead   Callback lookahead(&buf, bufend) used to speculatively
 *                    parse bracketed values ('[' lists / '{' dicts) with the
 *                    full parser; advances buf past the value on success.
 * \param canceltoken Callback to undo a token emitted by a lookahead that
 *                    turned out not to be followed by a delimiter.
 *
 * \returns true on success; false on failure, with tokenizer_impl_fail_pos
 *          set to the byte offset in `str` where parsing failed.
 *
 * NOTE(review): this function relies on macros defined elsewhere in the file
 * (DELIMITER_TEST, BEGIN_FIELD, END_FIELD, PUSH_CHAR, END_LINE).  Those macros
 * presumably manipulate the locals declared below (buf, good, keep_parsing,
 * field_buffer_len, raw_field_begin, delimiter_begin/end, ...) and may contain
 * control flow of their own -- confirm against the macro definitions before
 * modifying this body.
 */
bool csv_line_tokenizer::tokenize_line_impl(char* str,
                                            size_t len,
                                            Fn add_token,
                                            Fn2 lookahead,
                                            Fn3 canceltoken) {
  ASSERT_MSG(parser, "Uninitialized tokenizer.");
  // Cursor and end-of-buffer sentinel over the raw line.
  const char* buf = &(str[0]);
  const char* bufend= buf + len;
  // Start of the current field in the raw (unparsed) input; reported to
  // add_token alongside the parsed field buffer.
  const char* raw_field_begin = buf;
  // Unused in the visible code; presumably consumed by the DELIMITER_TEST
  // macro to match multi-character delimiters -- TODO confirm.
  const char* delimiter_begin = delimiter.c_str();
  const char* delimiter_end = delimiter_begin + delimiter.length();
  // `good` is never cleared in the visible code; presumably one of the macros
  // (PUSH_CHAR / END_FIELD?) sets it false on failure -- TODO confirm.
  bool good = true;
  // Cleared (presumably by END_LINE) to exit the main loop early.
  bool keep_parsing = true;
  // we switched state to start_field by encountering a delimiter
  bool start_field_with_delimiter_encountered = false;
  // this is set to true for the character immediately after an escape character
  // and false all other times
  // NOTE(review): nothing in this visible chunk ever sets escape_sequence to
  // true; the escape-character handling presumably lives in one of the macros
  // -- confirm before assuming it is dead.
  bool escape_sequence = false;
  // -1 means "no failure"; overwritten with the failing byte offset below.
  tokenizer_impl_fail_pos = -1;
  tokenizer_state state = tokenizer_state::START_FIELD;
  field_buffer_len = 0;
  // Degenerate configuration: the delimiter IS the newline, so the whole
  // line is a single token.  Emit it verbatim and return immediately.
  if (delimiter_is_new_line) {
    add_token(str, len, str, len);
    return true;
  }

  // this is adaptive. It can be either " or ' as we encounter it

  // Main scan loop: one iteration per input character (or per delimiter,
  // which may span multiple characters).
  while(keep_parsing && buf != bufend) {
    // Next character in file
    bool is_delimiter = DELIMITER_TEST();
    // since escape_sequence can only be true for one character after it is
    // set to true. I need a flag here. if reset_escape_sequence is true, the
    // at the end of the loop, I clear escape_sequence
    bool reset_escape_sequence = escape_sequence;
    // skip to the last character of the delimiter
    if (is_delimiter) buf += delimiter.length() - 1;

    // Consume the current character and advance the cursor.
    char c = *buf++;
    switch(state) {

     case tokenizer_state::START_FIELD:
       /* expecting field */
       // clear the flag
       // Remember where this field starts in the raw input (buf was already
       // advanced past c, hence the -1).
       raw_field_begin = buf-1;
       if (c == quote_char) {
         // start quoted field
         start_field_with_delimiter_encountered = false;
         if (preserve_quoting == false) {
           // Normal CSV quoting: enter the quoted-field state so delimiters
           // inside the quotes are treated as literal characters.
           BEGIN_FIELD();
           PUSH_CHAR(c);
           state = tokenizer_state::IN_QUOTED_FIELD;
         } else {
           // preserve_quoting: the quote char has no special meaning; treat
           // it as an ordinary first character of an unquoted field.
           BEGIN_FIELD();
           PUSH_CHAR(c);
           state = tokenizer_state::IN_FIELD;
         }
       } else if (is_space_but_not_tab(c) && skip_initial_space) {
         // do nothing
       } else if (is_delimiter) {
         /* save empty field */
         // Two adjacent delimiters: emit an empty token.
         start_field_with_delimiter_encountered = true;
         // advance buffer
         BEGIN_FIELD();
         END_FIELD();
         // otherwise if we are joining consecutive delimiters, do nothing
       } else if (has_comment_char && c == comment_char) {
         // comment line
         // Comment at the start of a field terminates the entire line.
         start_field_with_delimiter_encountered = false;
         END_LINE();
       } else if (c == '[' || c == '{') {
         // Possible bracketed value (list/dict).  Try a full recursive parse
         // via the lookahead callback; on any mismatch we rewind and fall
         // through to treating '['/'{' as a regular character.
         const char* prev = buf;
         start_field_with_delimiter_encountered = false;
         buf--; // shift back so we are on top of the bracketing character
         if (lookahead(&buf, bufend)) {
           // ok we have successfully parsed a field.
           // drop whitespace
           // NOTE(review): std::isspace(*buf) with a plain char is undefined
           // behavior when *buf is negative (non-ASCII bytes on platforms
           // with signed char); should arguably cast to unsigned char.
           while(buf < bufend && std::isspace(*buf)) ++buf;
           if (buf == bufend) {
             // Bracketed value ran to end of line: accept it as the last field.
             continue;
           } else if (DELIMITER_TEST()) {
             start_field_with_delimiter_encountered = true;
             // skip past the delimiter
             buf += delimiter.length();
             continue;
           } else if(delimiter_is_space_but_not_tab) {
             // the lookahead parser may absorb whitespace
             // so if the delimiter is a whitespace, we immediately
             // advance to the next field
             continue;
           } else {
             // bad. the lookahead picked up a whole field. But
             // we do not see a delimiter.
             // fail the lookahead
             canceltoken();
             buf = prev;
             goto REGULAR_CHARACTER;
           }
         } else {
           buf = prev;
           // interpret as a regular character
           goto REGULAR_CHARACTER;
         }
       } else {
// Shared tail for "this character begins an ordinary unquoted field";
// also the rewind target for failed bracket lookaheads above.
REGULAR_CHARACTER:
         start_field_with_delimiter_encountered = false;
         /* begin new unquoted field */
         PUSH_CHAR(c);
         state = tokenizer_state::IN_FIELD;
       }
       break;

     case tokenizer_state::IN_FIELD:
       /* in unquoted field */
       if (is_delimiter) {
         // End of field. End of line not reached yet
         END_FIELD();
         // advance buffer
         start_field_with_delimiter_encountered = true;
         state = tokenizer_state::START_FIELD;
       } else if (has_comment_char && c == comment_char) {
         // terminate this field
         // A comment mid-line ends both the current field and the line.
         END_FIELD();
         state = tokenizer_state::START_FIELD;
         END_LINE();
       } else {
         /* normal character - save in field */
         PUSH_CHAR(c);
       }
       break;

     case tokenizer_state::IN_QUOTED_FIELD:
       /* in quoted field */
       // Delimiters and comment chars are literal in here; only an unescaped
       // quote character can end the quoted section.
       if (c == quote_char && !escape_sequence) {
         if (double_quote) {
           /* doublequote; " represented by "" */
           // look ahead one character
           // we are committed to preserving the buffer *exactly* here
           // so push two quotes
           // NOTE(review): the guard checks buf + 1 < bufend but only
           // dereferences *buf, so a doubled quote in the final two bytes of
           // the buffer is not collapsed -- confirm whether intentional.
           if (buf + 1 < bufend && *buf == quote_char) {
             PUSH_CHAR(c);
             PUSH_CHAR(c);
             ++buf;
             break;
           }
         }
         /* end of quote part of field */
         // Keep the closing quote in the buffer and return to the unquoted
         // state; END_FIELD (presumably) strips/interprets quotes later.
         PUSH_CHAR(c);
         state = tokenizer_state::IN_FIELD;
       }
       else {
         /* normal character - save in field */
         PUSH_CHAR(c);
       }
       break;
    }
    // escape_sequence only survives for exactly one character.
    if (reset_escape_sequence) escape_sequence = false;
  }
  // A macro cleared `good`: report the failure offset to the caller.
  if (!good) {
    tokenizer_impl_fail_pos = (ssize_t)(buf - str);
    return false;
  }
  // cleanup
  // Flush the final field: either we are mid-field (emit whatever has been
  // buffered), or the line ended right after a delimiter (emit one empty
  // trailing token).
  if (state != tokenizer_state::START_FIELD) {
    if (!add_token(&(field_buffer[0]), field_buffer_len, raw_field_begin, buf - raw_field_begin)) {
      tokenizer_impl_fail_pos = (ssize_t)(buf - str);
      return false;
    }
  } else {
    if (start_field_with_delimiter_encountered) {
      if (!add_token(nullptr, 0, nullptr, 0)) {
        tokenizer_impl_fail_pos = (ssize_t)(buf - str);
        return false;
      }
    }
  }
  return true;
}