bool csv_line_tokenizer::parse_as()

in src/core/storage/sframe_data/csv_line_tokenizer.cpp [296:414]


bool csv_line_tokenizer::parse_as(char** buf, size_t len,
                                  const char* raw, size_t rawlen,
                                  flexible_type& out, bool recursive_parse) {
  if (raw != nullptr && only_raw_string_substitutions == true &&
      check_substitutions(raw, rawlen, out)) return true;

  if (only_raw_string_substitutions == false &&
      check_substitutions(*buf, len, out)) return true;

  bool parse_success = false;
  // we are trying to parse a non-string, but this actually looks like a string
  // to me.  it might be some other type wrapped inside quote characters
  if (recursive_parse &&
      out.get_type() != flex_type_enum::STRING &&
      out.get_type() != flex_type_enum::UNDEFINED &&
      (*buf)[0] == quote_char && (*buf)[len - 1] == quote_char) {
    flexible_type tmp(flex_type_enum::STRING);
    // unescape the string inplace
    // skip the quote characters
    char* end_of_buf = (*buf) + len;
    ++(*buf);
    if (len > 1) len -= 2;
    size_t new_length =
        unescape_string(*buf, len, use_escape_char, escape_char, quote_char, double_quote);
    bool ret = parse_as(buf, new_length, nullptr, 0, out, false);
    (*buf) = end_of_buf;
    return ret;
  }

  /*
   *  *buf does not get modified if parsing fails
   *  *buf gets modified if parsing succeeds
   */
  switch(out.get_type()) {
   case flex_type_enum::INTEGER:
     std::tie(out, parse_success) = parser->int_parse((const char**)buf, len);
     break;
   case flex_type_enum::FLOAT:
     std::tie(out, parse_success) = parser->double_parse((const char**)buf, len);
     break;
   case flex_type_enum::VECTOR:
     std::tie(out, parse_success) = parser->vector_parse((const char**)buf, len);
     break;
   case flex_type_enum::STRING:
     // STRING
     // right trim of the buffer. The
     // whitespace management of the parser already
     // takes care of the left trim
     {
       bool is_quoted = false;
       while(len > 0 && std::isspace((*buf)[len - 1])) len--;
       if (len >= 2 && (*buf)[0] == quote_char && (*buf)[len - 1] == quote_char) {
         out.mutable_get<flex_string>() = std::string((*buf)+1, len-2);
         is_quoted = true;
       } else {
         out.mutable_get<flex_string>() = std::string(*buf, len);
       }
       parse_success = true;
       if (is_quoted) {
         unescape_string(out.mutable_get<flex_string>(), use_escape_char, escape_char,
                         quote_char, double_quote);
       }
       break;
     }
   case flex_type_enum::DICT:
     std::tie(out, parse_success) = parser->dict_parse((const char**)buf, len);
     break;
   case flex_type_enum::LIST:
     std::tie(out, parse_success) = parser->recursive_parse((const char**)buf, len);
     break;
   case flex_type_enum::UNDEFINED:
     {
       std::tie(out, parse_success) = parser->general_flexible_type_parse((const char**)buf, len);
       // can we recursively parse this if it is a string?
       if (recursive_parse &&
           parse_success &&
           out.get_type() == flex_type_enum::STRING) {
         // make the string a parse buffer
         const flex_string& s = out.get<flex_string>();
         const char* cbegin = s.c_str();
         const char* c = cbegin;
         size_t clen = s.length();
         // trim trailing whitespace if any. (the parser will take care of
         // any whitespace before
         while(clen > 0 && std::isspace(c[clen - 1])) clen--;
         // try to reparse
         flexible_type out2(flex_type_enum::UNDEFINED);
         bool parse_success2;
         std::tie(out2, parse_success2) = parser->non_string_flexible_type_parse(&c, clen);
         // parse was successful and we consumed the entire buffer
         // that's the output then.
         if (parse_success2 && (c - cbegin) == (int)clen) {
           out = out2;
         }
       }
     }
     break;
   default:
     parse_success = false;
     return false;
  }

  if (!na_values.empty()) {
    // if it is a string, if it matches the string that was parsed, it is also
    // an na_value
    if (parse_success == true && out.get_type() == flex_type_enum::STRING) {
      const char* c = out.get<flex_string>().c_str();
      size_t clen = out.get<flex_string>().length();
      for (const auto& na_value: na_values) {
        if (na_value.length() == clen && strncmp(c, na_value.c_str(), clen) == 0) {
          out.reset(flex_type_enum::UNDEFINED);
          parse_success = true;
          break;
        }
      }
    }
  }
  return parse_success;
}