in src/core/storage/sframe_data/csv_line_tokenizer.cpp [296:414]
bool csv_line_tokenizer::parse_as(char** buf, size_t len,
const char* raw, size_t rawlen,
flexible_type& out, bool recursive_parse) {
if (raw != nullptr && only_raw_string_substitutions == true &&
check_substitutions(raw, rawlen, out)) return true;
if (only_raw_string_substitutions == false &&
check_substitutions(*buf, len, out)) return true;
bool parse_success = false;
// we are trying to parse a non-string, but this actually looks like a string
// to me. it might be some other type wrapped inside quote characters
if (recursive_parse &&
out.get_type() != flex_type_enum::STRING &&
out.get_type() != flex_type_enum::UNDEFINED &&
(*buf)[0] == quote_char && (*buf)[len - 1] == quote_char) {
flexible_type tmp(flex_type_enum::STRING);
// unescape the string inplace
// skip the quote characters
char* end_of_buf = (*buf) + len;
++(*buf);
if (len > 1) len -= 2;
size_t new_length =
unescape_string(*buf, len, use_escape_char, escape_char, quote_char, double_quote);
bool ret = parse_as(buf, new_length, nullptr, 0, out, false);
(*buf) = end_of_buf;
return ret;
}
/*
* *buf does not get modified if parsing fails
* *buf gets modified if parsing succeeds
*/
switch(out.get_type()) {
case flex_type_enum::INTEGER:
std::tie(out, parse_success) = parser->int_parse((const char**)buf, len);
break;
case flex_type_enum::FLOAT:
std::tie(out, parse_success) = parser->double_parse((const char**)buf, len);
break;
case flex_type_enum::VECTOR:
std::tie(out, parse_success) = parser->vector_parse((const char**)buf, len);
break;
case flex_type_enum::STRING:
// STRING
// right trim of the buffer. The
// whitespace management of the parser already
// takes care of the left trim
{
bool is_quoted = false;
while(len > 0 && std::isspace((*buf)[len - 1])) len--;
if (len >= 2 && (*buf)[0] == quote_char && (*buf)[len - 1] == quote_char) {
out.mutable_get<flex_string>() = std::string((*buf)+1, len-2);
is_quoted = true;
} else {
out.mutable_get<flex_string>() = std::string(*buf, len);
}
parse_success = true;
if (is_quoted) {
unescape_string(out.mutable_get<flex_string>(), use_escape_char, escape_char,
quote_char, double_quote);
}
break;
}
case flex_type_enum::DICT:
std::tie(out, parse_success) = parser->dict_parse((const char**)buf, len);
break;
case flex_type_enum::LIST:
std::tie(out, parse_success) = parser->recursive_parse((const char**)buf, len);
break;
case flex_type_enum::UNDEFINED:
{
std::tie(out, parse_success) = parser->general_flexible_type_parse((const char**)buf, len);
// can we recursively parse this if it is a string?
if (recursive_parse &&
parse_success &&
out.get_type() == flex_type_enum::STRING) {
// make the string a parse buffer
const flex_string& s = out.get<flex_string>();
const char* cbegin = s.c_str();
const char* c = cbegin;
size_t clen = s.length();
// trim trailing whitespace if any. (the parser will take care of
// any whitespace before
while(clen > 0 && std::isspace(c[clen - 1])) clen--;
// try to reparse
flexible_type out2(flex_type_enum::UNDEFINED);
bool parse_success2;
std::tie(out2, parse_success2) = parser->non_string_flexible_type_parse(&c, clen);
// parse was successful and we consumed the entire buffer
// that's the output then.
if (parse_success2 && (c - cbegin) == (int)clen) {
out = out2;
}
}
}
break;
default:
parse_success = false;
return false;
}
if (!na_values.empty()) {
// if it is a string, if it matches the string that was parsed, it is also
// an na_value
if (parse_success == true && out.get_type() == flex_type_enum::STRING) {
const char* c = out.get<flex_string>().c_str();
size_t clen = out.get<flex_string>().length();
for (const auto& na_value: na_values) {
if (na_value.length() == clen && strncmp(c, na_value.c_str(), clen) == 0) {
out.reset(flex_type_enum::UNDEFINED);
parse_success = true;
break;
}
}
}
}
return parse_success;
}