in src/mlio/csv_reader.cc [627:755]
// Decodes a single CSV record into the tensors held by the decoder state.
//
// The tokenizer is reset to the bits of `instance` and its fields are paired,
// in order, with the columns in [col_beg_, col_end_). Columns flagged to be
// skipped consume a field but no tensor. Every other field is handed to its
// column's parser together with the data of the next tensor in
// state_->tensors and `row_idx`.
//
// A truncated field is treated according to the reader's
// max_field_length_handling parameter:
//   - truncate:      the truncated value is parsed as is;
//   - truncate_warn: a warning is logged, then the truncated value is parsed;
//   - treat_as_bad:  the instance is rejected.
//
// Returns true only if every column was consumed and the tokenizer reports
// end of record. Returns false if a field was rejected as too long, a field
// could not be parsed, or the number of fields does not match the number of
// columns.
//
// Throws Invalid_instance_error instead of returning false when
// state_->error_bad_example is set, and std::invalid_argument when a field is
// truncated and max_field_length_handling holds none of the values above.
bool Csv_reader::Decoder<Col_iter>::decode(std::size_t row_idx, const Instance &instance)
{
    auto col_pos = col_beg_;
    auto tsr_pos = state_->tensors->begin();

    // Number of fields pulled from the tokenizer so far. It is counted here,
    // rather than read back from the column tuple, so that the column-count
    // diagnostic below never has to dereference a past-the-end iterator.
    std::size_t num_fields_read = 0;

    // Diagnostic messages are only worth formatting if somebody consumes them.
    const bool should_report = state_->warn_bad_instance || state_->error_bad_example;

    // Logs and/or throws the given message as configured for bad instances.
    auto report_bad_instance = [this](const std::string &msg) {
        if (state_->warn_bad_instance) {
            logger::warn(msg);
        }
        if (state_->error_bad_example) {
            throw Invalid_instance_error{msg};
        }
    };

    tokenizer_->reset(instance.bits());

    while (tokenizer_->next()) {
        ++num_fields_read;

        // The record has more fields than columns; reported after the loop.
        if (col_pos == col_end_) {
            break;
        }

        // Check if we should skip this column.
        if (std::get<3>(*col_pos) != 0) {
            ++col_pos;
            continue;
        }

        // Check if we truncated the field.
        if (tokenizer_->truncated()) {
            auto h = state_->reader->params_.max_field_length_handling;

            // Built lazily: with treat_as_bad and no reporting enabled the
            // message would otherwise be formatted and thrown away.
            auto make_truncation_msg = [&] {
                const std::string &name = std::get<1>(*col_pos);
                return fmt::format(
                    "The column '{2}' of the row #{1:n} in the data store '{0}' is too long. Its truncated value is '{3:.64}'.",
                    instance.data_store().id(),
                    instance.index(),
                    name,
                    tokenizer_->value());
            };

            if (h == Max_field_length_handling::truncate_warn) {
                // Warn, then fall through and parse the truncated value.
                auto msg = make_truncation_msg();
                logger::warn(msg);
            }
            else if (h == Max_field_length_handling::treat_as_bad) {
                if (should_report) {
                    auto msg = make_truncation_msg();
                    report_bad_instance(msg);
                }
                return false;
            }
            else if (h != Max_field_length_handling::truncate) {
                throw std::invalid_argument{
                    "The specified maximum field length handling is invalid."};
            }
        }

        const Parser &parser = std::get<4>(*col_pos);
        auto &dense_tensor = static_cast<Dense_tensor &>(**tsr_pos);

        Parse_result r = parser(tokenizer_->value(), dense_tensor.data(), row_idx);
        if (r != Parse_result::ok) {
            if (should_report) {
                const std::string &name = std::get<1>(*col_pos);
                Data_type dt = std::get<2>(*col_pos);
                auto msg = fmt::format(
                    "The column '{2}' of the row #{1:n} in the data store '{0}' cannot be parsed as {3}. Its string value is '{4:.64}'.",
                    instance.data_store().id(),
                    instance.index(),
                    name,
                    dt,
                    tokenizer_->value());
                report_bad_instance(msg);
            }
            return false;
        }

        // Only parsed (non-skipped) columns own a tensor, so the tensor
        // iterator advances here and not in the skip branch above.
        ++col_pos;
        ++tsr_pos;
    }

    // Make sure we read all columns and there are no remaining fields.
    if (col_pos == col_end_ && tokenizer_->eof()) {
        return true;
    }

    if (should_report) {
        std::size_t num_columns = state_->reader->column_names_.size();

        // Fields already consumed by the loop above (including the surplus
        // field that triggered the break, if any) plus whatever is left in
        // the record.
        std::size_t num_actual_cols = num_fields_read;
        while (tokenizer_->next()) {
            ++num_actual_cols;
        }

        auto msg = fmt::format(
            "The row #{1:n} in the data store '{0}' has {2:n} column(s) while it is expected to have {3:n} column(s).",
            instance.data_store().id(),
            instance.index(),
            num_actual_cols,
            num_columns);
        report_bad_instance(msg);
    }

    return false;
}