bool Csv_reader::Decoder::decode()

in src/mlio/csv_reader.cc [627:755]


// Decodes the fields of one CSV row into the per-column tensors.
//
// The tokenizer is reset to `instance.bits()` and every field it yields is
// paired with the next column descriptor in [col_beg_, col_end_). A column
// whose skip flag (tuple element 3) is non-zero consumes a field without
// parsing it. For every other column the field value is handed to the
// column's parser (tuple element 4) together with the data of the next tensor
// in `state_->tensors` and `row_idx`.
//
// Parameters:
//   row_idx   index forwarded to the column parsers.
//   instance  the row to decode; also used to build diagnostic messages.
//
// Returns true if every column in the range was matched with a field and the
// tokenizer reports end-of-record. Returns false if a field was truncated
// under `treat_as_bad`, a field could not be parsed, or the number of fields
// does not match the number of columns.
//
// Throws:
//   Invalid_instance_error  if the row is bad and `state_->error_bad_example`
//                           is set.
//   std::invalid_argument   if a field was truncated and the configured
//                           `max_field_length_handling` is not a known value.
bool Csv_reader::Decoder<Col_iter>::decode(std::size_t row_idx, const Instance &instance)
{
    auto col_pos = col_beg_;

    auto tsr_pos = state_->tensors->begin();

    // Number of fields fetched from the tokenizer so far. Only needed to
    // report the actual field count when the row has surplus fields.
    std::size_t num_fields_read = 0;

    tokenizer_->reset(instance.bits());

    while (tokenizer_->next()) {
        ++num_fields_read;

        // A field was read although no column is left: the row is too long.
        // Leave the loop and let the column-count check below report it.
        if (col_pos == col_end_) {
            break;
        }

        // Check if we should skip this column. Skipped columns have no tensor,
        // so only the column iterator advances.
        if (std::get<3>(*col_pos) != 0) {
            ++col_pos;

            continue;
        }

        // Check if we truncated the field.
        if (tokenizer_->truncated()) {
            auto h = state_->reader->params_.max_field_length_handling;

            if (h == Max_field_length_handling::treat_as_bad ||
                h == Max_field_length_handling::truncate_warn) {
                const std::string &name = std::get<1>(*col_pos);

                auto msg = fmt::format(
                    "The column '{2}' of the row #{1:n} in the data store '{0}' is too long. Its truncated value is '{3:.64}'.",
                    instance.data_store().id(),
                    instance.index(),
                    name,
                    tokenizer_->value());

                if (h == Max_field_length_handling::truncate_warn) {
                    // Keep the truncated value and carry on parsing it.
                    logger::warn(msg);
                }
                else {
                    // treat_as_bad: the whole row is rejected.
                    if (state_->warn_bad_instance) {
                        logger::warn(msg);
                    }

                    if (state_->error_bad_example) {
                        throw Invalid_instance_error{msg};
                    }

                    return false;
                }
            }
            else if (h != Max_field_length_handling::truncate) {
                throw std::invalid_argument{
                    "The specified maximum field length handling is invalid."};
            }
        }

        const Parser &parser = std::get<4>(*col_pos);

        auto &dense_tensor = static_cast<Dense_tensor &>(**tsr_pos);

        Parse_result r = parser(tokenizer_->value(), dense_tensor.data(), row_idx);
        if (r == Parse_result::ok) {
            ++col_pos;
            ++tsr_pos;

            continue;
        }

        // The field could not be parsed; the message is only built when
        // somebody is going to consume it.
        if (state_->warn_bad_instance || state_->error_bad_example) {
            const std::string &name = std::get<1>(*col_pos);

            Data_type dt = std::get<2>(*col_pos);

            auto msg = fmt::format(
                "The column '{2}' of the row #{1:n} in the data store '{0}' cannot be parsed as {3}. Its string value is '{4:.64}'.",
                instance.data_store().id(),
                instance.index(),
                name,
                dt,
                tokenizer_->value());

            if (state_->warn_bad_instance) {
                logger::warn(msg);
            }

            if (state_->error_bad_example) {
                throw Invalid_instance_error{msg};
            }
        }

        return false;
    }

    // Make sure we read all columns and there are no remaining fields.
    if (col_pos == col_end_ && tokenizer_->eof()) {
        return true;
    }

    if (state_->warn_bad_instance || state_->error_bad_example) {
        std::size_t num_columns = state_->reader->column_names_.size();

        // Start from the number of fields already consumed. `col_pos` must
        // not be dereferenced once it equals `col_end_` (too many fields), so
        // in that case the locally tracked count is used. Otherwise (too few
        // fields) the value stored in tuple element 0 of the first unmatched
        // column is used, as before.
        // NOTE(review): element 0 is presumably the zero-based column
        // position, which would equal `num_fields_read` here - confirm.
        std::size_t num_actual_cols = num_fields_read;
        if (col_pos != col_end_) {
            num_actual_cols = std::get<0>(*col_pos);
        }

        // Drain the tokenizer to count whatever fields are still pending.
        while (tokenizer_->next()) {
            ++num_actual_cols;
        }

        auto msg = fmt::format(
            "The row #{1:n} in the data store '{0}' has {2:n} column(s) while it is expected to have {3:n} column(s).",
            instance.data_store().id(),
            instance.index(),
            num_actual_cols,
            num_columns);

        if (state_->warn_bad_instance) {
            logger::warn(msg);
        }

        if (state_->error_bad_example) {
            throw Invalid_instance_error{msg};
        }
    }

    return false;
}