void Column_analyzer::analyze()

in src/mlio-py/mlio/contrib/insights/column_analyzer.cc [40:167]


// Folds every cell of every feature in `example` into the running
// per-column statistics stored in `*columns_`.
//
// Each feature tensor is accessed as a dense tensor and its data is read as
// strings; the feature's position in `example.features()` selects the
// `Column_analysis` entry that gets updated. For each cell the function
// updates string statistics (empty/whitespace counts, min/max length,
// cardinality estimators, word counts), null-like matching, numeric
// statistics (counts, min/max, sample, integer-likeness) and, for columns
// listed in `*capture_columns_`, a bounded histogram of unique values.
//
// Means (numeric mean and average string length) are accumulated per call and
// merged into the running values once per column after all cells were seen.
void Column_analyzer::analyze(const mlio::Example &example) const
{
    std::size_t feature_idx = 0;

    for (const auto &tensor : example.features()) {
        // NOTE(review): unchecked downcast; assumes every feature is a dense
        // tensor holding strings - confirm against the reader configuration.
        const auto &dense_tsr = static_cast<const mlio::Dense_tensor &>(*tensor);
        auto cells = dense_tsr.data().as<std::string>();

        Column_analysis &stats = (*columns_)[feature_idx];

        // Whether values of this column should be captured is a property of
        // the column, not of the cell, so look it up once.
        const bool should_capture =
            capture_columns_->find(feature_idx) != capture_columns_->end();

        // Used to update the mean.
        // We do this here to aggregate several entries together and avoid
        // potential numerical problems updating the mean a single entry at a
        // time.
        double numeric_column_sum = 0.0;
        std::size_t numeric_column_count = 0;
        std::size_t string_column_count = 0;
        std::size_t string_column_length_sum = 0;

        for (const std::string &cell : cells) {
            // Remember the very first cell ever seen for this column.
            if (stats.rows_seen == 0) {
                stats.example_value = cell;
            }

            stats.rows_seen++;

            // String analyzers.
            if (cell.empty()) {
                stats.str_empty_count++;

                // All other analysis is irrelevant if we have an empty string.
                continue;
            }

            stats.str_min_length = std::min(stats.str_min_length, cell.size());
            stats.str_max_length = std::max(stats.str_max_length, cell.size());

            stats.str_min_length_not_empty = std::min(stats.str_min_length_not_empty, cell.size());

            string_column_length_sum += cell.size();
            string_column_count++;

            stats.str_cardinality_estimator_.add(cell);

            // Split on single spaces; every resulting token (including empty
            // tokens produced by consecutive spaces) counts as a word.
            std::istringstream iss(cell);
            std::string token;
            while (std::getline(iss, token, ' ')) {
                stats.str_vocab_cardinality_estimator_.add(token);
                stats.str_num_words++;
            }

            if (mlio::is_whitespace_only(cell)) {
                stats.str_only_whitespace_count++;

                // All other analysis is irrelevant if we only have whitespace.
                continue;
            }

            if (match_nan_values(cell, *null_like_values_)) {
                stats.str_null_like_count++;
            }

            // Numeric analyzers
            double as_float{};
            if (mlio::try_parse_float(cell, as_float) != mlio::Parse_result::ok ||
                std::isnan(as_float)) {
                stats.numeric_nan_count++;
            }
            else {
                stats.numeric_count++;

                // NaN was already excluded above, so this only filters out
                // +/- infinity.
                if (std::isfinite(as_float)) {
                    stats.numeric_finite_count++;

                    numeric_column_sum += as_float;
                    numeric_column_count++;

                    if (stats.numeric_column_sample_.size() < max_sample_size) {
                        stats.numeric_column_sample_.push_back(as_float);
                    }

                    // Treat values within 1e-5 of a whole number as integers.
                    if (std::abs(std::round(as_float) - as_float) <= 1.0e-5) {
                        stats.numeric_int_count++;
                    }

                    // The min/max are NaN until the first finite value is
                    // recorded, hence the explicit isnan checks.
                    if (std::isnan(stats.numeric_finite_min) ||
                        as_float < stats.numeric_finite_min) {
                        stats.numeric_finite_min = as_float;
                    }
                    if (std::isnan(stats.numeric_finite_max) ||
                        as_float > stats.numeric_finite_max) {
                        stats.numeric_finite_max = as_float;
                    }
                }
            }

            // Capture the values if specified.
            if (should_capture && !stats.str_captured_unique_values_overflowed) {
                if (stats.str_captured_unique_values.size() < max_capture_count_) {
                    ++stats.str_captured_unique_values[cell];
                }
                else if (stats.str_captured_unique_values.find(cell) ==
                         stats.str_captured_unique_values.end()) {
                    // If the value isn't present but we're not adding it
                    // because we're at a limit then we should flag that we
                    // have overflowed.
                    stats.str_captured_unique_values_overflowed = true;
                }
            }
        }

        // Update the mean of numeric values based on the entire range of
        // values. Skip the update when this call contributed no finite
        // values: the batch mean would be 0/0 = NaN and would permanently
        // poison the running mean.
        if (numeric_column_count > 0) {
            auto ncc = static_cast<double>(numeric_column_count);
            auto nfc = static_cast<double>(stats.numeric_finite_count);
            double numeric_column_mean = numeric_column_sum / ncc;
            stats.numeric_finite_mean +=
                (numeric_column_mean - stats.numeric_finite_mean) * ncc / nfc;
        }

        // Update average length of string values based on entire range of
        // values. As above, a call without any non-empty cell must not touch
        // the running average (0/0 would turn it into NaN).
        //
        // NOTE(review): the batch weight uses the number of non-empty cells
        // while the denominator (`rows_seen`) also counts empty cells; confirm
        // which population the average is meant to describe.
        if (string_column_count > 0) {
            auto scc = static_cast<double>(string_column_count);
            auto rows_seen = static_cast<double>(stats.rows_seen);
            double string_column_avg_length = static_cast<double>(string_column_length_sum) / scc;
            stats.str_avg_length +=
                (string_column_avg_length - stats.str_avg_length) * scc / rows_seen;
        }

        ++feature_idx;
    }
}