in src/mlio-py/mlio/contrib/insights/column_analyzer.cc [40:167]
// Folds every cell of every feature tensor in `example` into the running
// per-column statistics stored in `*columns_`.
//
// The i-th feature of `example` updates `(*columns_)[i]`. Each feature is
// statically cast to a dense tensor and its data is read as strings; no
// runtime type check is performed here.
//
// For every cell the function updates, in order:
//   - row count and the first value seen for the column;
//   - string statistics (empty count, min/max length, cardinality estimators,
//     word count, whitespace-only count, null-like count);
//   - numeric statistics (nan/numeric/finite/int counts, finite min/max, a
//     bounded sample of finite values);
//   - optionally, a bounded map of unique values and their occurrence counts
//     for columns listed in `*capture_columns_`.
//
// Empty cells skip everything after the empty count; whitespace-only cells
// skip the null-like, numeric and capture steps.
//
// Although the method is `const`, it mutates the objects that `columns_`
// points to.
void Column_analyzer::analyze(const mlio::Example &example) const
{
    std::size_t next_feature_idx = 0;
    for (const auto &tensor : example.features()) {
        const std::size_t feature_idx = next_feature_idx++;

        const auto &dense_tsr = static_cast<const mlio::Dense_tensor &>(*tensor);
        auto cells = dense_tsr.data().as<std::string>();

        Column_analysis &stats = (*columns_)[feature_idx];

        // Whether unique values of this column should be captured. This only
        // depends on the column index, so it is computed once per column
        // rather than once per cell.
        const bool should_capture =
            capture_columns_->find(feature_idx) != capture_columns_->end();

        // Per-tensor accumulators used to update the running means below.
        // Aggregating several entries before touching the running mean avoids
        // potential numerical problems from updating it one entry at a time.
        double numeric_column_sum = 0.0;
        std::size_t numeric_column_count = 0;
        std::size_t string_column_count = 0;
        std::size_t string_column_length_sum = 0;

        for (const std::string &cell : cells) {
            // Remember the very first value seen for this column.
            if (stats.rows_seen == 0) {
                stats.example_value = cell;
            }
            stats.rows_seen++;

            // String analyzers.
            if (cell.empty()) {
                stats.str_empty_count++;
                // All other analysis is irrelevant if we have an empty string.
                continue;
            }

            const std::size_t cell_length = cell.size();
            stats.str_min_length = std::min(stats.str_min_length, cell_length);
            stats.str_max_length = std::max(stats.str_max_length, cell_length);
            stats.str_min_length_not_empty =
                std::min(stats.str_min_length_not_empty, cell_length);

            string_column_length_sum += cell_length;
            string_column_count++;

            stats.str_cardinality_estimator_.add(cell);

            // Split on single spaces. Consecutive spaces yield empty tokens,
            // which are counted as words as well.
            std::istringstream iss(cell);
            std::string token;
            while (std::getline(iss, token, ' ')) {
                stats.str_vocab_cardinality_estimator_.add(token);
                stats.str_num_words++;
            }

            if (mlio::is_whitespace_only(cell)) {
                stats.str_only_whitespace_count++;
                // All other analysis is irrelevant if we only have whitespace.
                continue;
            }

            if (match_nan_values(cell, *null_like_values_)) {
                stats.str_null_like_count++;
            }

            // Numeric analyzers.
            double as_float{};
            if (mlio::try_parse_float(cell, as_float) != mlio::Parse_result::ok ||
                std::isnan(as_float)) {
                stats.numeric_nan_count++;
            }
            else {
                stats.numeric_count++;

                // NaN was excluded above, so this only filters out +/-inf.
                if (std::isfinite(as_float)) {
                    stats.numeric_finite_count++;

                    numeric_column_sum += as_float;
                    numeric_column_count++;

                    if (stats.numeric_column_sample_.size() < max_sample_size) {
                        stats.numeric_column_sample_.push_back(as_float);
                    }

                    // Treat values within 1e-5 of a whole number as integers.
                    if (std::abs(std::round(as_float) - as_float) <= 1.0e-5) {
                        stats.numeric_int_count++;
                    }

                    // The isnan checks handle a min/max that has not been set
                    // to a real value yet.
                    if (std::isnan(stats.numeric_finite_min) ||
                        as_float < stats.numeric_finite_min) {
                        stats.numeric_finite_min = as_float;
                    }
                    if (std::isnan(stats.numeric_finite_max) ||
                        as_float > stats.numeric_finite_max) {
                        stats.numeric_finite_max = as_float;
                    }
                }
            }

            // Capture the values if specified.
            if (should_capture && !stats.str_captured_unique_values_overflowed) {
                auto &captured = stats.str_captured_unique_values;

                auto found = captured.find(cell);
                if (found != captured.end()) {
                    // Already captured: keep counting even when the map is
                    // full, otherwise occurrences seen after the limit is
                    // reached would be silently dropped.
                    ++found->second;
                }
                else if (captured.size() < max_capture_count_) {
                    ++captured[cell];
                }
                else {
                    // The value isn't present and we can't add it because
                    // we're at the limit, so flag that we have overflowed.
                    stats.str_captured_unique_values_overflowed = true;
                }
            }
        }

        // Update the mean of numeric values based on the entire range of
        // values. Skipped when this tensor contributed no finite value:
        // 0.0 / 0.0 would otherwise yield NaN and poison the running mean.
        if (numeric_column_count > 0) {
            auto ncc = static_cast<double>(numeric_column_count);
            auto nfc = static_cast<double>(stats.numeric_finite_count);

            double numeric_column_mean = numeric_column_sum / ncc;
            stats.numeric_finite_mean +=
                (numeric_column_mean - stats.numeric_finite_mean) * ncc / nfc;
        }

        // Update the average length of string values based on the entire
        // range of values. Skipped when this tensor contributed no non-empty
        // cell, for the same NaN reason as above.
        //
        // NOTE(review): the weight uses the number of non-empty cells in this
        // tensor while the denominator (`rows_seen`) also counts empty cells;
        // confirm whether the average is meant to cover all rows or only
        // non-empty ones.
        if (string_column_count > 0) {
            auto scc = static_cast<double>(string_column_count);
            auto rows_seen = static_cast<double>(stats.rows_seen);

            double string_column_avg_length =
                static_cast<double>(string_column_length_sum) / scc;
            stats.str_avg_length +=
                (string_column_avg_length - stats.str_avg_length) * scc / rows_seen;
        }
    }
}