in src/io/bin.cpp [325:520]
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
bool use_missing, bool zero_as_missing,
const std::vector<double>& forced_upper_bounds) {
int na_cnt = 0;
int tmp_num_sample_values = 0;
for (int i = 0; i < num_sample_values; ++i) {
if (!std::isnan(values[i])) {
values[tmp_num_sample_values++] = values[i];
}
}
if (!use_missing) {
missing_type_ = MissingType::None;
} else if (zero_as_missing) {
missing_type_ = MissingType::Zero;
} else {
if (tmp_num_sample_values == num_sample_values) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
na_cnt = num_sample_values - tmp_num_sample_values;
}
}
num_sample_values = tmp_num_sample_values;
bin_type_ = bin_type;
default_bin_ = 0;
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
std::stable_sort(values, values + num_sample_values);
// push zero in the front
if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
if (num_sample_values > 0) {
distinct_values.push_back(values[0]);
counts.push_back(1);
}
for (int i = 1; i < num_sample_values; ++i) {
if (!Common::CheckDoubleEqualOrdered(values[i - 1], values[i])) {
if (values[i - 1] < 0.0f && values[i] > 0.0f) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
distinct_values.push_back(values[i]);
counts.push_back(1);
} else {
// use the large value
distinct_values.back() = values[i];
++counts.back();
}
}
// push zero in the back
if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
} else {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
min_data_in_bin, forced_upper_bounds);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
int i_bin = 0;
for (int i = 0; i < num_distinct_values; ++i) {
while (distinct_values[i] > bin_upper_bound_[i_bin] && i_bin < num_bin_ - 1) {
++i_bin;
}
cnt_in_bin[i_bin] += counts[i];
}
if (missing_type_ == MissingType::NaN) {
cnt_in_bin[num_bin_ - 1] = na_cnt;
}
}
CHECK_LE(num_bin_, max_bin);
} else {
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
for (size_t i = 0; i < distinct_values.size(); ++i) {
int val = static_cast<int>(distinct_values[i]);
if (val < 0) {
na_cnt += counts[i];
Log::Warning("Met negative value in categorical features, will convert it to NaN");
} else {
if (distinct_values_int.empty() || val != distinct_values_int.back()) {
distinct_values_int.push_back(val);
counts_int.push_back(counts[i]);
} else {
counts_int.back() += counts[i];
}
}
}
int rest_cnt = static_cast<int>(total_sample_cnt - na_cnt);
if (rest_cnt > 0) {
const int SPARSE_RATIO = 100;
if (distinct_values_int.back() / SPARSE_RATIO > static_cast<int>(distinct_values_int.size())) {
Log::Warning("Met categorical feature which contains sparse values. "
"Consider renumbering to consecutive integers started from zero");
}
// sort by counts
Common::SortForPair<int, int>(&counts_int, &distinct_values_int, 0, true);
// will ignore the categorical of small counts
int cut_cnt = static_cast<int>(
Common::RoundInt((total_sample_cnt - na_cnt) * 0.99f));
size_t cur_cat = 0;
categorical_2_bin_.clear();
bin_2_categorical_.clear();
int used_cnt = 0;
int distinct_cnt = static_cast<int>(distinct_values_int.size());
if (na_cnt > 0) {
++distinct_cnt;
}
max_bin = std::min(distinct_cnt, max_bin);
cnt_in_bin.clear();
// Push the dummy bin for NaN
bin_2_categorical_.push_back(-1);
categorical_2_bin_[-1] = 0;
cnt_in_bin.push_back(0);
num_bin_ = 1;
while (cur_cat < distinct_values_int.size()
&& (used_cnt < cut_cnt || num_bin_ < max_bin)) {
if (counts_int[cur_cat] < min_data_in_bin && cur_cat > 1) {
break;
}
bin_2_categorical_.push_back(distinct_values_int[cur_cat]);
categorical_2_bin_[distinct_values_int[cur_cat]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat];
cnt_in_bin.push_back(counts_int[cur_cat]);
++num_bin_;
++cur_cat;
}
// Use MissingType::None to represent this bin contains all categoricals
if (cur_cat == distinct_values_int.size() && na_cnt == 0) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
}
// fix count of NaN bin
cnt_in_bin[0] = static_cast<int>(total_sample_cnt - used_cnt);
}
}
// check trivial(num_bin_ == 1) feature
if (num_bin_ <= 1) {
is_trivial_ = true;
} else {
is_trivial_ = false;
}
// check useless bin
if (!is_trivial_ && pre_filter && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
is_trivial_ = true;
}
if (!is_trivial_) {
default_bin_ = ValueToBin(0);
most_freq_bin_ =
static_cast<uint32_t>(ArrayArgs<int>::ArgMax(cnt_in_bin));
const double max_sparse_rate =
static_cast<double>(cnt_in_bin[most_freq_bin_]) / total_sample_cnt;
// When most_freq_bin_ != default_bin_, there are some additional data loading costs.
// so use most_freq_bin_ = default_bin_ when there is not so sparse
if (most_freq_bin_ != default_bin_ && max_sparse_rate < kSparseThreshold) {
most_freq_bin_ = default_bin_;
}
sparse_rate_ =
static_cast<double>(cnt_in_bin[most_freq_bin_]) / total_sample_cnt;
} else {
sparse_rate_ = 1.0f;
}
}