void BinMapper::FindBin()

in src/io/bin.cpp [325:520]


  void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
                          int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
                          bool use_missing, bool zero_as_missing,
                          const std::vector<double>& forced_upper_bounds) {
    int na_cnt = 0;
    int tmp_num_sample_values = 0;
    for (int i = 0; i < num_sample_values; ++i) {
      if (!std::isnan(values[i])) {
        values[tmp_num_sample_values++] = values[i];
      }
    }
    if (!use_missing) {
      missing_type_ = MissingType::None;
    } else if (zero_as_missing) {
      missing_type_ = MissingType::Zero;
    } else {
      if (tmp_num_sample_values == num_sample_values) {
        missing_type_ = MissingType::None;
      } else {
        missing_type_ = MissingType::NaN;
        na_cnt = num_sample_values - tmp_num_sample_values;
      }
    }
    num_sample_values = tmp_num_sample_values;

    bin_type_ = bin_type;
    default_bin_ = 0;
    int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
    // find distinct_values first
    std::vector<double> distinct_values;
    std::vector<int> counts;

    std::stable_sort(values, values + num_sample_values);

    // push zero in the front
    if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) {
      distinct_values.push_back(0.0f);
      counts.push_back(zero_cnt);
    }

    if (num_sample_values > 0) {
      distinct_values.push_back(values[0]);
      counts.push_back(1);
    }

    for (int i = 1; i < num_sample_values; ++i) {
      if (!Common::CheckDoubleEqualOrdered(values[i - 1], values[i])) {
        if (values[i - 1] < 0.0f && values[i] > 0.0f) {
          distinct_values.push_back(0.0f);
          counts.push_back(zero_cnt);
        }
        distinct_values.push_back(values[i]);
        counts.push_back(1);
      } else {
        // use the large value
        distinct_values.back() = values[i];
        ++counts.back();
      }
    }

    // push zero in the back
    if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) {
      distinct_values.push_back(0.0f);
      counts.push_back(zero_cnt);
    }
    min_val_ = distinct_values.front();
    max_val_ = distinct_values.back();
    std::vector<int> cnt_in_bin;
    int num_distinct_values = static_cast<int>(distinct_values.size());
    if (bin_type_ == BinType::NumericalBin) {
      if (missing_type_ == MissingType::Zero) {
        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
                                                   min_data_in_bin, forced_upper_bounds);
        if (bin_upper_bound_.size() == 2) {
          missing_type_ = MissingType::None;
        }
      } else if (missing_type_ == MissingType::None) {
        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
                                                   min_data_in_bin, forced_upper_bounds);
      } else {
        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
                                                   min_data_in_bin, forced_upper_bounds);
        bin_upper_bound_.push_back(NaN);
      }
      num_bin_ = static_cast<int>(bin_upper_bound_.size());
      {
        cnt_in_bin.resize(num_bin_, 0);
        int i_bin = 0;
        for (int i = 0; i < num_distinct_values; ++i) {
          while (distinct_values[i] > bin_upper_bound_[i_bin] && i_bin < num_bin_ - 1) {
            ++i_bin;
          }
          cnt_in_bin[i_bin] += counts[i];
        }
        if (missing_type_ == MissingType::NaN) {
          cnt_in_bin[num_bin_ - 1] = na_cnt;
        }
      }
      CHECK_LE(num_bin_, max_bin);
    } else {
      // convert to int type first
      std::vector<int> distinct_values_int;
      std::vector<int> counts_int;
      for (size_t i = 0; i < distinct_values.size(); ++i) {
        int val = static_cast<int>(distinct_values[i]);
        if (val < 0) {
          na_cnt += counts[i];
          Log::Warning("Met negative value in categorical features, will convert it to NaN");
        } else {
          if (distinct_values_int.empty() || val != distinct_values_int.back()) {
            distinct_values_int.push_back(val);
            counts_int.push_back(counts[i]);
          } else {
            counts_int.back() += counts[i];
          }
        }
      }
      int rest_cnt = static_cast<int>(total_sample_cnt - na_cnt);
      if (rest_cnt > 0) {
        const int SPARSE_RATIO = 100;
        if (distinct_values_int.back() / SPARSE_RATIO > static_cast<int>(distinct_values_int.size())) {
          Log::Warning("Met categorical feature which contains sparse values. "
                       "Consider renumbering to consecutive integers started from zero");
        }
        // sort by counts
        Common::SortForPair<int, int>(&counts_int, &distinct_values_int, 0, true);
        // will ignore the categorical of small counts
        int cut_cnt = static_cast<int>(
            Common::RoundInt((total_sample_cnt - na_cnt) * 0.99f));
        size_t cur_cat = 0;
        categorical_2_bin_.clear();
        bin_2_categorical_.clear();
        int used_cnt = 0;
        int distinct_cnt = static_cast<int>(distinct_values_int.size());
        if (na_cnt > 0) {
          ++distinct_cnt;
        }
        max_bin = std::min(distinct_cnt, max_bin);
        cnt_in_bin.clear();

        // Push the dummy bin for NaN
        bin_2_categorical_.push_back(-1);
        categorical_2_bin_[-1] = 0;
        cnt_in_bin.push_back(0);
        num_bin_ = 1;
        while (cur_cat < distinct_values_int.size()
               && (used_cnt < cut_cnt || num_bin_ < max_bin)) {
          if (counts_int[cur_cat] < min_data_in_bin && cur_cat > 1) {
            break;
          }
          bin_2_categorical_.push_back(distinct_values_int[cur_cat]);
          categorical_2_bin_[distinct_values_int[cur_cat]] = static_cast<unsigned int>(num_bin_);
          used_cnt += counts_int[cur_cat];
          cnt_in_bin.push_back(counts_int[cur_cat]);
          ++num_bin_;
          ++cur_cat;
        }
        // Use MissingType::None to represent this bin contains all categoricals
        if (cur_cat == distinct_values_int.size() && na_cnt == 0) {
          missing_type_ = MissingType::None;
        } else {
          missing_type_ = MissingType::NaN;
        }
        // fix count of NaN bin
        cnt_in_bin[0] = static_cast<int>(total_sample_cnt - used_cnt);
      }
    }

    // check trivial(num_bin_ == 1) feature
    if (num_bin_ <= 1) {
      is_trivial_ = true;
    } else {
      is_trivial_ = false;
    }
    // check useless bin
    if (!is_trivial_ && pre_filter && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
      is_trivial_ = true;
    }

    if (!is_trivial_) {
      default_bin_ = ValueToBin(0);
      most_freq_bin_ =
          static_cast<uint32_t>(ArrayArgs<int>::ArgMax(cnt_in_bin));
      const double max_sparse_rate =
          static_cast<double>(cnt_in_bin[most_freq_bin_]) / total_sample_cnt;
      // When most_freq_bin_ != default_bin_, there are some additional data loading costs.
      // so use most_freq_bin_  = default_bin_ when there is not so sparse
      if (most_freq_bin_ != default_bin_ && max_sparse_rate < kSparseThreshold) {
        most_freq_bin_ = default_bin_;
      }
      sparse_rate_ =
          static_cast<double>(cnt_in_bin[most_freq_bin_]) / total_sample_cnt;
    } else {
      sparse_rate_ = 1.0f;
    }
  }