void MultiValBinWrapper::CopyMultiValBinSubset()

in src/io/train_share_states.cpp [98:249]


void MultiValBinWrapper::CopyMultiValBinSubset(
  const std::vector<int>& group_feature_start,
  const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups,
  const std::vector<int8_t>& is_feature_used,
  const data_size_t* bagging_use_indices,
  data_size_t bagging_indices_cnt) {
  double sum_used_dense_ratio = 0.0;
  double sum_dense_ratio = 0.0;
  int num_used = 0;
  int total = 0;
  std::vector<int> used_feature_index;
  for (int i : feature_groups_contained_) {
    int f_start = group_feature_start[i];
    if (feature_groups[i]->is_multi_val_) {
      for (int j = 0; j < feature_groups[i]->num_feature_; ++j) {
        const auto dense_rate =
            1.0 - feature_groups[i]->bin_mappers_[j]->sparse_rate();
        if (is_feature_used[f_start + j]) {
          ++num_used;
          used_feature_index.push_back(total);
          sum_used_dense_ratio += dense_rate;
        }
        sum_dense_ratio += dense_rate;
        ++total;
      }
    } else {
      bool is_group_used = false;
      double dense_rate = 0;
      for (int j = 0; j < feature_groups[i]->num_feature_; ++j) {
        if (is_feature_used[f_start + j]) {
          is_group_used = true;
        }
        dense_rate += 1.0 - feature_groups[i]->bin_mappers_[j]->sparse_rate();
      }
      if (is_group_used) {
        ++num_used;
        used_feature_index.push_back(total);
        sum_used_dense_ratio += dense_rate;
      }
      sum_dense_ratio += dense_rate;
      ++total;
    }
  }
  const double k_subfeature_threshold = 0.6;
  if (sum_used_dense_ratio >= sum_dense_ratio * k_subfeature_threshold) {
    // only need to copy subset
    if (is_use_subrow_ && !is_subrow_copied_) {
      if (multi_val_bin_subset_ == nullptr) {
        multi_val_bin_subset_.reset(multi_val_bin_->CreateLike(
            bagging_indices_cnt, multi_val_bin_->num_bin(), total,
            multi_val_bin_->num_element_per_row(), multi_val_bin_->offsets()));
      } else {
        multi_val_bin_subset_->ReSize(
            bagging_indices_cnt, multi_val_bin_->num_bin(), total,
            multi_val_bin_->num_element_per_row(), multi_val_bin_->offsets());
      }
      multi_val_bin_subset_->CopySubrow(
          multi_val_bin_.get(), bagging_use_indices,
          bagging_indices_cnt);
      // avoid to copy subset many times
      is_subrow_copied_ = true;
    }
  } else {
    is_use_subcol_ = true;
    std::vector<uint32_t> upper_bound;
    std::vector<uint32_t> lower_bound;
    std::vector<uint32_t> delta;
    std::vector<uint32_t> offsets;
    hist_move_src_.clear();
    hist_move_dest_.clear();
    hist_move_size_.clear();

    const int offset = multi_val_bin_->IsSparse() ? 1 : 0;
    int num_total_bin = offset;
    int new_num_total_bin = offset;
    offsets.push_back(static_cast<uint32_t>(new_num_total_bin));
    for (int i : feature_groups_contained_) {
      int f_start = group_feature_start[i];
      if (feature_groups[i]->is_multi_val_) {
        for (int j = 0; j < feature_groups[i]->num_feature_; ++j) {
          const auto& bin_mapper = feature_groups[i]->bin_mappers_[j];
          if (i == 0 && j == 0 && bin_mapper->GetMostFreqBin() > 0) {
            num_total_bin = 1;
          }
          int cur_num_bin = bin_mapper->num_bin();
          if (bin_mapper->GetMostFreqBin() == 0) {
            cur_num_bin -= offset;
          }
          num_total_bin += cur_num_bin;
          if (is_feature_used[f_start + j]) {
            new_num_total_bin += cur_num_bin;
            offsets.push_back(static_cast<uint32_t>(new_num_total_bin));
            lower_bound.push_back(num_total_bin - cur_num_bin);
            upper_bound.push_back(num_total_bin);

            hist_move_src_.push_back(
                (new_num_total_bin - cur_num_bin) * 2);
            hist_move_dest_.push_back((num_total_bin - cur_num_bin) *
                                                2);
            hist_move_size_.push_back(cur_num_bin * 2);
            delta.push_back(num_total_bin - new_num_total_bin);
          }
        }
      } else {
        bool is_group_used = false;
        for (int j = 0; j < feature_groups[i]->num_feature_; ++j) {
          if (is_feature_used[f_start + j]) {
            is_group_used = true;
            break;
          }
        }
        int cur_num_bin = feature_groups[i]->bin_offsets_.back() - offset;
        num_total_bin += cur_num_bin;
        if (is_group_used) {
          new_num_total_bin += cur_num_bin;
          offsets.push_back(static_cast<uint32_t>(new_num_total_bin));
          lower_bound.push_back(num_total_bin - cur_num_bin);
          upper_bound.push_back(num_total_bin);

          hist_move_src_.push_back(
              (new_num_total_bin - cur_num_bin) * 2);
          hist_move_dest_.push_back((num_total_bin - cur_num_bin) *
                                              2);
          hist_move_size_.push_back(cur_num_bin * 2);
          delta.push_back(num_total_bin - new_num_total_bin);
        }
      }
    }
    // avoid out of range
    lower_bound.push_back(num_total_bin);
    upper_bound.push_back(num_total_bin);
    data_size_t num_data = is_use_subrow_ ? bagging_indices_cnt : num_data_;
    if (multi_val_bin_subset_ == nullptr) {
      multi_val_bin_subset_.reset(multi_val_bin_->CreateLike(
          num_data, new_num_total_bin, num_used, sum_used_dense_ratio, offsets));
    } else {
      multi_val_bin_subset_->ReSize(num_data, new_num_total_bin,
                                              num_used, sum_used_dense_ratio, offsets);
    }
    if (is_use_subrow_) {
      multi_val_bin_subset_->CopySubrowAndSubcol(
          multi_val_bin_.get(), bagging_use_indices,
          bagging_indices_cnt, used_feature_index, lower_bound,
          upper_bound, delta);
      // may need to recopy subset
      is_subrow_copied_ = false;
    } else {
      multi_val_bin_subset_->CopySubcol(
          multi_val_bin_.get(), used_feature_index, lower_bound, upper_bound, delta);
    }
  }
}