in src/io/train_share_states.cpp [98:249]
void MultiValBinWrapper::CopyMultiValBinSubset(
const std::vector<int>& group_feature_start,
const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups,
const std::vector<int8_t>& is_feature_used,
const data_size_t* bagging_use_indices,
data_size_t bagging_indices_cnt) {
double sum_used_dense_ratio = 0.0;
double sum_dense_ratio = 0.0;
int num_used = 0;
int total = 0;
std::vector<int> used_feature_index;
for (int i : feature_groups_contained_) {
int f_start = group_feature_start[i];
if (feature_groups[i]->is_multi_val_) {
for (int j = 0; j < feature_groups[i]->num_feature_; ++j) {
const auto dense_rate =
1.0 - feature_groups[i]->bin_mappers_[j]->sparse_rate();
if (is_feature_used[f_start + j]) {
++num_used;
used_feature_index.push_back(total);
sum_used_dense_ratio += dense_rate;
}
sum_dense_ratio += dense_rate;
++total;
}
} else {
bool is_group_used = false;
double dense_rate = 0;
for (int j = 0; j < feature_groups[i]->num_feature_; ++j) {
if (is_feature_used[f_start + j]) {
is_group_used = true;
}
dense_rate += 1.0 - feature_groups[i]->bin_mappers_[j]->sparse_rate();
}
if (is_group_used) {
++num_used;
used_feature_index.push_back(total);
sum_used_dense_ratio += dense_rate;
}
sum_dense_ratio += dense_rate;
++total;
}
}
const double k_subfeature_threshold = 0.6;
if (sum_used_dense_ratio >= sum_dense_ratio * k_subfeature_threshold) {
// only need to copy subset
if (is_use_subrow_ && !is_subrow_copied_) {
if (multi_val_bin_subset_ == nullptr) {
multi_val_bin_subset_.reset(multi_val_bin_->CreateLike(
bagging_indices_cnt, multi_val_bin_->num_bin(), total,
multi_val_bin_->num_element_per_row(), multi_val_bin_->offsets()));
} else {
multi_val_bin_subset_->ReSize(
bagging_indices_cnt, multi_val_bin_->num_bin(), total,
multi_val_bin_->num_element_per_row(), multi_val_bin_->offsets());
}
multi_val_bin_subset_->CopySubrow(
multi_val_bin_.get(), bagging_use_indices,
bagging_indices_cnt);
// avoid to copy subset many times
is_subrow_copied_ = true;
}
} else {
is_use_subcol_ = true;
std::vector<uint32_t> upper_bound;
std::vector<uint32_t> lower_bound;
std::vector<uint32_t> delta;
std::vector<uint32_t> offsets;
hist_move_src_.clear();
hist_move_dest_.clear();
hist_move_size_.clear();
const int offset = multi_val_bin_->IsSparse() ? 1 : 0;
int num_total_bin = offset;
int new_num_total_bin = offset;
offsets.push_back(static_cast<uint32_t>(new_num_total_bin));
for (int i : feature_groups_contained_) {
int f_start = group_feature_start[i];
if (feature_groups[i]->is_multi_val_) {
for (int j = 0; j < feature_groups[i]->num_feature_; ++j) {
const auto& bin_mapper = feature_groups[i]->bin_mappers_[j];
if (i == 0 && j == 0 && bin_mapper->GetMostFreqBin() > 0) {
num_total_bin = 1;
}
int cur_num_bin = bin_mapper->num_bin();
if (bin_mapper->GetMostFreqBin() == 0) {
cur_num_bin -= offset;
}
num_total_bin += cur_num_bin;
if (is_feature_used[f_start + j]) {
new_num_total_bin += cur_num_bin;
offsets.push_back(static_cast<uint32_t>(new_num_total_bin));
lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin);
hist_move_src_.push_back(
(new_num_total_bin - cur_num_bin) * 2);
hist_move_dest_.push_back((num_total_bin - cur_num_bin) *
2);
hist_move_size_.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin);
}
}
} else {
bool is_group_used = false;
for (int j = 0; j < feature_groups[i]->num_feature_; ++j) {
if (is_feature_used[f_start + j]) {
is_group_used = true;
break;
}
}
int cur_num_bin = feature_groups[i]->bin_offsets_.back() - offset;
num_total_bin += cur_num_bin;
if (is_group_used) {
new_num_total_bin += cur_num_bin;
offsets.push_back(static_cast<uint32_t>(new_num_total_bin));
lower_bound.push_back(num_total_bin - cur_num_bin);
upper_bound.push_back(num_total_bin);
hist_move_src_.push_back(
(new_num_total_bin - cur_num_bin) * 2);
hist_move_dest_.push_back((num_total_bin - cur_num_bin) *
2);
hist_move_size_.push_back(cur_num_bin * 2);
delta.push_back(num_total_bin - new_num_total_bin);
}
}
}
// avoid out of range
lower_bound.push_back(num_total_bin);
upper_bound.push_back(num_total_bin);
data_size_t num_data = is_use_subrow_ ? bagging_indices_cnt : num_data_;
if (multi_val_bin_subset_ == nullptr) {
multi_val_bin_subset_.reset(multi_val_bin_->CreateLike(
num_data, new_num_total_bin, num_used, sum_used_dense_ratio, offsets));
} else {
multi_val_bin_subset_->ReSize(num_data, new_num_total_bin,
num_used, sum_used_dense_ratio, offsets);
}
if (is_use_subrow_) {
multi_val_bin_subset_->CopySubrowAndSubcol(
multi_val_bin_.get(), bagging_use_indices,
bagging_indices_cnt, used_feature_index, lower_bound,
upper_bound, delta);
// may need to recopy subset
is_subrow_copied_ = false;
} else {
multi_val_bin_subset_->CopySubcol(
multi_val_bin_.get(), used_feature_index, lower_bound, upper_bound, delta);
}
}
}