in src/treelearner/cuda_tree_learner.cpp [767:926]
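// Constructs leaf histograms by dispatching dense feature groups to the GPU and
// sparse feature groups to the CPU; when histogram subtraction is disabled, the
// same split of work is repeated for the larger leaf.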
void CUDATreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
std::vector<int8_t> is_sparse_feature_used(num_features_, 0);
std::vector<int8_t> is_dense_feature_used(num_features_, 0);
int num_dense_features = 0, num_sparse_features = 0;
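// classify the features used in this iteration: multi-value (sparse) bin groups are built on the CPU, dense groups on the GPU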
#pragma omp parallel for schedule(static) reduction(+:num_sparse_features, num_dense_features)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
if (!is_feature_used[feature_index]) continue;
if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) {
is_sparse_feature_used[feature_index] = 1;
num_sparse_features++;
} else {
is_dense_feature_used[feature_index] = 1;
num_dense_features++;
}
}
// construct smaller leaf
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
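// RawData() points kHistOffset entries into the histogram buffer; rewinding lets group bin boundaries index it directly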
// determine how many workgroups to use per feature
int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf());
// if only one workgroup per feature (2^0) would be launched, the work is too small for the GPU, so fall back to the CPU implementation
if (exp_workgroups_per_feature == 0) {
return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract);
}
// ConstructGPUHistogramsAsync returns true if any dense feature groups were dispatched to the GPU
bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf());
// then construct histograms for the sparse features on the CPU
// (data_indices was passed as nullptr above to avoid rebuilding the ordered gradients/hessians)
if (num_sparse_features > 0) {
train_data_->ConstructHistograms(is_sparse_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
share_state_.get(),
ptr_smaller_leaf_hist_data);
}
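// the sparse CPU pass above overlaps with the asynchronous GPU histogram construction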
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
if (config_->gpu_use_dp) {
// use double precision
WaitAndGetHistograms<hist_t>(smaller_leaf_histogram_array_);
} else {
// use single precision
WaitAndGetHistograms<gpu_hist_t>(smaller_leaf_histogram_array_);
}
}
// Compare the GPU histogram with the CPU histogram, useful for debugging GPU code problems
// #define CUDA_DEBUG_COMPARE
#ifdef CUDA_DEBUG_COMPARE
printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_);
bool compare = true;
for (int i = 0; i < num_dense_feature_groups_; ++i) {
if (!feature_masks_[i])
continue;
int dense_feature_group_index = dense_feature_group_map_[i];
size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2;
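// each bin stores a (sum of gradients, sum of hessians) pair, hence the factor of 2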
data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
printf("Comparing histogram for feature %d, num_data %d, num_data_ = %d, %zu bins\n", dense_feature_group_index, num_data, num_data_, size);
// check the bin mapper first, so skipped groups neither leak gpu_histogram nor lose their GPU result
if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) {
continue;
}
// save the GPU result, then rebuild the histogram on the CPU in place
hist_t* gpu_histogram = new hist_t[size * 2];
std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
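// rebuild on the CPU, covering all four cases: full leaf vs. index subset, constant vs. per-row hessians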
if (num_data == num_data_) {
if (share_state_->is_constant_hessian) {
printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian\n");
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
0,
num_data,
gradients_,
current_histogram);
} else {
printf("ConstructHistogram(): num_data == num_data_\n");
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
0,
num_data,
gradients_, hessians_,
current_histogram);
}
} else {
if (share_state_->is_constant_hessian) {
printf("ConstructHistogram(): is_constant_hessian\n");
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
smaller_leaf_splits_->data_indices(),
0,
num_data,
gradients_,
current_histogram);
} else {
printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_);
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
smaller_leaf_splits_->data_indices(),
0,
num_data,
gradients_, hessians_,
current_histogram);
}
}
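// compare the saved GPU histogram against the CPU rebuild; the compare flag limits the extra subset-leaf report to the first occurrence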
int retval;
if ((num_data != num_data_) && compare) {
retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian);
printf("CompareHistograms reports %d errors\n", retval);
compare = false;
}
retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian);
printf("CompareHistograms reports %d errors\n", retval);
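// restore the GPU histogram so downstream code keeps using the GPU result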
std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
delete [] gpu_histogram;
}
printf("End Comparing Histogram between GPU and CPU\n");
fflush(stderr);
fflush(stdout);
#endif
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset;
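// unlike the smaller-leaf call above, data_indices is passed here, so the ordered gradients/hessians are rebuilt for the larger leaf's subset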
is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf());
// then construct histograms for the sparse features on the CPU
if (num_sparse_features > 0) {
train_data_->ConstructHistograms(is_sparse_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
share_state_.get(),
ptr_larger_leaf_hist_data);
}
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
if (config_->gpu_use_dp) {
// use double precision
WaitAndGetHistograms<hist_t>(larger_leaf_histogram_array_);
} else {
// use single precision
WaitAndGetHistograms<gpu_hist_t>(larger_leaf_histogram_array_);
}
}
}
}