void CUDATreeLearner::ConstructHistograms()

in src/treelearner/cuda_tree_learner.cpp [767:926]
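
Builds histograms for the smaller leaf (and, when histogram subtraction is not used, for the larger leaf as well), dispatching dense feature groups to the GPU asynchronously while sparse feature groups are constructed on the CPU. Leaves too small to benefit from the GPU fall back to the serial CPU learner.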


void CUDATreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
  std::vector<int8_t> is_sparse_feature_used(num_features_, 0);
  std::vector<int8_t> is_dense_feature_used(num_features_, 0);
  int num_dense_features = 0, num_sparse_features = 0;

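  // partition the features in use into dense feature groups (dispatched to the GPU)
  // and multi-value (sparse) feature groups (constructed on the CPU)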
  #pragma omp parallel for schedule(static) reduction(+:num_dense_features, num_sparse_features)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
    if (!is_feature_used[feature_index]) continue;
    if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) {
      is_sparse_feature_used[feature_index] = 1;
      num_sparse_features++;
    } else {
      is_dense_feature_used[feature_index] = 1;
      num_dense_features++;
    }
  }

  // construct smaller leaf
  hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;

  // Determine the number of workgroups per feature (returned as a power-of-two exponent)
  int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf());

  // if there is only one workgroup per feature (2^0), fall back to the CPU learner as the work is too small for the GPU
  if (exp_workgroups_per_feature == 0) {
    return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract);
  }

  // ConstructGPUHistogramsAsync returns true if any feature groups were dispatched to the GPU
  bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
    nullptr, smaller_leaf_splits_->num_data_in_leaf());

  // then construct histograms for sparse features on the CPU
  // (data_indices was passed as nullptr to the GPU call above to avoid rebuilding the ordered gradients/hessians)
  if (num_sparse_features > 0) {
    train_data_->ConstructHistograms(is_sparse_feature_used,
      smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
      gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data(),
      share_state_.get(),
      ptr_smaller_leaf_hist_data);
  }

  // wait for GPU to finish, only if GPU is actually used
  if (is_gpu_used) {
    if (config_->gpu_use_dp) {
      // use double precision
      WaitAndGetHistograms<hist_t>(smaller_leaf_histogram_array_);
    } else {
      // use single precision
      WaitAndGetHistograms<gpu_hist_t>(smaller_leaf_histogram_array_);
    }
  }

  // Compare the GPU histogram with the CPU histogram, useful for debugging GPU code problems
  // #define CUDA_DEBUG_COMPARE
#ifdef CUDA_DEBUG_COMPARE
  printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_);
  bool compare = true;
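  // for each dense feature group processed on the GPU: save the GPU result, rebuild the
  // histogram on the CPU, and compare the two bin by bin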
  for (int i = 0; i < num_dense_feature_groups_; ++i) {
    if (!feature_masks_[i])
      continue;
    int dense_feature_group_index = dense_feature_group_map_[i];
    size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
    hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
    hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2;
    hist_t* gpu_histogram = new hist_t[size * 2];
    data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
    printf("Comparing histogram for feature group %d, num_data %d, num_data_ = %d, %zu bins\n", dense_feature_group_index, num_data, num_data_, size);
    std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
    std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
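    // gpu_histogram now holds the GPU result; current_histogram was zeroed so the CPU can rebuild it in place for comparison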
    if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) {
      // avoid leaking the copy made above when this group has no bin data
      delete [] gpu_histogram;
      continue;
    }
    if (num_data == num_data_) {
      if (share_state_->is_constant_hessian) {
        printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian\n");
        train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
            0,
            num_data,
            gradients_,
            current_histogram);
      } else {
        printf("ConstructHistogram(): num_data == num_data_\n");
        train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
            0,
            num_data,
            gradients_, hessians_,
            current_histogram);
      }
    } else {
      if (share_state_->is_constant_hessian) {
        printf("ConstructHistogram(): is_constant_hessian\n");
        train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
            smaller_leaf_splits_->data_indices(),
            0,
            num_data,
            gradients_,
            current_histogram);
      } else {
        printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_);
        train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
            smaller_leaf_splits_->data_indices(),
            0,
            num_data,
            gradients_, hessians_,
            current_histogram);
      }
    }
    int retval;
    if ((num_data != num_data_) && compare) {
      retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian);
      printf("CompareHistograms reports %d errors\n", retval);
      compare = false;
    }
    retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian);
    printf("CompareHistograms reports %d errors\n", retval);
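    // restore the GPU result so that subsequent split finding uses the GPU histogram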
    std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
    delete [] gpu_histogram;
  }
  printf("End Comparing Histogram between GPU and CPU\n");
  fflush(stderr);
  fflush(stdout);
#endif

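  // when histogram subtraction cannot be used, the larger leaf's histograms must also be constructed directly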
  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
    // construct larger leaf
    hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset;

    is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
      larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf());

    // then construct histograms for sparse features on the CPU
    if (num_sparse_features > 0) {
      train_data_->ConstructHistograms(is_sparse_feature_used,
        larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
        gradients_, hessians_,
        ordered_gradients_.data(), ordered_hessians_.data(),
        share_state_.get(),
        ptr_larger_leaf_hist_data);
    }

    // wait for GPU to finish, only if GPU is actually used
    if (is_gpu_used) {
      if (config_->gpu_use_dp) {
        // use double precision
        WaitAndGetHistograms<hist_t>(larger_leaf_histogram_array_);
      } else {
        // use single precision
        WaitAndGetHistograms<gpu_hist_t>(larger_leaf_histogram_array_);
      }
    }
  }
}