in src/treelearner/gpu_tree_learner.cpp [225:537]
void GPUTreeLearner::AllocateGPUMemory() {
num_dense_feature_groups_ = 0;
for (int i = 0; i < num_feature_groups_; ++i) {
if (!train_data_->IsMultiGroup(i)) {
num_dense_feature_groups_++;
}
}
// how many feature-group tuples we have
num_dense_feature4_ = (num_dense_feature_groups_ + (dword_features_ - 1)) / dword_features_;
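// Illustrative example: with dword_features_ == 4 and 10 dense feature-groups this gives
// (10 + 3) / 4 == 3 tuples: two full tuples plus one partially filled tuple whose leftover
// lanes are padded later in this function.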
// leave some safe margin for prefetching
// 256 work-items per workgroup. Each work-item prefetches one tuple for that feature
int allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
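// Illustrative sizing of that margin: up to 2^kMaxLogWorkgroupsPerFeature workgroups of
// 256 work-items may prefetch past the end of the real data, so if that constant were 10,
// for example, the allocation would grow by 256 * 1024 = 262,144 extra slots to keep those
// prefetches in bounds.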
// clear sparse/dense maps
dense_feature_group_map_.clear();
device_bin_mults_.clear();
sparse_feature_group_map_.clear();
// do nothing if no features can be processed on GPU
if (!num_dense_feature_groups_) {
Log::Warning("GPU acceleration is disabled because no non-trivial dense features can be found");
return;
}
// allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers)
device_features_.reset();
device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>((uint64_t)num_dense_feature4_ * num_data_, ctx_));
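// Rough device footprint of this buffer: num_dense_feature4_ * num_data_ * sizeof(Feature4)
// bytes. Purely for illustration, assuming sizeof(Feature4) == 4: 25 tuples over 10 million
// rows is roughly 1 GB, which is why the FIXME above mentions splitting across multiple
// buffers to stay under the 4 GB single-allocation limit of some devices.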
// unpin old buffers if necessary before destroying them
if (ptr_pinned_gradients_) {
queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_);
}
if (ptr_pinned_hessians_) {
queue_.enqueue_unmap_buffer(pinned_hessians_, ptr_pinned_hessians_);
}
if (ptr_pinned_feature_masks_) {
queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_);
}
// enlarge ordered_gradients_ and ordered_hessians_ (with extra room for prefetching), and pin them
ordered_gradients_.reserve(allocated_num_data_);
ordered_hessians_.reserve(allocated_num_data_);
pinned_gradients_ = boost::compute::buffer(); // deallocate
pinned_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
ordered_gradients_.data());
ptr_pinned_gradients_ = queue_.enqueue_map_buffer(pinned_gradients_, boost::compute::command_queue::map_write_invalidate_region,
0, allocated_num_data_ * sizeof(score_t));
pinned_hessians_ = boost::compute::buffer(); // deallocate
pinned_hessians_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
ordered_hessians_.data());
ptr_pinned_hessians_ = queue_.enqueue_map_buffer(pinned_hessians_, boost::compute::command_queue::map_write_invalidate_region,
0, allocated_num_data_ * sizeof(score_t));
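// Note on the pinned staging buffers above: use_host_ptr wraps the existing host vectors,
// and mapping them with map_write_invalidate_region exposes page-locked ("pinned") memory
// on many OpenCL runtimes, which keeps the later gradient/Hessian uploads cheap. The exact
// pinning behaviour is implementation-dependent.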
// allocate space for gradients and Hessians on device
// we will copy gradients and Hessians in after ordered_gradients_ and ordered_hessians_ are constructed
device_gradients_ = boost::compute::buffer(); // deallocate
device_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_only, nullptr);
device_hessians_ = boost::compute::buffer(); // deallocate
device_hessians_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t),
boost::compute::memory_object::read_only, nullptr);
// allocate feature mask, for disabling some feature-groups' histogram calculation
feature_masks_.resize(num_dense_feature4_ * dword_features_);
device_feature_masks_ = boost::compute::buffer(); // deallocate
device_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_,
boost::compute::memory_object::read_only, nullptr);
pinned_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_,
boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
feature_masks_.data());
ptr_pinned_feature_masks_ = queue_.enqueue_map_buffer(pinned_feature_masks_, boost::compute::command_queue::map_write_invalidate_region,
0, num_dense_feature4_ * dword_features_);
memset(ptr_pinned_feature_masks_, 0, num_dense_feature4_ * dword_features_);
// copy indices to the device
device_data_indices_.reset();
device_data_indices_ = std::unique_ptr<boost::compute::vector<data_size_t>>(new boost::compute::vector<data_size_t>(allocated_num_data_, ctx_));
boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_);
// histogram bin entry size depends on the precision (single/double)
hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) * 2 : sizeof(gpu_hist_t) * 2;
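// Each bin entry holds two accumulators (a gradient sum and a Hessian sum), hence the "* 2".
// Assuming hist_t is an 8-byte and gpu_hist_t a 4-byte floating-point type, an entry is
// 16 bytes in double precision and 8 bytes in single precision.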
Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_);
// create output buffer; each feature-group has a histogram with device_bin_size_ bins,
// and each workgroup generates a sub-histogram covering dword_features_ feature-groups.
if (!device_subhistograms_) {
// only initialize once here, as this will not need to change when ResetTrainingData() is called
device_subhistograms_ = std::unique_ptr<boost::compute::vector<char>>(new boost::compute::vector<char>(
preallocd_max_num_wg_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_, ctx_));
}
// create atomic counters for inter-group coordination
sync_counters_.reset();
sync_counters_ = std::unique_ptr<boost::compute::vector<int>>(new boost::compute::vector<int>(
num_dense_feature4_, ctx_));
boost::compute::fill(sync_counters_->begin(), sync_counters_->end(), 0, queue_);
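// One counter per Feature4 tuple, zero-initialized here. The histogram kernels presumably
// use these as atomic "workgroups done" counters so that the last workgroup processing a
// tuple knows it must combine the per-workgroup sub-histograms into the final output.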
// The output buffer is allocated in host-visible memory (alloc_host_ptr) so that kernel execution and data transfers can overlap
device_histogram_outputs_ = boost::compute::buffer(); // deallocate
device_histogram_outputs_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_,
boost::compute::memory_object::write_only | boost::compute::memory_object::alloc_host_ptr, nullptr);
// find the dense feature-groups and pack them into the Feature4 data structure (dword_features_ feature-groups packed into 4 bytes)
int k = 0, copied_feature4 = 0;
std::vector<int> dense_dword_ind(dword_features_);
for (int i = 0; i < num_feature_groups_; ++i) {
// looking for dword_features_ non-sparse feature-groups
if (!train_data_->IsMultiGroup(i)) {
dense_dword_ind[k] = i;
// decide whether the bins of this feature-group should be redistributed over a wider range
double t = device_bin_size_ / static_cast<double>(train_data_->FeatureGroupNumBin(i));
// multiplier must be a power of 2
device_bin_mults_.push_back(static_cast<int>(round(pow(2, floor(log2(t))))));
// device_bin_mults_.push_back(1);
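// Worked example with illustrative numbers: if device_bin_size_ == 256 and this
// feature-group has 37 bins, then t == 256 / 37 is about 6.9, floor(log2(t)) == 2, and the
// multiplier is 2^2 == 4, so bin b ends up spread over device bins [4*b, 4*b + 3] when the
// Feature4 tuples are built below.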
#if GPU_DEBUG >= 1
printf("feature-group %d using multiplier %d\n", i, device_bin_mults_.back());
#endif
k++;
} else {
sparse_feature_group_map_.push_back(i);
}
// found a complete tuple of dword_features_ dense feature-groups
if (k == dword_features_) {
k = 0;
for (int j = 0; j < dword_features_; ++j) {
dense_feature_group_map_.push_back(dense_dword_ind[j]);
}
copied_feature4++;
}
}
// start timing the data transfer
auto start_time = std::chrono::steady_clock::now();
// Now generate the Feature4 data structure and copy the data to the device
int nthreads = std::min(omp_get_max_threads(), static_cast<int>(dense_feature_group_map_.size()) / dword_features_);
nthreads = std::max(nthreads, 1);
std::vector<Feature4*> host4_vecs(nthreads);
std::vector<boost::compute::buffer> host4_bufs(nthreads);
std::vector<Feature4*> host4_ptrs(nthreads);
// preallocate arrays for all threads, and pin them
for (int i = 0; i < nthreads; ++i) {
host4_vecs[i] = reinterpret_cast<Feature4*>(boost::alignment::aligned_alloc(4096, num_data_ * sizeof(Feature4)));
host4_bufs[i] = boost::compute::buffer(ctx_, num_data_ * sizeof(Feature4),
boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr,
host4_vecs[i]);
host4_ptrs[i] = reinterpret_cast<Feature4*>(queue_.enqueue_map_buffer(host4_bufs[i], boost::compute::command_queue::map_write_invalidate_region,
0, num_data_ * sizeof(Feature4)));
}
// build Feature4 bundles; each thread processes one tuple of dword_features_ feature-groups per iteration
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(dense_feature_group_map_.size() / dword_features_); ++i) {
int tid = omp_get_thread_num();
Feature4* host4 = host4_ptrs[tid];
auto dense_ind = dense_feature_group_map_.begin() + i * dword_features_;
auto dev_bin_mult = device_bin_mults_.begin() + i * dword_features_;
#if GPU_DEBUG >= 1
printf("Copying feature group ");
for (int l = 0; l < dword_features_; ++l) {
printf("%d ", dense_ind[l]);
}
printf("to devices\n");
#endif
if (dword_features_ == 8) {
// one feature datapoint is 4 bits
BinIterator* bin_iters[8];
for (int s_idx = 0; s_idx < 8; ++s_idx) {
bin_iters[s_idx] = train_data_->FeatureGroupIterator(dense_ind[s_idx]);
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[s_idx]) == 0) {
Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not", dense_ind[s_idx]);
}
}
// copying to concrete iterator types lets the compiler inline RawGet() instead of going through virtual dispatch
DenseBinIterator<uint8_t, true> iters[8] = {
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[0]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[1]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[2]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[3]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[4]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[5]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[6]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[7])};
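// Packing scheme for the loop below: each byte of host4[j].s holds two 4-bit bin values,
// with the even lane in the low nibble and the odd lane in the high nibble. Every raw bin
// is rescaled by its feature-group's multiplier and then jittered by
// (j + lane) & (multiplier - 1); spreading consecutive rows over the widened bin range
// presumably reduces contention on the atomic histogram updates in the GPU kernel.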
for (int j = 0; j < num_data_; ++j) {
host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
|((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
|((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4));
host4[j].s[2] = (uint8_t)((iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1)))
|((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4));
host4[j].s[3] = (uint8_t)((iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1)))
|((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4));
}
} else if (dword_features_ == 4) {
// one feature datapoint is one byte
for (int s_idx = 0; s_idx < 4; ++s_idx) {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[s_idx]);
// copying to a concrete iterator type lets the compiler inline RawGet() instead of going through virtual dispatch
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
// Dense bin
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
for (int j = 0; j < num_data_; ++j) {
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
}
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
// Dense 4-bit bin
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
for (int j = 0; j < num_data_; ++j) {
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
}
} else {
Log::Fatal("Bug in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
}
}
} else {
Log::Fatal("Bug in GPU tree builder: dword_features_ can only be 4 or 8");
}
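// Tuple i occupies a contiguous num_data_ * sizeof(Feature4) byte slice of device_features_,
// hence the byte offset i * num_data_ * sizeof(Feature4) in the write below. The critical
// section serializes the enqueue calls, presumably because all worker threads share queue_.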
#pragma omp critical
queue_.enqueue_write_buffer(device_features_->get_buffer(),
(uint64_t)i * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
#if GPU_DEBUG >= 1
printf("first example of feature-group tuple is: %d %d %d %d\n", host4[0].s[0], host4[0].s[1], host4[0].s[2], host4[0].s[3]);
printf("Feature-groups copied to device with multipliers ");
for (int l = 0; l < dword_features_; ++l) {
printf("%d ", dev_bin_mult[l]);
}
printf("\n");
#endif
}
// handle the remaining (fewer than dword_features_) dense feature-groups
if (k != 0) {
Feature4* host4 = host4_ptrs[0];
if (dword_features_ == 8) {
memset(host4, 0, num_data_ * sizeof(Feature4));
}
#if GPU_DEBUG >= 1
printf("%d features left\n", k);
#endif
for (int i = 0; i < k; ++i) {
if (dword_features_ == 8) {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
<< ((i & 1) << 2));
}
} else {
Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not", dense_dword_ind[i]);
}
} else if (dword_features_ == 4) {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
} else {
Log::Fatal("BUG in GPU tree builder: only DenseBin and Dense4bitsBin are supported");
}
} else {
Log::Fatal("Bug in GPU tree builder: dword_features_ can only be 4 or 8");
}
}
// fill the leftover features
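// The unused lanes of this last tuple still go through the histogram kernel, but they are
// never mapped to a real feature-group (only k entries are appended to
// dense_feature_group_map_ below), so their histogram contents are ignored. Varying the
// fill value with j presumably just spreads the resulting atomic updates across bins
// instead of concentrating them on a single bin.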
if (dword_features_ == 8) {
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value
host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
}
}
} else if (dword_features_ == 4) {
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value
host4[j].s[i] = (uint8_t)j;
}
}
}
// copy the last tuple, which holds the remaining 1 to (dword_features_ - 1) feature-groups
queue_.enqueue_write_buffer(device_features_->get_buffer(),
(num_dense_feature4_ - 1) * (uint64_t)num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
#if GPU_DEBUG >= 1
printf("Last features copied to device\n");
#endif
for (int i = 0; i < k; ++i) {
dense_feature_group_map_.push_back(dense_dword_ind[i]);
}
}
// deallocate pinned space for feature copying
for (int i = 0; i < nthreads; ++i) {
queue_.enqueue_unmap_buffer(host4_bufs[i], host4_ptrs[i]);
host4_bufs[i] = boost::compute::buffer();
boost::alignment::aligned_free(host4_vecs[i]);
}
// data transfer time
std::chrono::duration<double, std::milli> end_time = std::chrono::steady_clock::now() - start_time;
Log::Info("%d dense feature groups (%.2f MB) transferred to GPU in %f secs. %d sparse feature groups",
static_cast<int>(dense_feature_group_map_.size()), ((dense_feature_group_map_.size() + (dword_features_ - 1)) / dword_features_) * num_data_ * sizeof(Feature4) / (1024.0 * 1024.0),
end_time.count() * 1e-3, static_cast<int>(sparse_feature_group_map_.size()));
#if GPU_DEBUG >= 1
printf("Dense feature group list (size %lu): ", dense_feature_group_map_.size());
for (int i = 0; i < num_dense_feature_groups_; ++i) {
printf("%d ", dense_feature_group_map_[i]);
}
printf("\n");
printf("Sparse feature group list (size %lu): ", sparse_feature_group_map_.size());
for (int i = 0; i < num_feature_groups_ - num_dense_feature_groups_; ++i) {
printf("%d ", sparse_feature_group_map_[i]);
}
printf("\n");
#endif
}