in gloo/cuda_allreduce_halving_doubling.cc [605:643]
void CudaAllreduceHalvingDoubling<T, W>::initReductionsAndBroadcasts(
    typename std::enable_if<
        std::is_same<U, CudaDeviceWorkspace<T>>::value,
        typename U::Pointer>::type*) {
  // Overload that participates only when U is CudaDeviceWorkspace<T> (see the
  // enable_if in the signature). It creates the two "before first send/recv"
  // reduce ops and appends one entry per step to broadcastOps_.

  // With zero steps within the block there is nothing to create.
  if (stepsWithinBlock_ == 0) {
    return;
  }

  // A reduce op is only created when the matching device pointer list is
  // non-empty; otherwise the member is left as it was. The literal 0 is
  // forwarded as-is (presumably a root/offset argument -- confirm against
  // cudaDeviceReduce's declaration).
  if (!devicePtrsForFirstSend_.empty()) {
    reduceBeforeFirstSend_ = cudaDeviceReduce(
        streams_,
        devicePtrsForFirstSend_,
        scratchPtrForFirstSend_,
        fn_,
        0,
        sendCounts_[0]);
  }

  if (!devicePtrsForFirstRecv_.empty()) {
    reduceBeforeFirstRecv_ = cudaDeviceReduce(
        streams_,
        devicePtrsForFirstRecv_,
        scratchPtrForFirstRecv_,
        fn_,
        0,
        recvCounts_[0]);
  }

  // Exactly one element is pushed per step, so entries appended here line up
  // with the step index; steps without device pointers get a nullptr.
  for (int step = 0; step < stepsWithinBlock_; step++) {
    if (!devicePtrsForBroadcast_[step].empty()) {
      // The last step uses the combined send + receive count; every earlier
      // step uses that step's send count only.
      const bool isFinalStep = (step == stepsWithinBlock_ - 1);
      const auto elementCount = isFinalStep
          ? sendCounts_[step] + recvCounts_[step]
          : sendCounts_[step];
      broadcastOps_.push_back(cudaDeviceBroadcast(
          streams_,
          devicePtrsForBroadcast_[step],
          scratchPtrForBroadcast_[step],
          0,
          elementCount));
    } else {
      broadcastOps_.push_back(nullptr);
    }
  }
}