in benchmark/benchmark.cc [144:161]
void runOneEpoch() {
c10::cuda::CUDAStream stream =
c10::cuda::getStreamFromPool(/*isHighPriority=*/true, /*device=*/0);
c10::cuda::CUDAStreamGuard g(stream);
std::vector<c10::intrusive_ptr<c10::ivalue::Future>> futures;
for (size_t bucketIdx = 0; bucketIdx < numBuckets_; bucketIdx += 1) {
std::vector<at::Tensor> data = {buckets_[bucketIdx]};
futures.push_back(pg_->allreduce(data)->getFuture());
}
for (const auto& future : futures) {
// future->wait(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::minutes(1)));
future->wait();
}
stream.synchronize();
}