in fairring/utils.h [143:176]
// Creates one NCCL communicator for every entry of `devices`, issuing all of
// the ncclCommInitRank calls inside a single ncclGroupStart/ncclGroupEnd pair.
//
// The communicator for devices[i] is initialized with rank `rankStart + i` in
// a communicator of `worldSize` ranks identified by `uniqueId`. Every NCCL
// call is wrapped in NCCL_CHECK.
//
// Returns the communicators in the same order as `devices`, each raw handle
// wrapped in an NcclComm constructed with an NcclCommDeleter.
inline std::vector<NcclComm> createManyNcclComms(
    int rankStart,
    const std::vector<c10::Device>& devices,
    int worldSize,
    ncclUniqueId uniqueId) {
  const std::size_t numDevices = devices.size();
  std::vector<ncclComm_t> handles(numDevices);

  NCCL_CHECK(ncclGroupStart());
  for (std::size_t idx = 0; idx < numDevices; ++idx) {
    // Guard is scoped to this iteration, so it covers exactly one init call.
    c10::cuda::CUDAGuard deviceGuard(devices[idx]);
    ncclComm_t* const slot = &handles[idx];
    // Ranks are assigned consecutively starting at rankStart.
    const auto rank = rankStart + idx;
    NCCL_CHECK(ncclCommInitRank(slot, worldSize, uniqueId, rank));
  }
  NCCL_CHECK(ncclGroupEnd());

  // Wrap the raw handles only after the group has been closed.
  std::vector<NcclComm> comms;
  comms.reserve(numDevices);
  for (ncclComm_t handle : handles) {
    comms.emplace_back(handle, NcclCommDeleter{});
  }
  return comms;
}