in tensorpipe/channel/cuda_gdr/context_impl.cc [452:520]
std::shared_ptr<ContextImpl> ContextImpl::create(
    optional<std::vector<std::string>> gpuIdxToNicName) {
  // Probe every prerequisite for GPUDirect RDMA in turn; as soon as one is
  // missing we report the channel as non-viable by returning nullptr.
  Error err;

  // The CUDA driver library must be dynamically loadable.
  CudaLib cuda;
  std::tie(err, cuda) = CudaLib::create();
  // FIXME Instead of throwing away the error and setting a bool, we should have
  // a way to set the context in an error state, and use that for viability.
  if (err) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because libcuda could not be loaded: "
        << err.what();
    return nullptr;
  }

  // Likewise for the InfiniBand verbs library.
  IbvLib ibv;
  std::tie(err, ibv) = IbvLib::create();
  // FIXME Instead of throwing away the error and setting a bool, we should have
  // a way to set the context in an error state, and use that for viability.
  if (err) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because libibverbs could not be loaded: "
        << err.what();
    return nullptr;
  }

  // GPUDirect needs the nv_peer_mem kernel module to expose GPU memory to the
  // NIC; without it there is no point in continuing.
  if (!isNvidiaPeerMemoryClientActive()) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because the nv_peer_mem kernel module isn't active";
    return nullptr;
  }

  // Enumerate the InfiniBand NICs. An ENOSYS system error specifically means
  // the InfiniBand kernel module isn't loaded: treat that as non-viability
  // rather than a hard failure.
  IbvDeviceList devices;
  std::tie(err, devices) = IbvDeviceList::create(ibv);
  if (err && err.isOfType<SystemError>() &&
      err.castToType<SystemError>()->errorCode() == ENOSYS) {
    TP_VLOG(5)
        << "CUDA GDR channel couldn't get list of InfiniBand devices because the kernel module isn't "
        << "loaded";
    return nullptr;
  }
  // Any other enumeration failure is unexpected and fatal.
  TP_THROW_ASSERT_IF(err)
      << "Couldn't get list of InfiniBand devices: " << err.what();
  if (devices.size() == 0) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because it couldn't find any InfiniBand NICs";
    return nullptr;
  }

  // FIXME In principle we could just exclude the GPUs that violate this check
  // but keep working with the other ones (if any).
  if (!allGpusHaveEnoughBar1Size()) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because some GPUs don't have a large enough PCIe BAR1 size";
    return nullptr;
  }

  // Advertise a wildcard descriptor for every visible CUDA device.
  std::unordered_map<Device, std::string> deviceDescriptors;
  for (const auto& dev : getCudaDevices(cuda)) {
    deviceDescriptors[dev] = "*";
  }

  return std::make_shared<ContextImpl>(
      std::move(deviceDescriptors),
      std::move(cuda),
      std::move(ibv),
      std::move(devices),
      std::move(gpuIdxToNicName));
}