std::shared_ptr<ContextImpl> ContextImpl::create()

in tensorpipe/channel/cuda_gdr/context_impl.cc [452:520]


// Factory for the CUDA GDR channel context. Runs a series of viability
// probes (libcuda, libibverbs, the nv_peer_mem kernel module, presence of
// InfiniBand NICs, and GPU BAR1 sizes) and returns nullptr if any of them
// fails, so the channel registry can silently skip this backend.
std::shared_ptr<ContextImpl> ContextImpl::create(
    optional<std::vector<std::string>> gpuIdxToNicName) {
  Error err;

  // Dynamically load libcuda. Without it nothing below can work.
  CudaLib cudaLib;
  std::tie(err, cudaLib) = CudaLib::create();
  // FIXME Instead of throwing away the error and setting a bool, we should have
  // a way to set the context in an error state, and use that for viability.
  if (err) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because libcuda could not be loaded: "
        << err.what();
    return nullptr;
  }

  // Dynamically load libibverbs, needed to talk to the InfiniBand NICs.
  IbvLib ibvLib;
  std::tie(err, ibvLib) = IbvLib::create();
  // FIXME Instead of throwing away the error and setting a bool, we should have
  // a way to set the context in an error state, and use that for viability.
  if (err) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because libibverbs could not be loaded: "
        << err.what();
    return nullptr;
  }

  // GPUDirect RDMA requires the nv_peer_mem (peer memory client) module.
  if (!isNvidiaPeerMemoryClientActive()) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because the nv_peer_mem kernel module isn't active";
    return nullptr;
  }

  // Enumerate the InfiniBand devices. An ENOSYS system error means the
  // kernel-side InfiniBand stack isn't loaded: treat that as "not viable"
  // rather than a hard failure. Any other error is unexpected and fatal.
  IbvDeviceList nicList;
  std::tie(err, nicList) = IbvDeviceList::create(ibvLib);
  if (err) {
    const bool ibModuleMissing = err.isOfType<SystemError>() &&
        err.castToType<SystemError>()->errorCode() == ENOSYS;
    if (ibModuleMissing) {
      TP_VLOG(5)
          << "CUDA GDR channel couldn't get list of InfiniBand devices because the kernel module isn't "
          << "loaded";
      return nullptr;
    }
  }
  TP_THROW_ASSERT_IF(err)
      << "Couldn't get list of InfiniBand devices: " << err.what();
  if (nicList.size() == 0) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because it couldn't find any InfiniBand NICs";
    return nullptr;
  }

  // FIXME In principle we could just exclude the GPUs that violate this check
  // but keep working with the other ones (if any).
  if (!allGpusHaveEnoughBar1Size()) {
    TP_VLOG(5)
        << "CUDA GDR channel is not viable because some GPUs don't have a large enough PCIe BAR1 size";
    return nullptr;
  }

  // Advertise a wildcard descriptor for every visible CUDA device.
  std::unordered_map<Device, std::string> descriptors;
  for (const auto& device : getCudaDevices(cudaLib)) {
    descriptors.emplace(device, "*");
  }

  return std::make_shared<ContextImpl>(
      std::move(descriptors),
      std::move(cudaLib),
      std::move(ibvLib),
      std::move(nicList),
      std::move(gpuIdxToNicName));
}