in src/nccl_ofi_ofiutils.cpp [222:405]
/**
 * @brief	Create, configure and enable a Libfabric endpoint together with
 *		the address vector (and, if needed, the completion queue) it is
 *		bound to.
 *
 * Steps performed, in order:
 *   1. open an endpoint on @p domain described by @p info;
 *   2. if `*cq` is NULL, open a completion queue (tagged format when
 *      @p info advertises FI_TAGGED, data format otherwise); if `*cq` is
 *      non-NULL it is used as-is;
 *   3. open an address vector;
 *   4. bind the CQ (send and receive completions) and the AV to the endpoint;
 *   5. apply endpoint options (shared-memory and CUDA API usage, when the
 *      corresponding options are declared at build time), which may update
 *      the `support_gdr` variable defined outside this function;
 *   6. run `platform_config_endpoint` if that hook is set;
 *   7. enable the endpoint.
 *
 * @param	info	Provider info used to create the endpoint.
 * @param	domain	Domain on which the endpoint, CQ and AV are opened.
 * @param	ep	Output: the new endpoint. Any previous value is ignored
 *			and overwritten; set to NULL on failure.
 * @param	av	Output: the new address vector. Any previous value is
 *			ignored and overwritten; set to NULL on failure.
 * @param	cq	In/out: if `*cq` is NULL on entry a CQ is opened and
 *			returned here; otherwise the supplied CQ is bound to
 *			the endpoint. On failure, a CQ opened by this call is
 *			closed and `*cq` reset to NULL; a caller-supplied CQ is
 *			left open and untouched.
 *
 * @return	0 on success, a non-zero error code on failure.
 */
int nccl_ofi_ofiutils_init_connection(struct fi_info *info, struct fid_domain *domain,
				      struct fid_ep **ep, struct fid_av **av, struct fid_cq **cq)
{
	int ret = 0;
	struct fi_av_attr av_attr = {};
	struct fi_cq_attr cq_attr = {};
	/*
	 * Set only when this call opened *cq. The error path uses it so
	 * that a completion queue handed in by the caller is never closed
	 * behind the caller's back.
	 */
	bool cq_created = false;

	/*
	 * ep and av are pure outputs (both are unconditionally overwritten
	 * below). Reset them up front so the error path never closes a
	 * stale or uninitialized handle when an early step fails.
	 */
	*ep = NULL;
	*av = NULL;

	/* Create transport level communication endpoint(s) */
	ret = fi_endpoint(domain, info, ep, NULL);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't allocate endpoint. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	/* Open a CQ only when the caller did not provide one */
	if (*cq == NULL) {
		if (info->caps & FI_TAGGED) {
			cq_attr.format = FI_CQ_FORMAT_TAGGED;
		} else {
			cq_attr.format = FI_CQ_FORMAT_DATA;
		}

		ret = fi_cq_open(domain, &cq_attr, cq, NULL);
		if (OFI_UNLIKELY(ret != 0)) {
			NCCL_OFI_WARN("Couldn't open CQ. RC: %d, ERROR: %s",
				      ret, fi_strerror(-ret));
			goto error;
		}
		cq_created = true;
	}

	/* Open AV */
	ret = fi_av_open(domain, &av_attr, av, NULL);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't open AV. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	/* Bind CQ to endpoint */
	ret = fi_ep_bind(*ep, &((*cq)->fid), FI_SEND | FI_RECV);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't bind EP-CQ. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	/* Bind AV to endpoint */
	ret = fi_ep_bind(*ep, &((*av)->fid), 0);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't bind EP-AV. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	/*
	 * Disable shared memory.  There are really only three cases
	 * where we're going to be using network operations inside a
	 * shared memory domain:
	 *
	 * 1. disabling NCCL P2P (NVLink / PCIe) operations to test
	 *    networking without lots of nodes.
	 * 2. flush operations
	 * 3. cleanup copies for the rdma protocol's eager messages
	 *
	 * In none of these do you want to use Libfabric's shared
	 * memory as opposed to a real network device.  (2) is
	 * actually a correctness issue to use shared memory.  So we
	 * disable shared memory transport when available.
	 */
#if HAVE_DECL_FI_OPT_SHARED_MEMORY_PERMITTED
	{
		bool optval = false;
		ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,
				FI_OPT_SHARED_MEMORY_PERMITTED, &optval,
				sizeof(optval));
		if (ret == -FI_EOPNOTSUPP || ret == -FI_ENOPROTOOPT) {
			/* One way we get here is running against
			 * older libfabric builds that don't have
			 * FI_OPT_SHARED_MEMORY_PERMITTED.  This isn't
			 * awesome, but there isn't really a better
			 * choice.
			 */
			NCCL_OFI_TRACE(NCCL_INIT, "Disabling shared memory not supported");
		} else if (ret != 0) {
			NCCL_OFI_WARN("Disabling shared memory failed: %s",
				      fi_strerror(-ret));
			goto error;
		}
	}
#endif

	/*
	 * Set Libfabric endpoint option FI_OPT_CUDA_API_PERMITTED to false if using
	 * the Libfabric 1.18 API with HMEM support, and the device supports GDR.
	 *
	 * Prior to Libfabric 1.18.0, there was no way to disable
	 * Libfabric from making CUDA calls.  While the EFA path was
	 * CUDA clean, it could use the shm provider, which did make
	 * CUDA calls.  Rather than muck with side channel ways of
	 * disabling CUDA in old Libfabric, just require newer
	 * Libfabric.
	 */
	if (FI_VERSION_GE(info->fabric_attr->api_version,
			  FI_VERSION(1, 18)) && support_gdr != GDR_UNSUPPORTED) {
#if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
		bool optval = false;
		ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,
				FI_OPT_CUDA_API_PERMITTED, &optval,
				sizeof(optval));
		if (ret == -FI_EOPNOTSUPP || ret == -FI_ENOPROTOOPT) {
			if (support_gdr == GDR_SUPPORTED) {
				/* If we got here, that means we previously said
				 * we definitely had GDR support, but now don't.
				 * Since we may have already told NCCL that we
				 * support GDR, we should just abort.
				 * ret still holds the (non-zero) fi_setopt
				 * error, which is what the caller receives.
				 */
				NCCL_OFI_WARN("GDR support reported to NCCL but then couldn't be configured on an endpoint.  Cannot continue.");
				goto error;
			} else {
				NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Could not disable CUDA API usage for HMEM, disabling GDR");
				/* If we can't disable CUDA, then we don't really
				 * have GDR, so disable GDR support from the NCCL
				 * point of view.
				 */
				support_gdr = GDR_UNSUPPORTED;
			}
		} else if (ret == 0) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Set endpoint option FI_OPT_CUDA_API_PERMITTED. GDR Supported");
			/* we were able to disable CUDA, so we can do GDR */
			support_gdr = GDR_SUPPORTED;
		} else {
			NCCL_OFI_WARN("Failed to set FI_OPT_CUDA_API_PERMITTED. RC: %d, ERROR: %s",
				      ret, fi_strerror(-ret));
			goto error;
		}
#elif HAVE_NEURON
		/*
		 * Provider discovery for Neuron will have been successful only
		 * if HMEM capabilities were guaranteed by the libfabric
		 * provider. Unlike CUDA, we do not need to handle the
		 * runtime/endpoint deadlock with fi_setopt(), so move the flag
		 * to supported.
		 */
		support_gdr = GDR_SUPPORTED;
#else
		NCCL_OFI_WARN("Using Libfabric 1.18 API with GPUDirect RDMA support, and FI_OPT_CUDA_API_PERMITTED is not declared.");
		ret = -EOPNOTSUPP;
		goto error;
#endif
	}

	/* Run platform-specific endpoint configuration hook if declared */
	if (platform_config_endpoint) {
		ret = platform_config_endpoint(info, *ep);
		if (ret != 0) {
			goto error;
		}
	}

	/* Enable endpoint for communication */
	ret = fi_enable(*ep);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't enable endpoint. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	return ret;

 error:
	/*
	 * Best-effort teardown.  The endpoint is closed first, before the
	 * AV and CQ it may have been bound to.
	 */
	if (*ep) {
		fi_close((fid_t)*ep);
		*ep = NULL;
	}

	if (*av) {
		fi_close((fid_t)*av);
		*av = NULL;
	}

	/*
	 * Only release the CQ if this call opened it.  A CQ supplied by
	 * the caller remains owned by the caller, who may still be using
	 * it, so it is neither closed nor cleared here.
	 */
	if (cq_created && *cq) {
		fi_close((fid_t)*cq);
		*cq = NULL;
	}

	return ret;
}