/* Excerpt: nccl_ofi_ofiutils_init_connection(), from src/nccl_ofi_ofiutils.cpp [222:405] */

/*
 * Create and configure a libfabric endpoint on the given domain.
 *
 * Allocates the endpoint, opens a completion queue (only when the caller
 * did not supply one) and an address vector, binds both to the endpoint,
 * disables provider-internal shared-memory and CUDA API usage where the
 * libfabric build supports the corresponding options, runs the optional
 * platform-specific configuration hook, and finally enables the endpoint.
 *
 * @param info    fi_info describing the provider/endpoint to open.
 * @param domain  Open libfabric domain to create the resources on.
 * @param ep      Output: newly created endpoint; reset to NULL on failure.
 * @param av      Output: address vector opened here and bound to *ep;
 *                reset to NULL on failure.
 * @param cq      In/out: if *cq is NULL, a CQ is opened here and bound to
 *                the endpoint; otherwise the caller-provided CQ is bound.
 *                Only a CQ opened by this call is closed on failure — a
 *                caller-provided CQ is left untouched, since it may be
 *                shared with other endpoints.
 *
 * @return 0 on success, a negative libfabric/errno code on failure.
 *
 * Side effect: may move the global support_gdr flag between
 * GDR_SUPPORTED and GDR_UNSUPPORTED depending on whether
 * FI_OPT_CUDA_API_PERMITTED can be set on the endpoint.
 */
int nccl_ofi_ofiutils_init_connection(struct fi_info *info, struct fid_domain *domain,
				      struct fid_ep **ep, struct fid_av **av, struct fid_cq **cq)
{
	int ret = 0;
	struct fi_av_attr av_attr = {};
	struct fi_cq_attr cq_attr = {};
	/* Whether this call opened *cq.  The error path must not destroy a
	 * caller-provided CQ that other endpoints may still be using. */
	bool cq_created = false;

	/* Create transport level communication endpoint(s) */
	ret = fi_endpoint(domain, info, ep, NULL);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't allocate endpoint. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	if (*cq == NULL) {
		/* Tagged-capable providers report the tag in completions;
		 * otherwise fall back to the plain data format. */
		if (info->caps & FI_TAGGED) {
			cq_attr.format = FI_CQ_FORMAT_TAGGED;
		} else {
			cq_attr.format = FI_CQ_FORMAT_DATA;
		}

		ret = fi_cq_open(domain, &cq_attr, cq, NULL);
		if (OFI_UNLIKELY(ret != 0)) {
			NCCL_OFI_WARN("Couldn't open CQ. RC: %d, ERROR: %s",
				      ret, fi_strerror(-ret));
			goto error;
		}
		cq_created = true;
	}

	/* Open AV */
	ret = fi_av_open(domain, &av_attr, av, NULL);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't open AV. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	/* Bind CQ to endpoint */
	ret = fi_ep_bind(*ep, &((*cq)->fid), FI_SEND | FI_RECV);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't bind EP-CQ. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	/* Bind AV to endpoint */
	ret = fi_ep_bind(*ep, &((*av)->fid), 0);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't bind EP-AV. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	/*
	 * Disable shared memory.  There's really only three cases
	 * we're going to be using network operations inside a shared
	 * memory domain:
	 *
	 * 1. disabling NCCL P2P (NVLink / PCIe) operations to test
	 *    networking without lots of nodes.
	 * 2. flush operations
	 * 3. cleanup copies for the rdma protocol's eager messages
	 *
	 * In none of these do you want to use Libfabric's shared
	 * memory as opposed to a real network device.  (2) is
	 * actually a correctness issue to use shared memory.  So we
	 * disable shared memory transport when available.
	 */
#if HAVE_DECL_FI_OPT_SHARED_MEMORY_PERMITTED
	{
		bool optval = false;
		ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,
				FI_OPT_SHARED_MEMORY_PERMITTED, &optval,
				sizeof(optval));
		if (ret == -FI_EOPNOTSUPP || ret == -FI_ENOPROTOOPT) {
			/* One way we get here is running against
			 * older libfabric builds that don't have
			 * FI_OPT_SHARED_MEMORY_PERMITTED.  This isn't
			 * awesome, but there isn't really a better
			 * choice.
			 */
			NCCL_OFI_TRACE(NCCL_INIT, "Disabling shared memory not supported");
			/* Tolerated: clear the stale error code so later
			 * success is not reported against it. */
			ret = 0;
		} else if (ret != 0) {
			NCCL_OFI_WARN("Disabling shared memory failed: %s",
				      fi_strerror(-ret));
			goto error;
		}
	}
#endif

	/*
	 * Set Libfabric endpoint option FI_OPT_CUDA_API_PERMITTED to false if using
	 * the Libfabric 1.18 API with HMEM support, and the device supports GDR.
	 *
	 * Prior to Libfabric 1.18.0, there was no way to disable
	 * Libfabric from making CUDA calls.  While the EFA path was
	 * CUDA clean, it could use the shm provider, which did make
	 * CUDA calls.  Rather than muck with side channel ways of
	 * disabling CUDA in old Libfabric, just require newer
	 * Libfabric.
	 */
	if (FI_VERSION_GE(info->fabric_attr->api_version,
			  FI_VERSION(1, 18)) && support_gdr != GDR_UNSUPPORTED) {
#if (HAVE_CUDA && HAVE_DECL_FI_OPT_CUDA_API_PERMITTED)
		bool optval = false;
		ret = fi_setopt(&(*ep)->fid, FI_OPT_ENDPOINT,
				FI_OPT_CUDA_API_PERMITTED, &optval,
				sizeof(optval));
		if (ret == -FI_EOPNOTSUPP || ret == -FI_ENOPROTOOPT) {
			if (support_gdr == GDR_SUPPORTED) {
				/* If we got here, that means we previously said
				 * we definitely had GDR support, but now don't.
				 * Since we may have already told NCCL that we
				 * support GDR, we should just abort.
				 */
				NCCL_OFI_WARN("GDR support reported to NCCL but then couldn't be configured on an endpoint.  Cannot continue.");
				goto error;
			} else {
				NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Could not disable CUDA API usage for HMEM, disabling GDR");
				/* If we can't disable CUDA, then we don't really
				 * have GDR, so disable GDR  support from the NCCL
				 * point of view.
				 */
				support_gdr = GDR_UNSUPPORTED;
				/* Tolerated: clear the stale error code. */
				ret = 0;
			}
		} else if (ret == 0) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Set endpoint option FI_OPT_CUDA_API_PERMITTED. GDR Supported");
			/* we were able to disable CUDA, so we can do GDR */
			support_gdr = GDR_SUPPORTED;
		} else {
			NCCL_OFI_WARN("Failed to set FI_OPT_CUDA_API_PERMITTED. RC: %d, ERROR: %s",
				      ret, fi_strerror(-ret));
			goto error;
		}
#elif HAVE_NEURON
		/*
		 * Provider discovery for Neuron will have been successful only
		 * if HMEM capabilities were guaranteed by the libfabric
		 * provider. Unlike CUDA, we do not need to handle the
		 * runtime/endpoint deadlock with fi_setopt(), so move the flag
		 * to supported.
		 */
		support_gdr = GDR_SUPPORTED;
#else
		NCCL_OFI_WARN("Using Libfabric 1.18 API with GPUDirect RDMA support, and FI_OPT_CUDA_API_PERMITTED is not declared.");
		ret = -EOPNOTSUPP;
		goto error;
#endif
	}
	/* Run platform-specific endpoint configuration hook if declared */
	if (platform_config_endpoint) {
		ret = platform_config_endpoint(info, *ep);
		if (ret != 0)
			goto error;
	}

	/* Enable endpoint for communication */
	ret = fi_enable(*ep);
	if (OFI_UNLIKELY(ret != 0)) {
		NCCL_OFI_WARN("Couldn't enable endpoint. RC: %d, ERROR: %s",
			      ret, fi_strerror(-ret));
		goto error;
	}

	return ret;
 error:
	if (*ep) {
		fi_close((fid_t)*ep);
		*ep = NULL;
	}

	if (*av) {
		fi_close((fid_t)*av);
		*av = NULL;
	}

	/* Only release the CQ if this call created it; a caller-supplied
	 * CQ may be bound to other live endpoints and must survive. */
	if (cq_created && *cq) {
		fi_close((fid_t)*cq);
		*cq = NULL;
	}

	return ret;
}