int nccl_net_ofi_rdma

int nccl_net_ofi_rdma_init()

in src/nccl_ofi_rdma.cpp [8006:8217]
155 lines of code
30 McCabe index (conditional complexity)

/**
 * Initialize the RDMA protocol plugin.
 *
 * Queries Libfabric for providers matching the filter, validates the
 * eager / early-completion configuration, builds and groups the NIC
 * topology, and creates the plugin object.
 *
 * @param provider_filter
 *	Provider filter forwarded to nccl_ofi_ofiutils_get_providers()
 * @param plugin_p
 *	Output: set to the base of the newly created plugin on success.
 *	Left untouched on failure.
 * @param found_multiple_rails
 *	Output: reset to false on entry, set to true when the topology's
 *	largest NIC group has more than one member
 *
 * @return 0 on success, non-zero (negative errno / Libfabric error code)
 *	   on failure
 */
int nccl_net_ofi_rdma_init(const char *provider_filter,
			   nccl_net_ofi_plugin_t **plugin_p,
			   bool *found_multiple_rails)
{
	int ret = 0;
	int num_devs = 0;
	struct fi_info *provider_list = NULL;
	unsigned int num_providers;
	nccl_net_ofi_rdma_plugin_t *plugin = NULL;
	nccl_ofi_topo_t *topo = NULL;
	/* Initialized so the error path can tell whether hints are live */
	struct fi_info *hints = NULL;
	uint32_t api_version = 0;
	/* Declared up front: no initialization may be jumped over by goto */
	long sys_cache_line_size = 0;

	*found_multiple_rails = false;

	/* Reject removed tunables early, before anything is allocated */
	if (ofi_nccl_deprecated_rdma_min_posted_bounce_buffers() != -1) {
		NCCL_OFI_WARN("Use of OFI_NCCL_RDMA_MIN_POSTED_BOUNCE_BUFFERS is deprecated.\n"
			      "Please use OFI_NCCL_RDMA_MIN_POSTED_CONTROL_BUFFERS or OFI_NCCL_RDMA_MIN_POSTED_EAGER_BUFFERS.");
		return -EINVAL;
	}
	if (ofi_nccl_deprecated_rdma_max_posted_bounce_buffers() != -1) {
		NCCL_OFI_WARN("Use of OFI_NCCL_RDMA_MAX_POSTED_BOUNCE_BUFFERS is deprecated.\n"
			      "Please use OFI_NCCL_RDMA_MAX_POSTED_CONTROL_BUFFERS or OFI_NCCL_RDMA_MAX_POSTED_EAGER_BUFFERS.");
		return -EINVAL;
	}

	hints = fi_allocinfo();
	if (hints == NULL) {
		NCCL_OFI_WARN("Allocation of fi_info failed");
		ret = -FI_ENOMEM;
		goto error;
	}

	get_hints(hints);
	api_version = nccl_ofi_dmabuf_viable() ? FI_VERSION(1, 20) : FI_VERSION(1, 18);
	ret = nccl_ofi_ofiutils_get_providers(provider_filter, api_version, hints,
					      &provider_list, &num_providers);
	if (ret == 0) {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric %u.%u API, with %s support",
			       FI_MAJOR(api_version),
			       FI_MINOR(api_version),
			       FI_VERSION_GE(api_version, FI_VERSION(1, 20)) ? "DMA-BUF" : "GPUDirect RDMA");
		/* The 1.18 API allows providers to use CUDA to
		 * support HMEM pointers, so just having HMEM doesn't
		 * tell us anything about the usability of CUDA
		 * pointers with NCCL.  So leave the state unknown
		 * until we create an endpoint and try to disable
		 * CUDA
		 */
		support_gdr = GDR_UNKNOWN;
	} else if (ret == -FI_ENODATA) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "No eligible providers were found");
		goto error;
	} else {
		NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret));
		goto error;
	}
	/* Hints are no longer needed; clear the pointer so the error
	 * path below does not free them a second time */
	fi_freeinfo(hints);
	hints = NULL;

	ret = nccl_net_ofi_query_provider_capabilities(provider_list, num_providers);
	if (ret != 0) {
		NCCL_OFI_WARN("Querying provider capabilities failed: %d", ret);
		goto error;
	}

	if (endpoint_mr) {
		NCCL_OFI_WARN("RDMA protocol does not support endpoint memory registration.");
		ret = -ENOTSUP;
		goto error;
	}

	if ((ssize_t)ofi_nccl_eager_max_size() > (ssize_t)ofi_nccl_min_stripe_size()) {
		NCCL_OFI_WARN("Invalid value for EAGER_MAX_SIZE");
		/* Report a negative errno like every other failure path */
		ret = -EINVAL;
		goto error;
	}

	/*
	 * NCCL Net v9 API Optimization for LL/LL128 Protocols
	 *
	 * Background:
	 * When using LL (Low Latency) or LL128 protocols, NCCL sets the request pointer
	 * to NCCL_NET_OPTIONAL_RECV_COMPLETION in irecv() calls. This indicates that
	 * the plugin can complete a receiver request early without plugin explicitly
	 * polling the CQ to validate data arrival. This is achievable because NCCL itself
	 * following LL protocol semantics will validate data arrival by checking the flag bytes.
	 *
	 * Plugin Optimization Details:
	 * 1. Receiver Side:
	 *    - Marks request completion immediately after CTRL message send completion
	 *    - Does not wait for RDMA write operation completion
	 *
	 * 2. Sender Side:
	 *    - Uses fi_write instead of fi_writedata, to eliminate unnecessary CQ entries on RX side
	 *
	 * Requirements:
	 * - Eager msg mode is disabled: eager_max_size == -1
	 * - Provider must use FI_PROGRESS_AUTO data progress model
	 */
	if (ofi_nccl_early_completion() < 0) {
		/* Negative value: decide automatically from the environment */
		if (!data_progress_auto) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
				       "Early completion disabled due to progress model");
			early_completion = false;
		} else if (ofi_nccl_eager_max_size() >= 0) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
				       "Early completion disabled because eager is enabled");
			early_completion = false;
		} else {
			early_completion = true;
		}
	} else if (ofi_nccl_early_completion() == 0) {
		/* Zero: explicitly disabled */
		early_completion = false;
	} else {
		/* Positive value: explicitly requested, so unmet
		 * requirements are a hard error */
		if (!data_progress_auto) {
			NCCL_OFI_WARN("Failed configuration of EARLY_COMPLETION due to provider data progress model is not FI_PROGRESS_AUTO");
			ret = -ENOTSUP;
			goto error;
		}
		early_completion = true;
	}

	if (early_completion && ofi_nccl_eager_max_size() != -1) {
		NCCL_OFI_WARN("Conflicted configuration of EARLY_COMPLETION and EAGER_MAX_SIZE");
		ret = -ENOTSUP;
		goto error;
	}

	/* Create NCCL OFI topology */
	topo = nccl_ofi_topo_create(provider_list);
	if (!topo) {
		NCCL_OFI_WARN("Failed to create NCCL OFI topology");
		ret = -ENOTSUP;
		goto error;
	}

	ret = nccl_ofi_topo_group(topo);
	if (ret != 0) {
		NCCL_OFI_WARN("Failed to group NICs");
		goto error;
	}

	if (topo->max_group_size > MAX_NUM_RAILS) {
		NCCL_OFI_WARN("Unexpected topo group size of %d (maximum %d)",
			      topo->max_group_size, MAX_NUM_RAILS);
		ret = -EINVAL;
		goto error;
	}
	if (topo->max_group_size < 1) {
		NCCL_OFI_WARN("Unexpected group size %d", topo->max_group_size);
		ret = -EINVAL;
		goto error;
	}

	if (topo->max_group_size > 1) {
		*found_multiple_rails = true;

		/**
		 * NCCL's topology detection will set NIC PCIe link speed based on the
		 * "leader" NIC for the GPU. For multi-rail platforms, we increase the
		 * link speed reported to NCCL to account for the other rails. This
		 * requires generating a topology file that will be passed to NCCL.
		 */
		ret = write_topo_file(topo);
		if (ret != 0) {
			NCCL_OFI_WARN("Failed to write NCCL topology file");
			goto error;
		}
	}

	ret = nccl_ofi_topo_num_info_lists(topo, &num_devs);
	if (ret != 0) {
		goto error;
	} else if (num_devs <= 0)  {
		NCCL_OFI_WARN("Topology reported unexpected number of devices. "
			      "Expected value larger than zero but got %i",
			      num_devs);
		ret = -EINVAL;
		goto error;
	}

	ret = nccl_net_ofi_rdma_plugin_create(num_devs, topo, &plugin);
	if (ret != 0) {
		NCCL_OFI_WARN("Unable to allocate nccl_net_ofi_plugin_t");
		goto error;
	}

	/* Check the raw sysconf() result in a signed local before storing
	 * it: a non-positive value (error or size not known) is unusable
	 * as a cache line size */
	sys_cache_line_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
	if (sys_cache_line_size <= 0) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
			      "Unable to obtain CPU cache line size from sysconf. "
			      "fallback to predefined value %llu",
			      NCCL_OFI_DEFAULT_CPU_CACHE_LINE_SIZE);
		cpu_cache_line_size = NCCL_OFI_DEFAULT_CPU_CACHE_LINE_SIZE;
	} else {
		cpu_cache_line_size = sys_cache_line_size;
	}

	*plugin_p = &plugin->base;

	return ret;

 error:
	/* Still set only if we failed before the hints were released */
	if (hints != NULL) {
		fi_freeinfo(hints);
		hints = NULL;
	}

	/* NOTE(review): topo is not released here when the failure happens
	 * before a plugin exists; presumably the plugin owns it after
	 * nccl_net_ofi_rdma_plugin_create() -- confirm ownership and
	 * whether earlier failures leak it. */
	if (plugin != NULL) {
		plugin->base.release_plugin(&plugin->base);
		plugin = NULL;
	}

	return ret;
}