static ncclResult_t ofi_init()

in src/nccl_ofi_net.c [984:1080]


/*
 * Initialize the global state of the plugin.
 *
 * Steps, in order:
 *   1. Store the logger callback in ofi_log_function.
 *   2. Set RDMAV_FORK_SAFE=1 in the environment if it is not already set.
 *   3. Fill ofi_info_list via get_ofi_provider(); when the selected
 *      provider name starts with "tcp", run filter_tcp_info_list() on it.
 *   4. Record in local_mr / hmem_mr whether the provider's mr_mode has
 *      FI_MR_LOCAL / FI_MR_HMEM set.
 *   5. Allocate nccl_ofi_component with ofi_ndevices slots, all NULL.
 *
 * @param logFunction  Logger callback; saved in ofi_log_function.
 *
 * @return ncclSuccess on success, ncclSystemError on any failure.
 *         Every failure additionally logs an "initialization failed"
 *         warning before returning.
 */
static ncclResult_t ofi_init(ncclDebugLogger_t logFunction)
{
	ncclResult_t ret = ncclSuccess;
	char *prov_include = NULL;
	int rc;

	ofi_log_function = logFunction;

	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using " PACKAGE_STRING);

	/*
	 * RDMAV_FORK_SAFE environment variable makes the rdma-core
	 * library fork-safe. This significantly increases cost of memory
	 * registration when huge pages are enabled.
	 *
	 * To prevent data corruption, the EFA provider registers an atfork
	 * handler which will abort the process whenever it believes
	 * rdma-core is not fork-safe.
	 *
	 * NCCL applications heavily re-use the buffers for communication and
	 * thus are not sensitive to increased memory registration costs.
	 * To prevent NCCL based applications from getting aborted when using
	 * fork(), plugin explicitly enables RDMAV_FORK_SAFE environment
	 * variable.
	 */
	if (!getenv("RDMAV_FORK_SAFE")) {
		NCCL_OFI_INFO(NCCL_INIT, "Setting RDMAV_FORK_SAFE environment variable to 1.");
		rc = setenv("RDMAV_FORK_SAFE", "1", 1);
		if (rc != 0) {
			NCCL_OFI_WARN("Unable to set RDMAV_FORK_SAFE");
			ret = ncclSystemError;
			goto exit;
		}
	}

	/* Get list of NICs fi_info structures for a single provider */
	ret = get_ofi_provider(prov_include, &ofi_info_list);
	if (ret != 0 || ofi_info_list == NULL) {
		/* Any provider lookup failure is reported as a system error */
		ret = ncclSystemError;
		goto exit;
	}

	/* If TCP provider is selected, filter out unnecessary interfaces and address formats */
	if (strncmp("tcp", ofi_info_list->fabric_attr->prov_name, strlen("tcp")) == 0) {
		filter_tcp_info_list();
		/* The list is re-checked because it may be NULL after filtering */
		if (OFI_UNLIKELY(ofi_info_list == NULL)) {
			NCCL_OFI_WARN("No viable endpoint found for TCP provider. Try and relax the filters using OFI_NCCL_USE_IPV6_TCP or OFI_NCCL_EXCLUDE_TCP_IF environment variables");
			ret = ncclSystemError;
			goto exit;
		}
	}

	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Selected Provider is %s",
		      ofi_info_list->fabric_attr->prov_name);

	/* Check if provider requires local memory registration */
	if (ofi_info_list->domain_attr->mr_mode & FI_MR_LOCAL) {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Provider %s requires registration of local memory buffers",
			       ofi_info_list->fabric_attr->prov_name);
		local_mr = true;
	} else {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Provider %s does not require registration of local memory buffers",
			       ofi_info_list->fabric_attr->prov_name);
	}

	/* Check if provider requires heterogeneous memory registration */
	if (ofi_info_list->domain_attr->mr_mode & FI_MR_HMEM) {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Provider %s requires registration of device buffers",
			       ofi_info_list->fabric_attr->prov_name);
		hmem_mr = true;
	} else {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Provider %s does not require registration of device buffers",
			       ofi_info_list->fabric_attr->prov_name);
	}

	/*
	 * Allocate NCCL OFI component array. Individual components are
	 * allocated as we use them.
	 *
	 * calloc() zero-fills the array, so every slot starts out NULL
	 * (assuming the usual all-bits-zero null pointer representation),
	 * and it rejects a count * size product that would overflow.
	 */
	nccl_ofi_component = calloc(ofi_ndevices, sizeof(*nccl_ofi_component));
	if (OFI_UNLIKELY(nccl_ofi_component == NULL)) {
		NCCL_OFI_WARN("Unable to allocate nccl_ofi_component");
		ret = ncclSystemError;
		goto exit;
	}

exit:
	if (ret != ncclSuccess) {
		NCCL_OFI_WARN(PACKAGE_NAME " initialization failed");
	}
	return ret;
}