int nccl_net_ofi_create_plugin()

in src/nccl_ofi_net.cpp [136:357]


/*
 * Create and initialize the libfabric-based NCCL network plugin.
 *
 * Performs one-time process-level setup: queries the system page size,
 * optionally initializes CUDA, runs the platform init hook, selects a
 * transport protocol (RDMA or SENDRECV), completes plugin
 * initialization, and creates a first endpoint so platform endpoint
 * configuration and GDR support detection happen during init rather
 * than on the first listen/connect call.
 *
 * @param plugin_p  Output: on success receives the initialized plugin.
 * @return 0 on success, a negative errno-style value on failure.
 */
int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
{
	int ret = 0;
	const char *provider_filter = NULL;
	/* Initialized to NULL so no path can observe an indeterminate
	 * pointer value. */
	nccl_net_ofi_plugin_t *plugin = NULL;
	nccl_net_ofi_ep_t *base_ep = NULL;
	nccl_net_ofi_device_t *device = NULL;
	nccl_ofi_properties_t properties;

	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Initializing " PACKAGE_STRING);

	/* Print Libfabric version */
	uint32_t fab_version = fi_version();
	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using Libfabric version %u.%u", FI_MAJOR(fab_version),
			FI_MINOR(fab_version));

	long int system_page_size_sysconf = sysconf(_SC_PAGESIZE);
	if (OFI_UNLIKELY(system_page_size_sysconf == -1)) {
		NCCL_OFI_WARN("Failed to get system page size (%d %s)", errno, strerror(errno));
		ret = -ENOTSUP;
		goto exit;
	}
	system_page_size = (size_t)system_page_size_sysconf;
	assert(NCCL_OFI_IS_POWER_OF_TWO(system_page_size));
	assert(system_page_size > 0);
	/*
	 * System page size isn't reflective of the GDR mappings. We're not trying to map a
	 * whole page, but just to find an interval that makes an array-based cache manageable.
	 */
	mr_cache_alignment = std::min(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);

#if HAVE_CUDA
	ret = nccl_net_ofi_cuda_init();
	if (ret != 0) {
		NCCL_OFI_WARN("CUDA initialization failed.");
		goto exit;
	}
#endif

	/* configuration parameters */
	nic_dup_conns = ofi_nccl_nic_dup_conns();
	net_latency = (float)ofi_nccl_net_latency();
	cq_read_count = ofi_nccl_cq_read_count();

	/* The platform hook may install a provider filter and/or
	 * preselect a protocol via nccl_ofi_selected_protocol. */
	if (platform_init) {
		ret = platform_init(&provider_filter);
		if (ret != 0)
			goto exit;
	}

	/* This is ugly, but here's the basic protocol selection
	 * logic:
	 *   1. if the user set NCCL_OFI_PROTOCOL, use that.
	 *   2. if the platform init set nccl_ofi_selected_protocol,
	 *      use that.
	 *   3. If the rdma protocol reports multiple nics per device
	 *      and initialized successfully, use that.
	 *   4. If the sendrecv protocol initialized successfully, use
	 *      that
	 *   5. If the rdma protocol initialized successfully, use
	 *      that.
	 */
	if (ofi_nccl_protocol()) {
		nccl_ofi_selected_protocol = ofi_nccl_protocol();
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using transport protocol %s (user set)",
			      nccl_ofi_selected_protocol);
	} else if (nccl_ofi_selected_protocol != NULL) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using transport protocol %s (platform set)",
			      nccl_ofi_selected_protocol);
	}

	if (nccl_ofi_selected_protocol != NULL) {
		/* A protocol was forced (user or platform): initialize
		 * exactly that one, and fail hard if it doesn't come up. */
		bool dummy;

		if (0 == strcasecmp(nccl_ofi_selected_protocol, "SENDRECV")) {
			ret = nccl_net_ofi_sendrecv_init(provider_filter, &plugin);
			if (ret != 0) {
				NCCL_OFI_WARN("Failed to initialize sendrecv protocol");
				goto exit;
			}
		} else if (0 == strcasecmp(nccl_ofi_selected_protocol, "RDMA")) {
			ret = nccl_net_ofi_rdma_init(provider_filter, &plugin, &dummy);
			if (ret != 0) {
				NCCL_OFI_WARN("Failed to initialize rdma protocol");
				goto exit;
			}
		} else {
			NCCL_OFI_WARN("Unable to find plugin protocol %s", nccl_ofi_selected_protocol);
			ret = -ENOTSUP;
			goto exit;
		}
	} else {
		/* Auto-selection: try both protocols, prefer RDMA only
		 * when it reports multiple rails per device. */
		bool have_multiple_rails = false;
		nccl_net_ofi_plugin_t *rdma_plugin = NULL, *sendrecv_plugin = NULL;

		ret = nccl_net_ofi_rdma_init(provider_filter, &rdma_plugin, &have_multiple_rails);
		if (ret != 0) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
				       "Failed to initialize rdma protocol: %s", fi_strerror(-ret));
			have_multiple_rails = false;
			rdma_plugin = NULL;
		}

		/* Only bother bringing up sendrecv if rdma is not the
		 * clear winner (single rail or rdma init failed). */
		if (!have_multiple_rails || rdma_plugin == NULL) {
			ret = nccl_net_ofi_sendrecv_init(provider_filter, &sendrecv_plugin);
			if (ret != 0) {
				NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
					       "Failed to initialize sendrecv protocol: %s", fi_strerror(-ret));
				sendrecv_plugin = NULL;
			}
		}

		/* Pick the winner and release the loser, if any. */
		if (have_multiple_rails && rdma_plugin != NULL) {
			nccl_ofi_selected_protocol = "RDMA";
			plugin = rdma_plugin;
			if (sendrecv_plugin != NULL) {
				sendrecv_plugin->release_plugin(sendrecv_plugin);
			}
		} else {
			nccl_ofi_selected_protocol = "SENDRECV";
			plugin = sendrecv_plugin;
			if (rdma_plugin != NULL) {
				rdma_plugin->release_plugin(rdma_plugin);
			}
		}

		if (nccl_ofi_selected_protocol == NULL || plugin == NULL) {
			NCCL_OFI_WARN("Unable to find a protocol that worked.  Failing initialization.");
			ret = -EINVAL;
			goto exit;
		}

		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using transport protocol %s",
			      nccl_ofi_selected_protocol);
	}

	/* Domain-per-thread: explicit env var wins; otherwise defer to
	 * the platform default hook; otherwise one domain per process. */
	if (ofi_nccl_domain_per_thread() != -1) {
		plugin->domain_per_thread = (ofi_nccl_domain_per_thread() > 0);
	} else {
		if (platform_default_domain_per_thread) {
			plugin->domain_per_thread = platform_default_domain_per_thread();
		} else {
			plugin->domain_per_thread = false;
		}
	}
	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Creating one domain per %s",
		      plugin->domain_per_thread ? "thread" : "process");

	ret = plugin->complete_init(plugin);
	if (ret != 0) {
		NCCL_OFI_WARN("Failed to initialize %s protocol", nccl_ofi_selected_protocol);
		goto exit;
	}

	/* In order to set endpoint options and potentially NCCL configuration
	 * options (such as NCCL_PROTO) during the plugin initialization
	 * process, we need to create an endpoint and call the platform hook
	 * "platform_config_endpoint" using "get_ep". This code makes the
	 * assumption that the thread calling "nccl_net_ofi_init" will make
	 * communication calls. As well, since without this code the endpoint
	 * would be created the first time "get_ep" in called during a listen or
	 * connect call, creating the endpoint earlier would not be a waste of
	 * resources. This initialization happens once per process, and thus it
	 * does not matter which device is used to create the endpoint.
	 */
	device = plugin->get_device(plugin, 0);
	if (device == NULL) {
		/* Guard against a NULL device rather than dereferencing it
		 * below in get_ep/get_properties. */
		NCCL_OFI_WARN("Failed to get device 0");
		ret = -ENOTSUP;
		goto exit;
	}

	ret = device->get_ep(device, &base_ep);
	if (ret != 0) {
		goto exit;
	}
	ret = device->get_properties(device, &properties);
	if (ret != 0) {
		goto exit;
	}
	NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for global registrations: %s",
		      (properties.regIsGlobal == 0) ? "false" : "true");
	NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for DMA-BUF registrations: %s",
		      (properties.dmabuf_support == 0) ? "false" : "true");
	ret = base_ep->release_ep(base_ep, false, false);
	if (ret != 0) {
		goto exit;
	}

	assert(support_gdr != GDR_UNKNOWN);

	/* we don't actually know if GDR is supported until we've
	 * created the first endpoint, so this check needs to be way
	 * down here
	 */
	if (nic_dup_conns > 0 && support_gdr != GDR_UNSUPPORTED) {
		NCCL_OFI_WARN("NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA.  This configuration is not supported.");
		ret = -ENOTSUP;
		goto exit;
	}
	/* Force SIMPLE protocol when using a provider that does not support
	 * GDR. NCCL disables the LL128 protocol in this case, but leaves the
	 * LL protocol enabled. Without GDR, the LL protocol polls on host
	 * memory for completion flags. In addition to being slow, this assumes
	 * that host memory is updated in 8 byte segments. However, most
	 * providers that do not support HMEM (like the tcp or sockets
	 * providers) do not make any guarantees about data delivery ordering.
	 * There is not a good way to ask Libfabric providers about their data
	 * delivery support in the general case, so take a conservative
	 * approach and force the simple protocol whenever using a provider
	 * that does not support HMEM.
	 */
	if (support_gdr != GDR_SUPPORTED) {
		ret = nccl_net_ofi_configure_nccl_proto_simple("GDR");
		if (ret != 0) {
			goto exit;
		}
	}

	*plugin_p = plugin;

 exit:
	if (ret != 0) {
		NCCL_OFI_WARN(PACKAGE_NAME " initialization failed");
	}
	return ret;
}