in src/nccl_ofi_net.cpp [136:357]
/*
 * Create and fully initialize the network plugin instance.
 *
 * Performs one-time process setup (page-size discovery, optional CUDA init,
 * configuration parameter loading, platform hook), selects a transport
 * protocol (SENDRECV or RDMA), completes plugin initialization, and creates
 * a throw-away endpoint so platform endpoint hooks and GDR detection run
 * before the first listen/connect call.
 *
 * @param plugin_p	Output: on success, set to the initialized plugin.
 * @return		0 on success, negative errno-style code on failure.
 */
int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
{
	int ret = 0;
	const char *provider_filter = NULL;
	/* Initialize to NULL: the no-protocol-selected paths below test
	 * plugin for NULL, so it must never be read uninitialized. */
	nccl_net_ofi_plugin_t *plugin = NULL;
	nccl_net_ofi_ep_t *base_ep = NULL;
	nccl_net_ofi_device_t *device = NULL;
	nccl_ofi_properties_t properties;

	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Initializing " PACKAGE_STRING);

	/* Print Libfabric version */
	uint32_t fab_version = fi_version();
	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using Libfabric version %u.%u", FI_MAJOR(fab_version),
		      FI_MINOR(fab_version));

	long int system_page_size_sysconf = sysconf(_SC_PAGESIZE);
	if (OFI_UNLIKELY(system_page_size_sysconf == -1)) {
		NCCL_OFI_WARN("Failed to get system page size (%d %s)", errno, strerror(errno));
		ret = -ENOTSUP;
		goto exit;
	}
	system_page_size = (size_t)system_page_size_sysconf;
	assert(NCCL_OFI_IS_POWER_OF_TWO(system_page_size));
	assert(system_page_size > 0);

	/*
	 * System page size isn't reflective of the GDR mappings. We're not trying to map a
	 * whole page, but just to find an interval that makes an array-based cache manageable.
	 */
	mr_cache_alignment = std::min(system_page_size, NCCL_OFI_CACHE_PAGE_SIZE);

#if HAVE_CUDA
	ret = nccl_net_ofi_cuda_init();
	if (ret != 0) {
		NCCL_OFI_WARN("CUDA initialization failed.");
		goto exit;
	}
#endif

	/* configuration parameters */
	nic_dup_conns = ofi_nccl_nic_dup_conns();
	net_latency = (float)ofi_nccl_net_latency();
	cq_read_count = ofi_nccl_cq_read_count();

	/* Optional platform hook; may set provider_filter and/or
	 * nccl_ofi_selected_protocol. */
	if (platform_init) {
		ret = platform_init(&provider_filter);
		if (ret != 0) {
			goto exit;
		}
	}

	/* This is ugly, but here's the basic protocol selection
	 * logic:
	 *   1. if the user set NCCL_OFI_PROTOCOL, use that.
	 *   2. if the platform init set nccl_ofi_selected_protocol,
	 *      use that.
	 *   3. If the rdma protocol reports multiple nics per device
	 *      and initialized successfully, use that.
	 *   4. If the sendrecv protocol initialized successfully, use
	 *      that.
	 *   5. If the rdma protocol initialized successfully, use
	 *      that.
	 */
	if (ofi_nccl_protocol()) {
		nccl_ofi_selected_protocol = ofi_nccl_protocol();
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using transport protocol %s (user set)",
			      nccl_ofi_selected_protocol);
	} else if (nccl_ofi_selected_protocol != NULL) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using transport protocol %s (platform set)",
			      nccl_ofi_selected_protocol);
	}

	if (nccl_ofi_selected_protocol != NULL) {
		/* Cases 1 and 2: a protocol was forced; initialize exactly
		 * that one and fail hard if it does not come up. */
		bool dummy;
		if (0 == strcasecmp(nccl_ofi_selected_protocol, "SENDRECV")) {
			ret = nccl_net_ofi_sendrecv_init(provider_filter, &plugin);
			if (ret != 0) {
				NCCL_OFI_WARN("Failed to initialize sendrecv protocol");
				goto exit;
			}
		} else if (0 == strcasecmp(nccl_ofi_selected_protocol, "RDMA")) {
			ret = nccl_net_ofi_rdma_init(provider_filter, &plugin, &dummy);
			if (ret != 0) {
				NCCL_OFI_WARN("Failed to initialize rdma protocol");
				goto exit;
			}
		} else {
			NCCL_OFI_WARN("Unable to find plugin protocol %s", nccl_ofi_selected_protocol);
			ret = -ENOTSUP;
			goto exit;
		}
	} else {
		/* Cases 3-5: probe both protocols and pick the best one. */
		bool have_multiple_rails = false;
		nccl_net_ofi_plugin_t *rdma_plugin = NULL, *sendrecv_plugin = NULL;

		ret = nccl_net_ofi_rdma_init(provider_filter, &rdma_plugin, &have_multiple_rails);
		if (ret != 0) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
				       "Failed to initialize rdma protocol: %s", fi_strerror(-ret));
			have_multiple_rails = false;
			rdma_plugin = NULL;
		}

		/* Only probe sendrecv when rdma is not the clear winner
		 * (multi-rail rdma is always preferred, case 3). */
		if (!have_multiple_rails || rdma_plugin == NULL) {
			ret = nccl_net_ofi_sendrecv_init(provider_filter, &sendrecv_plugin);
			if (ret != 0) {
				NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
					       "Failed to initialize sendrecv protocol: %s", fi_strerror(-ret));
				sendrecv_plugin = NULL;
			}
		}

		if (have_multiple_rails && rdma_plugin != NULL) {
			/* Case 3: multi-rail rdma. */
			nccl_ofi_selected_protocol = "RDMA";
			plugin = rdma_plugin;
			if (sendrecv_plugin != NULL) {
				sendrecv_plugin->release_plugin(sendrecv_plugin);
			}
		} else if (sendrecv_plugin != NULL) {
			/* Case 4: sendrecv initialized successfully. */
			nccl_ofi_selected_protocol = "SENDRECV";
			plugin = sendrecv_plugin;
			if (rdma_plugin != NULL) {
				rdma_plugin->release_plugin(rdma_plugin);
			}
		} else if (rdma_plugin != NULL) {
			/* Case 5: fall back to single-rail rdma when sendrecv
			 * failed. Without this branch a working rdma plugin
			 * was released and initialization failed outright. */
			nccl_ofi_selected_protocol = "RDMA";
			plugin = rdma_plugin;
		}

		if (nccl_ofi_selected_protocol == NULL || plugin == NULL) {
			NCCL_OFI_WARN("Unable to find a protocol that worked. Failing initialization.");
			ret = -EINVAL;
			goto exit;
		}

		/* A failed probe of the non-selected protocol may have left
		 * ret nonzero; we have a working plugin, so clear it. */
		ret = 0;

		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using transport protocol %s",
			      nccl_ofi_selected_protocol);
	}

	/* Domain granularity: explicit env var wins, then platform default,
	 * then one domain per process. */
	if (ofi_nccl_domain_per_thread() != -1) {
		plugin->domain_per_thread = (ofi_nccl_domain_per_thread() > 0);
	} else {
		if (platform_default_domain_per_thread) {
			plugin->domain_per_thread = platform_default_domain_per_thread();
		} else {
			plugin->domain_per_thread = false;
		}
	}
	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Creating one domain per %s",
		      plugin->domain_per_thread ? "thread" : "process");

	ret = plugin->complete_init(plugin);
	if (ret != 0) {
		NCCL_OFI_WARN("Failed to initialize %s protocol", nccl_ofi_selected_protocol);
		goto exit;
	}

	/* In order to set endpoint options and potentially NCCL configuration
	 * options (such as NCCL_PROTO) during the plugin initialization
	 * process, we need to create an endpoint and call the platform hook
	 * "platform_config_endpoint" using "get_ep". This code makes the
	 * assumption that the thread calling "nccl_net_ofi_init" will make
	 * communication calls. As well, since without this code the endpoint
	 * would be created the first time "get_ep" in called during a listen or
	 * connect call, creating the endpoint earlier would not be a waste of
	 * resources. This initialization happens once per process, and thus it
	 * does not matter which device is used to create the endpoint.
	 */
	device = plugin->get_device(plugin, 0);
	ret = device->get_ep(device, &base_ep);
	if (ret != 0) {
		goto exit;
	}
	ret = device->get_properties(device, &properties);
	if (ret != 0) {
		goto exit;
	}
	NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for global registrations: %s",
		      (properties.regIsGlobal == 0) ? "false" : "true");
	NCCL_OFI_INFO(NCCL_NET | NCCL_INIT, "Support for DMA-BUF registrations: %s",
		      (properties.dmabuf_support == 0) ? "false" : "true");
	ret = base_ep->release_ep(base_ep, false, false);
	if (ret != 0) {
		goto exit;
	}

	assert(support_gdr != GDR_UNKNOWN);

	/* we don't actually know if GDR is supported until we've
	 * created the first endpoint, so this check needs to be way
	 * down here
	 */
	if (nic_dup_conns > 0 && support_gdr != GDR_UNSUPPORTED) {
		NCCL_OFI_WARN("NCCL_OFI_NIC_DUP_CONNS set on platform that supports GPUDirect RDMA. This configuration is not supported.");
		ret = -ENOTSUP;
		goto exit;
	}

	/* Force SIMPLE protocol when using a provider that does not support
	 * GDR. NCCL disables the LL128 protocol in this case, but leaves the
	 * LL protocol enabled. Without GDR, the LL protocol polls on host
	 * memory for completion flags. In addition to being slow, this assumes
	 * that host memory is updated in 8 byte segments. However, most
	 * providers that do not support HMEM (like the tcp or sockets
	 * providers) do not make any guarantees about data delivery ordering.
	 * There is not a good way to ask Libfabric providers about their data
	 * delivery support in the general case, so take a conservative
	 * approach and force the simple protocol whenever using a provider
	 * that does not support HMEM.
	 */
	if (support_gdr != GDR_SUPPORTED) {
		ret = nccl_net_ofi_configure_nccl_proto_simple("GDR");
		if (ret != 0) {
			goto exit;
		}
	}

	*plugin_p = plugin;

exit:
	if (ret != 0) {
		NCCL_OFI_WARN(PACKAGE_NAME " initialization failed");
	}
	return ret;
}