in src/nccl_ofi_rdma.cpp [8006:8217]
int nccl_net_ofi_rdma_init(const char *provider_filter,
nccl_net_ofi_plugin_t **plugin_p,
bool *found_multiple_rails)
{
int ret = 0;
int num_devs = 0;
struct fi_info *provider_list = NULL;
unsigned int num_providers;
nccl_net_ofi_rdma_plugin_t *plugin = NULL;
nccl_ofi_topo_t *topo = NULL;
struct fi_info *hints;
uint32_t api_version = 0;
*found_multiple_rails = false;
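/* Fail fast if deprecated bounce buffer tuning variables are set */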
if (ofi_nccl_deprecated_rdma_min_posted_bounce_buffers() != -1) {
NCCL_OFI_WARN("Use of OFI_NCCL_RDMA_MIN_POSTED_BOUNCE_BUFFERS is deprecated.\n"
"Please use OFI_NCCL_RDMA_MIN_POSTED_CONTROL_BUFFERS or OFI_NCCL_RDMA_MIN_POSTED_EAGER_BUFFERS.");
return -EINVAL;
}
if (ofi_nccl_deprecated_rdma_max_posted_bounce_buffers() != -1) {
NCCL_OFI_WARN("Use of OFI_NCCL_RDMA_MAX_POSTED_BOUNCE_BUFFERS is deprecated.\n"
"Please use OFI_NCCL_RDMA_MAX_POSTED_CONTROL_BUFFERS or OFI_NCCL_RDMA_MAX_POSTED_EAGER_BUFFERS.");
return -EINVAL;
}
hints = fi_allocinfo();
if (hints == NULL) {
NCCL_OFI_WARN("Allocation of fi_info failed");
ret = -FI_ENOMEM;
goto error;
}
get_hints(hints);
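/* Request the Libfabric 1.20 API when DMA-BUF looks viable, otherwise fall
* back to the 1.18 API. */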
api_version = nccl_ofi_dmabuf_viable() ? FI_VERSION(1, 20) : FI_VERSION(1, 18);
ret = nccl_ofi_ofiutils_get_providers(provider_filter, api_version, hints,
&provider_list, &num_providers);
if (ret == 0) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Using Libfabric %u.%u API, with %s support",
FI_MAJOR(api_version),
FI_MINOR(api_version),
FI_VERSION_GE(api_version, FI_VERSION(1, 20)) ? "DMA-BUF" : "GPUDirect RDMA");
/* The 1.18 API allows providers to use CUDA to
* support HMEM pointers, so just having HMEM doesn't
* tell us anything about the usability of CUDA
* pointers with NCCL. So leave the state unknown
* until we create an endpoint and try to disable
* CUDA
*/
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"Using Libfabric 1.18 API, with GPUDirect RDMA support");
support_gdr = GDR_UNKNOWN;
} else if (ret == -FI_ENODATA) {
NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "No eligible providers were found");
fi_freeinfo(hints);
goto error;
} else {
NCCL_OFI_WARN("OFI fi_getinfo() call failed: %s", fi_strerror(ret));
fi_freeinfo(hints);
goto error;
}
fi_freeinfo(hints);
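/* Query provider capabilities (such as memory registration mode and data
* progress model) used by the checks below. */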
ret = nccl_net_ofi_query_provider_capabilities(provider_list, num_providers);
if (ret != 0) {
NCCL_OFI_WARN("Querying provider capabilities failed: %d", ret);
goto error;
}
if (endpoint_mr) {
NCCL_OFI_WARN("RDMA protocol does not support endpoint memory registration.");
ret = -ENOTSUP;
goto error;
}
if ((ssize_t)ofi_nccl_eager_max_size() > (ssize_t)ofi_nccl_min_stripe_size()) {
NCCL_OFI_WARN("Invalid value for EAGER_MAX_SIZE: must not be larger than MIN_STRIPE_SIZE");
ret = -EINVAL;
goto error;
}
/*
* NCCL Net v9 API Optimization for LL/LL128 Protocols
*
* Background:
* When using LL (Low Latency) or LL128 protocols, NCCL sets the request pointer
* to NCCL_NET_OPTIONAL_RECV_COMPLETION in irecv() calls. This indicates that
* the plugin may complete a receive request early, without explicitly
* polling the CQ to validate data arrival. This is possible because NCCL itself,
* following LL protocol semantics, validates data arrival by checking the flag bytes.
*
* Plugin Optimization Details:
* 1. Receiver Side:
* - Marks request completion immediately after CTRL message send completion
* - Does not wait for RDMA write operation completion
*
* 2. Sender Side:
* - Uses fi_write instead of fi_writedata, to eliminate unnecessary CQ entries on RX side
*
* Requirements:
* - Eager message mode is disabled: eager_max_size == -1
* - Provider must use FI_PROGRESS_AUTO data progress model
*/
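/* OFI_NCCL_EARLY_COMPLETION: a negative (unset) value selects automatic
* detection, 0 forces early completion off, and a positive value forces it on. */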
if (ofi_nccl_early_completion() < 0) {
if (!data_progress_auto) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"Early completion disabled due to progress model");
early_completion = false;
} else if (ofi_nccl_eager_max_size() >= 0) {
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"Early completion disabled because eager is enabled");
early_completion = false;
} else {
early_completion = true;
}
} else if (ofi_nccl_early_completion() == 0) {
early_completion = false;
} else {
if (!data_progress_auto) {
NCCL_OFI_WARN("Failed configuration of EARLY_COMPLETION due to provider data progress model is not FI_PROGRESS_AUTO");
ret = -ENOTSUP;
goto error;
}
early_completion = true;
}
if (early_completion && ofi_nccl_eager_max_size() != -1) {
NCCL_OFI_WARN("Conflicted configuration of EARLY_COMPLETION and EAGER_MAX_SIZE");
ret = -ENOTSUP;
goto error;
}
/* Create NCCL OFI topology */
topo = nccl_ofi_topo_create(provider_list);
if (!topo) {
NCCL_OFI_WARN("Failed to create NCCL OFI topology");
ret = -ENOTSUP;
goto error;
}
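/* Group NICs into rail sets; max_group_size reports the size of the largest group */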
ret = nccl_ofi_topo_group(topo);
if (ret != 0) {
NCCL_OFI_WARN("Failed to group NICs");
goto error;
}
if (topo->max_group_size > MAX_NUM_RAILS) {
NCCL_OFI_WARN("Unexpected topo group size of %d (maximum %d)",
topo->max_group_size, MAX_NUM_RAILS);
ret = -EINVAL;
goto error;
}
if (topo->max_group_size < 1) {
NCCL_OFI_WARN("Unexpected group size %d", topo->max_group_size);
ret = -EINVAL;
goto error;
}
if (topo->max_group_size > 1) {
*found_multiple_rails = true;
}
/**
* NCCL's topology detection will set NIC PCIe link speed based on the
* "leader" NIC for the GPU. For multi-rail platforms, we increase the
* link speed reported to NCCL to account for the other rails. This
* requires generating a topology file that will be passed to NCCL.
*/
if (topo->max_group_size > 1) {
ret = write_topo_file(topo);
if (ret != 0) {
NCCL_OFI_WARN("Failed to write NCCL topology file");
goto error;
}
}
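/* Each NIC info list in the topology corresponds to one device exposed to NCCL */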
ret = nccl_ofi_topo_num_info_lists(topo, &num_devs);
if (ret != 0) {
goto error;
} else if (num_devs <= 0) {
NCCL_OFI_WARN("Topology reported unexpected number of devices. "
"Expected value larger than zero but got %i",
num_devs);
ret = -EINVAL;;
goto error;
}
ret = nccl_net_ofi_rdma_plugin_create(num_devs, topo, &plugin);
if (ret != 0) {
NCCL_OFI_WARN("Unable to allocate nccl_net_ofi_plugin_t");
goto error;
}
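/* Determine the CPU L1 data cache line size, falling back to the predefined
* default if sysconf cannot report it. */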
cpu_cache_line_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
if (cpu_cache_line_size < 0) {
NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
"Unable to obtain CPU cache line size from sysconf. "
"fallback to predefined value %llu",
NCCL_OFI_DEFAULT_CPU_CACHE_LINE_SIZE);
cpu_cache_line_size = NCCL_OFI_DEFAULT_CPU_CACHE_LINE_SIZE;
}
*plugin_p = &plugin->base;
return ret;
error:
if (plugin != NULL) {
plugin->base.release_plugin(&plugin->base);
plugin = NULL;
}
return ret;
}