in src/nccl_ofi_net.c [984:1080]
/*
 * One-time plugin initialization.
 *
 * Steps, in order:
 *   1. Record the logger callback supplied by the caller.
 *   2. Make sure RDMAV_FORK_SAFE is present in the environment.
 *   3. Query the fi_info list for a single provider and, when that
 *      provider is TCP, prune the list.
 *   4. Record whether the provider asks for local (FI_MR_LOCAL) and
 *      device (FI_MR_HMEM) memory registration.
 *   5. Allocate the per-device component pointer array with every
 *      slot set to NULL.
 *
 * @param logFunction  Logger callback stored in ofi_log_function.
 *
 * @return ncclSuccess when every step succeeds, ncclSystemError otherwise.
 *         State populated before a failing step is left as-is.
 */
static ncclResult_t ofi_init(ncclDebugLogger_t logFunction)
{
	ncclResult_t status = ncclSuccess;
	char *prov_include = NULL;
	int dev;

	ofi_log_function = logFunction;

	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using " PACKAGE_STRING);

	/*
	 * rdma-core becomes fork-safe when RDMAV_FORK_SAFE is set, at the
	 * price of noticeably more expensive memory registration when huge
	 * pages are in use.
	 *
	 * The EFA provider installs an atfork handler that aborts the
	 * process if it thinks rdma-core is not fork-safe, in order to
	 * avoid data corruption.
	 *
	 * NCCL applications re-use their communication buffers heavily, so
	 * the extra registration cost does not matter to them. The plugin
	 * therefore turns RDMAV_FORK_SAFE on itself, unless the user has
	 * already set it, so that applications calling fork() are not
	 * aborted.
	 */
	if (getenv("RDMAV_FORK_SAFE") == NULL) {
		NCCL_OFI_INFO(NCCL_INIT, "Setting RDMAV_FORK_SAFE environment variable to 1.");
		if (setenv("RDMAV_FORK_SAFE", "1", 1) != 0) {
			NCCL_OFI_WARN("Unable to set RDMAV_FORK_SAFE");
			status = ncclSystemError;
			goto out;
		}
	}

	/* Fetch the fi_info entries (one per NIC) of a single provider */
	status = get_ofi_provider(prov_include, &ofi_info_list);
	if (status != 0 || ofi_info_list == NULL) {
		status = ncclSystemError;
		goto out;
	}

	/*
	 * For the TCP provider, drop interfaces and address formats that
	 * are not needed. Filtering may leave the list empty.
	 */
	if (strncmp(ofi_info_list->fabric_attr->prov_name, "tcp", strlen("tcp")) == 0) {
		filter_tcp_info_list();
		if (OFI_UNLIKELY(ofi_info_list == NULL)) {
			NCCL_OFI_WARN("No viable endpoint found for TCP provider. Try and relax the filters using OFI_NCCL_USE_IPV6_TCP or OFI_NCCL_EXCLUDE_TCP_IF environment variables");
			status = ncclSystemError;
			goto out;
		}
	}

	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Selected Provider is %s",
		      ofi_info_list->fabric_attr->prov_name);

	/* Does the provider want local memory buffers registered? */
	if (!(ofi_info_list->domain_attr->mr_mode & FI_MR_LOCAL)) {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Provider %s does not require registration of local memory buffers",
			       ofi_info_list->fabric_attr->prov_name);
	} else {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Provider %s requires registration of local memory buffers",
			       ofi_info_list->fabric_attr->prov_name);
		local_mr = true;
	}

	/* Does the provider want device (heterogeneous) memory registered? */
	if (!(ofi_info_list->domain_attr->mr_mode & FI_MR_HMEM)) {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Provider %s does not require registration of device buffers",
			       ofi_info_list->fabric_attr->prov_name);
	} else {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Provider %s requires registration of device buffers",
			       ofi_info_list->fabric_attr->prov_name);
		hmem_mr = true;
	}

	/*
	 * Only the array of component pointers is created here; each
	 * component itself is allocated later, when it is first used.
	 */
	nccl_ofi_component =
		(nccl_ofi_t **)malloc(sizeof(nccl_ofi_t *) * ofi_ndevices);
	if (OFI_UNLIKELY(nccl_ofi_component == NULL)) {
		NCCL_OFI_WARN("Unable to allocate nccl_ofi_component");
		status = ncclSystemError;
		goto out;
	}

	/* Mark every slot as "not yet allocated" */
	for (dev = 0; dev < ofi_ndevices; dev++) {
		nccl_ofi_component[dev] = NULL;
	}

out:
	if (status != ncclSuccess) {
		NCCL_OFI_WARN(PACKAGE_NAME " initialization failed");
	}

	return status;
}