in src/platform-aws.cpp [408:594]
int platform_init(const char **provider_filter)
{
	int ret = ncclSuccess;
	struct ec2_platform_data *platform_data;
	bool select_efa = false;
	char *fi_provider;

	NCCL_OFI_INFO(NCCL_INIT, "Configuring AWS-specific options");

	platform_data = get_platform_data();

	/* If we're here, we think we're on an EC2 instance, so force the
	 * EFA provider. On platforms without EFA, this causes a fallback
	 * to NCCL's internal TCP transport; in the case of Neuron, it
	 * causes a hard failure when there are no NICs. Both are the
	 * behaviors we want.
	 */
	fi_provider = getenv("FI_PROVIDER");
	if (fi_provider == NULL) {
		NCCL_OFI_INFO(NCCL_INIT, "Setting provider_filter to efa");
		*provider_filter = "efa";
		select_efa = true;
	} else if (0 == strcmp(fi_provider, "efa")) {
		select_efa = true;
	}
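
	/* Note: when FI_PROVIDER names any provider other than "efa",
	 * select_efa stays false, so the platform default protocol selection
	 * at the bottom of this function is skipped and provider selection
	 * is left entirely to libfabric.
	 */
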
#if HAVE_CUDA
	/*
	 * The FI_EFA_FORK_SAFE environment variable tells Libfabric to enable
	 * fork-safe support in legacy versions of the rdma-core library.
	 * Libfabric checks if additional handling is required for fork safety,
	 * and does not introduce the additional overhead of setting
	 * MADV_DONTFORK for new versions of rdma-core (38.0 and later) and
	 * Linux kernels that support copy-on-fork for pinned memory (5.13 and
	 * later). These new versions are always fork-safe and additional
	 * support in userspace is not required.
	 *
	 * When legacy versions of the kernel and rdma-core are used, setting
	 * FI_EFA_FORK_SAFE to 1 disables the use of huge pages in Libfabric.
	 *
	 * To prevent data corruption, the EFA provider registers an atfork
	 * handler which will abort the process whenever it believes
	 * rdma-core is not fork-safe.
	 *
	 * NCCL applications heavily reuse their communication buffers and
	 * thus are not sensitive to increased memory registration costs.
	 * To prevent NCCL-based applications from being aborted when using
	 * fork(), the plugin explicitly sets the FI_EFA_FORK_SAFE environment
	 * variable, even in legacy environments where the overhead is high.
	 *
	 * The Neuron team has asked us to skip trying to set this
	 * environment variable on Neuron platforms, so we only do
	 * this for Nvidia platforms.
	 */
	uint32_t libversion = fi_version();
	const char *fork_safe_var_name =
		(FI_MAJOR(libversion) > 1 ||
		 (FI_MAJOR(libversion) == 1 && FI_MINOR(libversion) >= 13))
		? "FI_EFA_FORK_SAFE"
		: "RDMAV_FORK_SAFE";
	if (!getenv(fork_safe_var_name)) {
		NCCL_OFI_INFO(NCCL_INIT, "Setting %s environment variable to 1",
			      fork_safe_var_name);
		ret = setenv(fork_safe_var_name, "1", 1);
		if (ret != 0) {
			NCCL_OFI_WARN("Unable to set %s", fork_safe_var_name);
			ret = -errno;
			goto exit;
		}
	}

	ret = configure_nvls_option();
	if (ret != 0) {
		NCCL_OFI_WARN("Unable to configure NVLS option");
		goto exit;
	}

	if ((platform_data && !platform_data->net_flush_required) &&
	    NULL == getenv("NCCL_NET_FORCE_FLUSH")) {
		/* Hopper GPUs do not require a network flush, but NCCL
		 * versions <2.19.1 still enable flush by default on any GPU
		 * type. For GPU generations earlier than Hopper, NCCL always
		 * enables flush, while for Hopper GPUs flush is enabled or
		 * disabled depending on the value of the NCCL_NET_FORCE_FLUSH
		 * environment variable. The default value for this variable
		 * is 1 for NCCL versions <2.19.1, which forces flush when it
		 * is not needed, so it is safe to set it to 0 if it is not
		 * explicitly set.
		 */
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting NCCL_NET_FORCE_FLUSH=0 for Hopper GPUs");
		ret = setenv("NCCL_NET_FORCE_FLUSH", "0", 0);
		if (ret != 0) {
			NCCL_OFI_WARN("Unable to set NCCL_NET_FORCE_FLUSH");
			ret = -errno;
			goto exit;
		}
	}

	/*
	 * NCCL v2.19.3 reduced the chunk size used when running the NVLS Tree
	 * algorithm on more than 4 nodes to 64KiB. This drastically impacted
	 * performance on AWS (Ref: https://github.com/NVIDIA/nccl/pull/1112/
	 * for some data). NCCL v2.20.3 made this a tunable. Based on
	 * empirical testing, a max chunk size of 512KiB recovers from the
	 * regression and was also observed to be the default in v2.19.3.
	 * We set this unconditionally, without relying on the ncclGetVersion
	 * symbol being available, since the parameter did not exist in
	 * versions prior to v2.20.
	 *
	 * The NVLSTree chunk size cannot be larger than the NVLS chunk size,
	 * so we ensure both are set to 512KiB.
	 */
	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting NCCL_NVLSTREE_MAX_CHUNKSIZE to 512KiB");
	ret = setenv("NCCL_NVLSTREE_MAX_CHUNKSIZE", "524288", 0);
	if (ret != 0) {
		NCCL_OFI_WARN("Unable to set NCCL_NVLSTREE_MAX_CHUNKSIZE");
		ret = -errno;
		goto exit;
	}

	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting NCCL_NVLS_CHUNKSIZE to 512KiB");
	ret = setenv("NCCL_NVLS_CHUNKSIZE", "524288", 0);
	if (ret != 0) {
		NCCL_OFI_WARN("Unable to set NCCL_NVLS_CHUNKSIZE");
		ret = -errno;
		goto exit;
	}
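
	/* The NCCL_NET_FORCE_FLUSH and NCCL_NVLS*_CHUNKSIZE setenv() calls
	 * above pass overwrite=0, so a value the user has already exported
	 * (e.g. NCCL_NVLSTREE_MAX_CHUNKSIZE=262144) always takes precedence
	 * over these defaults.
	 */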
#endif

	/*
	 * Update topology if platform topology is available and the
	 * NCCL_TOPO_FILE environment variable is not set.
	 */
	if (getenv("NCCL_TOPO_FILE")) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
			      "Running on %s platform, NCCL_TOPO_FILE environment variable is already set to %s",
			      nccl_net_ofi_get_product_name(), getenv("NCCL_TOPO_FILE"));
	} else if (platform_data && platform_data->topology) {
		char topology_path[PATH_MAX];

		ret = snprintf(topology_path, sizeof(topology_path), "%s/%s",
			       XML_DIR, platform_data->topology);
		if (ret < 0 || (size_t)ret >= sizeof(topology_path)) {
			NCCL_OFI_WARN("Error occurred while forming the complete topology XML file path. RC: %d, Buffer Size: %d, XML dir: %s, Topology file: %s",
				      ret, PATH_MAX, XML_DIR, platform_data->topology);
			ret = -ENOMEM;
			goto exit;
		}

		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
			      "Running on %s platform, Setting NCCL_TOPO_FILE environment variable to %s",
			      nccl_net_ofi_get_product_name(), topology_path);

		ret = setenv("NCCL_TOPO_FILE", topology_path, 1);
		if (ret != 0) {
			NCCL_OFI_WARN("Unable to set NCCL_TOPO_FILE");
			ret = -errno;
			goto exit;
		}
	}
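
	/* For illustration: on a P4d instance the platform data table points
	 * at a file such as "p4d-24xl-topo.xml", so topology_path would
	 * resolve to XML_DIR "/p4d-24xl-topo.xml". The exact file names are
	 * owned by the platform data table and may change.
	 */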

	if (nic_dup_conns == 0 && platform_data)
		nic_dup_conns = platform_data->default_dup_conns;

	if (ofi_nccl_net_latency() < 0) {
		if (platform_data && platform_data->latency >= 0.0) {
			net_latency = platform_data->latency;
		} else {
			/*
			 * Empirical testing on P5 showed that NCCL's internal
			 * tuner choices work better with this value. While
			 * this needs to be revisited for newer generations of
			 * EFA, we use it as the fall-through default for
			 * undefined platforms.
			 */
			net_latency = 75.0;
		}

		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Internode latency set at %.1f us",
			      net_latency);
	}
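
	/* ofi_nccl_net_latency() is negative when the corresponding
	 * OFI_NCCL_NET_LATENCY parameter is unset, so a user-supplied
	 * latency always wins over the platform defaults above.
	 */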

	if (select_efa && ofi_nccl_protocol() == NULL && platform_data) {
		nccl_ofi_selected_protocol = platform_data->default_protocol;
	}
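
	/* For example, the platform data table maps P5-generation instances
	 * to the RDMA protocol (illustrative; the authoritative values live
	 * in the table), while platforms without an entry keep the plugin's
	 * normal protocol selection.
	 */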

exit:
	return ret;
}