/* platform_init — excerpt from src/platform-aws.cpp [lines 408:594] */

/*
 * AWS EC2 platform-specific initialization hook.
 *
 * Configures environment and plugin defaults for EC2 instances:
 * forces the EFA libfabric provider (unless FI_PROVIDER is set),
 * enables fork safety, tunes NCCL flush/chunk-size knobs, points
 * NCCL at a platform topology file, and applies per-platform
 * defaults for duplicate connections, latency, and protocol.
 *
 * @param provider_filter  Out: set to "efa" when no FI_PROVIDER
 *                         environment override is present.
 * @return ncclSuccess (0) on success, or a negative errno value
 *         on failure.
 */
int platform_init(const char **provider_filter)
{
	int ret = ncclSuccess;
	struct ec2_platform_data *platform_data;
	bool select_efa = false;
	char *fi_provider;

	NCCL_OFI_INFO(NCCL_INIT, "Configuring AWS-specific options");

	platform_data = get_platform_data();

	/* if we're here, we think we're on an EC2 instance, so force
	 * EFA provider (for platforms without EFA, this will cause a
	 * fallback to NCCL's internal TCP.  In the case of Neuron, a
	 * hard failure when there are no NICs.  Both are the
	 * behaviors we want).
	 */
	fi_provider = getenv("FI_PROVIDER");
	if (fi_provider == NULL) {
		NCCL_OFI_INFO(NCCL_INIT, "Setting provider_filter to efa");
		*provider_filter = "efa";
		select_efa = true;
	} else if (0 == strcmp(fi_provider, "efa")) {
		select_efa = true;
	}

#if HAVE_CUDA
	/*
	 * FI_EFA_FORK_SAFE environment variable tells Libfabric to enable
	 * fork-safe support in legacy versions of the rdma-core library.
	 * Libfabric checks if additional handling is required for fork safety,
	 * and does not introduce this additional overhead of setting MADV_DONTFORK
	 * for new versions of rdma-core (38.0 and later) and the Linux kernel
	 * that support copy-on-fork for pinned memory (5.13 and later).
	 * These new versions are always fork-safe and additional support in userspace
	 * is not required.
	 *
	 * When legacy versions of the kernel and rdma-core are used, setting
	 * FI_EFA_FORK_SAFE to 1 disables the use of huge pages in Libfabric.
	 *
	 * To prevent data corruption, the EFA provider registers an atfork
	 * handler which will abort the process whenever it believes
	 * rdma-core is not fork-safe.
	 *
	 * NCCL applications heavily re-use the buffers for communication and
	 * thus are not sensitive to increased memory registration costs.
	 * To prevent NCCL based applications from getting aborted when using
	 * fork(), the plugin explicitly enables FI_EFA_FORK_SAFE environment
	 * variable, even in legacy environments where the overhead is high.
	 *
	 * The Neuron team has asked us to skip trying to set this
	 * environment variable on Neuron platforms, so we only do
	 * this for Nvidia platforms.
	 */
	uint32_t libversion = fi_version();
	/* FI_EFA_FORK_SAFE was introduced in Libfabric 1.13; older
	 * versions only understand the rdma-core RDMAV_FORK_SAFE knob. */
	const char * fork_safe_var_name =
		(FI_MAJOR(libversion) > 1 || (FI_MAJOR(libversion) == 1 && FI_MINOR(libversion) >= 13))
		? "FI_EFA_FORK_SAFE"
		: "RDMAV_FORK_SAFE";
	if (!getenv(fork_safe_var_name)) {
		NCCL_OFI_INFO(NCCL_INIT, "Setting %s environment variable to 1", fork_safe_var_name);
		ret = setenv(fork_safe_var_name, "1", 1);
		if (ret != 0) {
			/* Capture errno before logging: the log call may clobber it. */
			ret = -errno;
			NCCL_OFI_WARN("Unable to set %s", fork_safe_var_name);
			goto exit;
		}
	}

	ret = configure_nvls_option();
	if (ret != 0) {
		NCCL_OFI_WARN("Unable to configure NVLS option");
		goto exit;
	}

	if ((platform_data && !platform_data->net_flush_required) &&
	    NULL == getenv("NCCL_NET_FORCE_FLUSH")) {

		/* Hopper GPUs do not require a network flush, but NCCL versions <2.19.1
		* still enable flush by default on any GPU type.
		* For GPU generations earlier than Hopper, NCCL always enables flush, while
		* for Hopper GPUs flush is enabled or disabled depending on the value of
		* the NCCL_NET_FORCE_FLUSH environment variable. The default value for this
		* variable is 1 for NCCL versions <2.19.1, which forces flush when it is not
		* needed, so it is safe to set it to 0 if it is not explicitly set.
		*/

		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting NCCL_NET_FORCE_FLUSH=0 for Hopper GPUs");
		ret = setenv("NCCL_NET_FORCE_FLUSH", "0", 0);
		if (ret != 0) {
			/* Capture errno before logging: the log call may clobber it. */
			ret = -errno;
			NCCL_OFI_WARN("Unable to set NCCL_NET_FORCE_FLUSH");
			goto exit;
		}
	}

	/*
	 * NCCL v2.19.3 reduced the chunk size used when running NVLS Tree
	 * algorithm on greater than 4 nodes to 64KiB. This drastically impacted
	 * performance on AWS (Ref: https://github.com/NVIDIA/nccl/pull/1112/
	 * for some data). NCCL v2.20.3 has made this a tunable. Based on
	 * empirical testing, a max chunk size of 512KiB recovers from the
	 * regression and was also observed to be the default in v2.19.3.
	 * Setting this unconditionally without relying on ncclGetVersion symbol
	 * being available, since the parameter did not exist in versions prior
	 * to v2.20.
	 *
	 * The NVLSTree chunk size can not be larger than the NVLS chunk size,
	 * so we ensure both are set to 512KiB.
	 */
	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting NCCL_NVLSTREE_MAX_CHUNKSIZE to 512KiB");
	ret = setenv("NCCL_NVLSTREE_MAX_CHUNKSIZE", "524288", 0);
	if (ret != 0) {
		/* Capture errno before logging: the log call may clobber it. */
		ret = -errno;
		NCCL_OFI_WARN("Unable to set NCCL_NVLSTREE_MAX_CHUNKSIZE");
		goto exit;
	}

	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Setting NCCL_NVLS_CHUNKSIZE to 512KiB");
	ret = setenv("NCCL_NVLS_CHUNKSIZE", "524288", 0);
	if (ret != 0) {
		/* Capture errno before logging: the log call may clobber it. */
		ret = -errno;
		NCCL_OFI_WARN("Unable to set NCCL_NVLS_CHUNKSIZE");
		goto exit;
	}
#endif

	/*
	 * Update topology if platform topology is available and
	 * environment variable NCCL_TOPO_FILE is not set.
	 */
	if (getenv("NCCL_TOPO_FILE")) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
			      "Running on %s platform, NCCL_TOPO_FILE environment variable is already set to %s",
			      nccl_net_ofi_get_product_name(), getenv("NCCL_TOPO_FILE"));
	} else if (platform_data && platform_data->topology) {
		char topology_path[PATH_MAX];

		ret = snprintf(topology_path, sizeof(topology_path), "%s/%s",
			       XML_DIR, platform_data->topology);
		if (ret < 0 || (size_t)ret >= sizeof(topology_path)) {
			/* Negative return means an encoding error; a return
			 * >= buffer size means the path was truncated. */
			NCCL_OFI_WARN("Error occurred while forming the complete topology XML file path. RC: %d, Buffer Size: %d, XML dir: %s, Topology file: %s",
				      ret, PATH_MAX, XML_DIR, platform_data->topology);
			ret = -ENOMEM;
			goto exit;
		}

		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
				"Running on %s platform, Setting NCCL_TOPO_FILE environment variable to %s",
				nccl_net_ofi_get_product_name(), topology_path);

		ret = setenv("NCCL_TOPO_FILE", topology_path, 1);
		if (ret != 0) {
			/* Capture errno before logging: the log call may clobber it. */
			ret = -errno;
			NCCL_OFI_WARN("Unable to set NCCL_TOPO_FILE");
			goto exit;
		}

	}

	/* Only apply the platform's duplicate-connection default when the
	 * user has not already configured one. */
	if (nic_dup_conns == 0 && platform_data)
		nic_dup_conns = platform_data->default_dup_conns;

	if (ofi_nccl_net_latency() < 0) {
		if (platform_data && platform_data->latency >= 0.0) {
			net_latency = platform_data->latency;
		} else {
			/*
			 * Empirical testing on P5 had shown that NCCL's
			 * internal tuner choices work better with this value.
			 * While this needs to be revisited for newer
			 * generations of EFA, using it as the fall-through
			 * default for undefined platforms.
			 */
			net_latency = 75.0;
		}
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Internode latency set at %.1f us",
				net_latency);
	}

	/* Platform default protocol applies only when EFA was selected and
	 * the user did not explicitly choose a protocol. */
	if (select_efa && ofi_nccl_protocol() == NULL && platform_data) {
		nccl_ofi_selected_protocol = platform_data->default_protocol;
	}

exit:
	return ret;
}