in src/nccl_ofi_net.cpp [452:650]
int nccl_net_ofi_info_properties(nccl_net_ofi_plugin_t *plugin, struct fi_info *nic_prov,
				 int dev_id, int num_devices, nccl_ofi_properties_t *props)
{
	int ret = 0;
	struct fid_nic *nic_info = NULL;
	const char *platform_type = NULL;

	memset(props, 0, sizeof(*props));

	ret = set_nic_props_default(dev_id, nic_prov, props);
	if (ret != 0) {
		goto error;
	}

	/* Change default values as set by NIC attributes */
	nic_info = (struct fid_nic *)nic_prov->nic;
	if (nic_info == NULL) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
			      "No NIC info for dev %d. Supplying default values for NIC properties.",
			      dev_id);
		ret = 0;
		goto exit;
	}
	/* The provider-reported name is NULL if the device is part of a
	 * multirail config; only override the default name when the provider
	 * supplies a value. */
	if (nic_info->device_attr->name) {
		if (props->name) {
			free(props->name);
		}
		props->name = strdup(nic_info->device_attr->name);
		assert(props->name != NULL);
	}
	/*
	 * Determine the scope of MRs for providers in order to report global
	 * registration support to NCCL.
	 *
	 * NCCL uses regIsGlobal to determine support for User Registrations
	 * via the NCCL API. If a provider ties MRs to endpoints, the plugin
	 * cannot support this model, since NCCL maintains a per-domain
	 * registration cache which requires (domain-)global registrations.
	 * Likewise, if we create a separate domain per thread, registrations
	 * are not reported as global even though they are tied to the domain.
	 */
	if ((nic_prov->domain_attr->mr_mode & FI_MR_ENDPOINT) || plugin->domain_per_thread) {
		props->regIsGlobal = 0;
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Global registrations are not supported");
	} else {
		props->regIsGlobal = 1;
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Global registrations supported");
	}
	/* Speed reported in Mbps */
	props->port_speed = nic_info->link_attr->speed / (1e6);

	/*
	 * When running on AWS, newer platforms might return incorrect link
	 * speeds when running a version of the driver that does not contain
	 * this change to query the device:
	 * https://github.com/amzn/amzn-drivers/commit/c4c7926561741c97f78e27836f5687bf16c54b23
	 * AND running a version of libfabric that does not contain this change:
	 * https://github.com/ofiwg/libfabric/pull/10496/commits/fd0c5f0b0abe91fc062ad57834a93f35278d2392
	 *
	 * Until these updates are more widely deployed, the following override
	 * fixes port_speed for impacted platforms.
	 */
	platform_type = nccl_net_ofi_get_product_name();
	if (platform_type != NULL && strcmp(platform_type, "p5en.48xlarge") == 0) {
		NCCL_OFI_TRACE(NCCL_INIT, "Overriding OFI link_attr speed to 200Gbps/link for P5en platform");
		props->port_speed = 200 * (1e3);
	}
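
	/* The PCI path is reported to NCCL for topology detection. Failure to
	 * query it is treated as non-fatal below; we simply report no path. */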
	ret = get_device_pci_path(nic_info, &props->pci_path);
	if (ret != 0) {
		ret = 0;
		props->pci_path = NULL;
	}
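
	/* With NIC duplication (nic_dup_conns > 1), the same physical NIC is
	 * exposed to NCCL as multiple devices; synthesize per-device PCI
	 * paths and names so that GPUs are spread across the duplicates. */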
	if (nic_dup_conns > 1) {
#if HAVE_CUDA
		int num_gpus_visible = nccl_net_ofi_cuda_get_num_devices();
		int active_cuda_device = nccl_net_ofi_cuda_get_active_device_idx();
		int gpus_per_conn = -1;
		int c = 0;

		if (num_gpus_visible <= 0) {
			NCCL_OFI_WARN("Error getting CUDA device count");
			ret = -ENOTSUP;
			goto error;
		}

		if (active_cuda_device < 0 || active_cuda_device >= num_gpus_visible) {
			NCCL_OFI_WARN("Error getting current CUDA device");
			ret = -ENOTSUP;
			goto error;
		}

		gpus_per_conn = num_gpus_visible / num_devices;
		if (gpus_per_conn == 0) {
			gpus_per_conn = 1;
		}

		/* The goal is to have gpus_per_conn GPUs in the local
		 * system think that any given virtual NIC is the one
		 * that they should use, and that it is different from
		 * the other NICs in the system. We do this by only
		 * leaving a valid device id in pci_path when
		 * active_cuda_device / gpus_per_conn is equal to the
		 * NIC dev index we're currently querying. For the
		 * others, we provide a PCI path that points at the PCI
		 * bus itself, which NCCL will interpret to be on a
		 * different complex than the bus (assuming the NIC
		 * bus and GPU bus are the same).
		 *
		 * There are a bunch of assumptions in this logic, such
		 * as that the physical NICs in the system don't have
		 * PCI affinity with the GPUs. Given that we've already
		 * established that GPUDirect doesn't work, this is
		 * probably ok; any affinity is lost by bouncing through
		 * host buffers anyway.
		 */
		if ((active_cuda_device / gpus_per_conn != dev_id) && props->pci_path) {
			for (c = strlen(props->pci_path); props->pci_path[c] != '/'; c--) {
				props->pci_path[c] = '\0';
			}
		}
		NCCL_OFI_TRACE(NCCL_INIT,
			       "Returning synthetic PCI path for device %d of %s",
			       dev_id,
			       props->pci_path);

		/* props->name currently holds a strdup()ed copy of the
		 * provider name, which is too small to also hold the "-%x"
		 * suffix; reallocate it before building the adjusted name. */
		free(props->name);
		props->name = (char *)calloc(FI_NAME_MAX + 2, 1);
		assert(props->name != NULL);
		snprintf(props->name,
			 FI_NAME_MAX + 2,
			 "%s-%x",
			 nic_info->device_attr->name,
			 dev_id);

		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
			       "Adjusted dev %d device name to %s",
			       dev_id,
			       props->name);
#else
		NCCL_OFI_WARN("NIC_DUP_CONNS enabled on a platform that does not support NIC_DUP_CONNS. This should not happen.");
		ret = -ENOTSUP;
		goto error;
#endif
	}

	props->max_mr_key_size = nic_prov->domain_attr->mr_key_size;
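
	/* Advertise DMA-BUF support only when the provider reports FI_HMEM,
	 * the libfabric API version is at least 1.20, and the runtime check
	 * in nccl_ofi_dmabuf_viable() passes. */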
	props->dmabuf_support = ((nic_prov->caps & FI_HMEM) != 0) &&
				FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 20)) &&
				nccl_ofi_dmabuf_viable();
	if (props->dmabuf_support &&
	    strncmp("efa", nic_prov->fabric_attr->prov_name, strlen("efa")) == 0) {
		// Generations 1-3 of EFA have a firmware issue that can result
		// in communication failures with MRs that cover a large number
		// of page entries. This is not usually a problem, because page
		// merging greatly reduces the number of page entries in the MR.
		// However, the RDMA subsystem in the Linux kernel did not
		// properly execute page merging for dmabuf entries until a
		// recent patch
		// (https://web.git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=486055f5e09df9),
		// and the lack of page merging increased the probability of
		// hitting the EFA issue. Testing for the fixed kernel version
		// is effectively impossible (the issue can also be fixed in the
		// EFA kmod itself, and backports are likely, so a simple kernel
		// version check is insufficient), so instead we only support
		// dmabuf by default in Generation 4 of EFA. When the
		// communication failure issue is resolved in previous
		// generations, this code will be removed and dmabuf will be
		// available by default everywhere.
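		//
		// Device ids 0xefa0, 0xefa1, and 0xefa2 correspond to the
		// affected generations 1-3 described above.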
		if (nic_prov->nic == NULL || nic_prov->nic->device_attr == NULL) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
				       "DMA-BUF disabled due to missing nic data");
			props->dmabuf_support = false;
		} else if (strcmp("0xefa0", nic_prov->nic->device_attr->device_id) == 0 ||
			   strcmp("0xefa1", nic_prov->nic->device_attr->device_id) == 0 ||
			   strcmp("0xefa2", nic_prov->nic->device_attr->device_id) == 0) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
				       "DMA-BUF disabled due to EFA device id %s",
				       nic_prov->nic->device_attr->device_id);
			props->dmabuf_support = false;
		}
	}

	if (props->dmabuf_support) {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "DMA-BUF support is advertised in properties.");
	}

	goto exit;
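
/* On error, release any property strings that were allocated above. */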
error:
	if (props->pci_path) {
		free(props->pci_path);
	}
	if (props->name) {
		free(props->name);
	}

exit:
	return ret;
}