in src/nccl_ofi_net.cpp [452:650]
int nccl_net_ofi_info_properties(nccl_net_ofi_plugin_t *plugin, struct fi_info *nic_prov,
				 int dev_id, int num_devices, nccl_ofi_properties_t *props)
{
	int ret = 0;
	struct fid_nic *nic_info = NULL;
	const char *platform_type = NULL;

	memset(props, 0, sizeof(*props));

	ret = set_nic_props_default(dev_id, nic_prov, props);
	if (ret != 0) {
		goto error;
	}

	/* Change default values as set by NIC attributes */
	nic_info = (struct fid_nic *)nic_prov->nic;
	if (nic_info == NULL) {
		NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
			      "No NIC info for dev %d. Supplying default values for NIC properties.",
			      dev_id);
		ret = 0;
		goto exit;
	}
	/* The provider-reported name is NULL if the device is part of a
	 * multirail config; only override the default name when the provider
	 * supplies a value. */
	if (nic_info->device_attr->name) {
		if (props->name) {
			free(props->name);
		}
		props->name = strdup(nic_info->device_attr->name);
		assert(props->name != NULL);
	}
	/*
	 * Determine the scope of MRs for providers in order to report global
	 * registration support to NCCL.
	 *
	 * NCCL uses regIsGlobal to determine support for User Registrations
	 * via the NCCL API. If a provider ties MRs to endpoints, the plugin
	 * cannot support this model, since NCCL maintains a per-domain
	 * registration cache which requires (domain-)global registrations.
	 * Likewise, if we create a separate domain per thread, registrations
	 * are not reported as global even though they are tied to the domain.
	 */
	if ((nic_prov->domain_attr->mr_mode & FI_MR_ENDPOINT) || plugin->domain_per_thread) {
		props->regIsGlobal = 0;
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Global registrations are not supported");
	} else {
		props->regIsGlobal = 1;
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Global registrations supported");
	}
	/* Speed reported in Mbps */
	props->port_speed = nic_info->link_attr->speed / (1e6);

	/*
	 * When running on AWS, newer platforms might return incorrect link
	 * speeds when running a version of the driver that does not contain
	 * this change to query the device:
	 * https://github.com/amzn/amzn-drivers/commit/c4c7926561741c97f78e27836f5687bf16c54b23
	 * AND running a version of libfabric that does not contain this change:
	 * https://github.com/ofiwg/libfabric/pull/10496/commits/fd0c5f0b0abe91fc062ad57834a93f35278d2392
	 *
	 * Until these updates are more widely deployed, the following override
	 * fixes port_speed for impacted platforms.
	 */
	platform_type = nccl_net_ofi_get_product_name();
	if (platform_type != NULL && strcmp(platform_type, "p5en.48xlarge") == 0) {
		NCCL_OFI_TRACE(NCCL_INIT, "Overriding OFI link_attr speed to 200Gbps/link for P5en platform");
		props->port_speed = 200 * (1e3);
	}
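
	/* The PCI path is reported to NCCL for topology detection. Failure to
	 * query it is treated as non-fatal below; we simply report no path. */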
	ret = get_device_pci_path(nic_info, &props->pci_path);
	if (ret != 0) {
		ret = 0;
		props->pci_path = NULL;
	}
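
	/* With NIC duplication (nic_dup_conns > 1), the same physical NIC is
	 * exposed to NCCL as multiple devices; synthesize per-device PCI
	 * paths and names so that GPUs are spread across the duplicates. */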
	if (nic_dup_conns > 1) {
#if HAVE_CUDA
		int num_gpus_visible = nccl_net_ofi_cuda_get_num_devices();
		int active_cuda_device = nccl_net_ofi_cuda_get_active_device_idx();
		int gpus_per_conn = -1;
		int c = 0;

		if (num_gpus_visible <= 0) {
			NCCL_OFI_WARN("Error getting CUDA device count");
			ret = -ENOTSUP;
			goto error;
		}

		if (active_cuda_device < 0 || active_cuda_device >= num_gpus_visible) {
			NCCL_OFI_WARN("Error getting current CUDA device");
			ret = -ENOTSUP;
			goto error;
		}

		gpus_per_conn = num_gpus_visible / num_devices;
		if (gpus_per_conn == 0) {
			gpus_per_conn = 1;
		}

		/* The goal is to have gpus_per_conn GPUs in the local
		 * system think that any given virtual NIC is the one
		 * that they should use, and that it is different from
		 * the other NICs in the system. We do this by only
		 * leaving a valid device id in pci_path when
		 * active_cuda_device / gpus_per_conn is equal to the
		 * NIC dev index we're currently querying. For the
		 * others, we provide a PCI path that points at the PCI
		 * bus itself, which NCCL will interpret to be on a
		 * different complex than the bus (assuming the NIC
		 * bus and GPU bus are the same).
		 *
		 * There are a bunch of assumptions in this logic, such
		 * as that the physical NICs in the system don't have
		 * PCI affinity with the GPUs. Given that we've already
		 * established that GPUDirect doesn't work, this is
		 * probably ok; any affinity is lost by bouncing through
		 * host buffers anyway.
		 */
		if ((active_cuda_device / gpus_per_conn != dev_id) && props->pci_path) {
			for (c = strlen(props->pci_path); props->pci_path[c] != '/'; c--) {
				props->pci_path[c] = '\0';
			}
		}
		NCCL_OFI_TRACE(NCCL_INIT,
			       "Returning synthetic PCI path for device %d of %s",
			       dev_id,
			       props->pci_path);

		/* props->name currently holds a strdup()ed copy of the
		 * provider name, which is too small to also hold the "-%x"
		 * suffix; reallocate it before building the adjusted name. */
		free(props->name);
		props->name = (char *)calloc(FI_NAME_MAX + 2, 1);
		assert(props->name != NULL);
		snprintf(props->name,
			 FI_NAME_MAX + 2,
			 "%s-%x",
			 nic_info->device_attr->name,
			 dev_id);

		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
			       "Adjusted dev %d device name to %s",
			       dev_id,
			       props->name);
#else
		NCCL_OFI_WARN("NIC_DUP_CONNS enabled on a platform that does not support NIC_DUP_CONNS. This should not happen.");
		ret = -ENOTSUP;
		goto error;
#endif
	}

	props->max_mr_key_size = nic_prov->domain_attr->mr_key_size;
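
	/* Advertise DMA-BUF support only when the provider reports FI_HMEM,
	 * the libfabric API version is at least 1.20, and the runtime check
	 * in nccl_ofi_dmabuf_viable() passes. */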
	props->dmabuf_support = ((nic_prov->caps & FI_HMEM) != 0) &&
				FI_VERSION_GE(nic_prov->fabric_attr->api_version, FI_VERSION(1, 20)) &&
				nccl_ofi_dmabuf_viable();
	if (props->dmabuf_support &&
	    strncmp("efa", nic_prov->fabric_attr->prov_name, strlen("efa")) == 0) {
		// Generations 1-3 of EFA have a firmware issue that can result
		// in communication failures with MRs that cover a large number
		// of page entries. This is not usually a problem, because page
		// merging greatly reduces the number of page entries in the MR.
		// However, the RDMA subsystem in the Linux kernel did not
		// properly execute page merging for dmabuf entries until a
		// recent patch
		// (https://web.git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=486055f5e09df9),
		// and the lack of page merging increased the probability of
		// hitting the EFA issue. Testing for the fixed kernel version
		// is effectively impossible (the issue can also be fixed in the
		// EFA kmod itself, and backports are likely, so a simple kernel
		// version check is insufficient), so instead we only support
		// dmabuf by default in Generation 4 of EFA. When the
		// communication failure issue is resolved in previous
		// generations, this code will be removed and dmabuf will be
		// available by default everywhere.
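		//
		// Device ids 0xefa0, 0xefa1, and 0xefa2 correspond to the
		// affected generations 1-3 described above.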
		if (nic_prov->nic == NULL || nic_prov->nic->device_attr == NULL) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
				       "DMA-BUF disabled due to missing nic data");
			props->dmabuf_support = false;
		} else if (strcmp("0xefa0", nic_prov->nic->device_attr->device_id) == 0 ||
			   strcmp("0xefa1", nic_prov->nic->device_attr->device_id) == 0 ||
			   strcmp("0xefa2", nic_prov->nic->device_attr->device_id) == 0) {
			NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
				       "DMA-BUF disabled due to EFA device id %s",
				       nic_prov->nic->device_attr->device_id);
			props->dmabuf_support = false;
		}
	}

	if (props->dmabuf_support) {
		NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "DMA-BUF support is advertised in properties.");
	}

	goto exit;
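
/* On error, release any property strings that were allocated above. */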
error:
	if (props->pci_path) {
		free(props->pci_path);
	}
	if (props->name) {
		free(props->name);
	}

exit:
	return ret;
}