in src/nccl_ofi_topo.cpp [466:535]
/*
 * @brief	Attach user data to the hwloc topology nodes that lead from
 *		accelerator and NIC PCI devices towards the root
 *
 * Counts the topology nodes that have an accelerator or NIC in their
 * subtree, creates `ofi_topo->data_vec` with that many entries, and then
 * walks all PCI devices of the topology. For each PCI device that is an
 * accelerator or that matches an entry of `info_list`,
 * set_userdata_to_root() is invoked with an iterator over the vector. For
 * each PCI device that matches a NIC, a duplicate of the matching
 * libfabric info struct is stored in the user data of the device node.
 *
 * @param	ofi_topo
 *		Topology whose `topo` nodes are annotated. `data_vec` is
 *		(re)assigned and `max_group_size` is set to 1 when at least
 *		one NIC node is found.
 * @param	info_list
 *		List of libfabric NIC info structs used to identify NIC nodes
 *
 * @return	0, on success
 *		-ENOMEM, if the user data vector could not be created
 *		-EINVAL, if the NIC info struct could not be duplicated or
 *		a NIC node carries no user data
 *		non-zero error code of a failing helper, otherwise
 *
 * NOTE(review): on failure, `ofi_topo->data_vec` and any info structs
 * duplicated so far are left in place; presumably the caller releases
 * them when tearing down `ofi_topo` -- confirm.
 */
static int set_user_data(nccl_ofi_topo_t *ofi_topo,
			 struct fi_info *info_list)
{
	int ret = 0;
	hwloc_obj_t obj = NULL;
	nccl_ofi_topo_data_iterator_t data_iter;

	/* Retrieve number of topology nodes that have a Nvidia GPU or a NIC in their subtree */
	int num_nodes = 0;
	ret = count_nodes_with_accel_or_nic_in_subtree(ofi_topo->topo, info_list, &num_nodes);
	if (ret != 0) {
		NCCL_OFI_WARN("Failed counting number of nodes that have a Nvidia GPU or NIC in their subtree.");
		return ret;
	}

	/* Create vector that provides one user data struct for each
	 * topology node that has a Nvidia GPU or a NIC in its subtree */
	ofi_topo->data_vec = nccl_ofi_topo_data_vec_create(num_nodes);
	if (!ofi_topo->data_vec) {
		NCCL_OFI_WARN("Could not create user data vector.");
		return -ENOMEM;
	}

	nccl_ofi_topo_set_to_begin(ofi_topo, &data_iter);

	/* Iterate over all PCI topology nodes and find nodes
	 * corresponding to NICs and Nvidia GPUs. From those nodes,
	 * walk up towards the root and set user data. */
	while ((obj = hwloc_get_next_pcidev(ofi_topo->topo, obj))) {
		bool is_accel = false;
		/* Start from NULL so that the checks below never read an
		 * indeterminate pointer, regardless of whether the helper
		 * writes its out-parameter when no NIC matches. */
		struct fi_info *info = NULL;

		ret = is_accelerator_dev(obj, &is_accel);
		if (ret != 0) {
			NCCL_OFI_WARN("Error while checking whether hwloc topology node is nvidia GPU");
			return ret;
		}

		ret = get_info_for_node(obj, info_list, &info);
		if (ret != 0) {
			NCCL_OFI_WARN("Error while retrieving libfabric NIC info struct corresponding to hwloc topology node");
			return ret;
		}

		if (is_accel || info) {
			ret = set_userdata_to_root(obj, &data_iter);
			if (ret != 0) {
				NCCL_OFI_WARN("Error while setting user data on path to root");
				return ret;
			}
		}

		if (info) {
			/* Copy libfabric NIC info struct and store info struct in
			 * user data of topology node */
			nccl_ofi_topo_data_t *user_data = (nccl_ofi_topo_data_t *)obj->userdata;
			if (!user_data) {
				NCCL_OFI_WARN("Topology node of NIC has no user data attached");
				return -EINVAL;
			}

			user_data->info_list = fi_dupinfo(info);
			if (!user_data->info_list) {
				NCCL_OFI_WARN("Unable to duplicate libfabric NIC info");
				return -EINVAL;
			}

			/* Publish the length only once the list is valid, so
			 * that a failed duplication does not leave a node
			 * that claims one entry but holds a NULL list. */
			user_data->info_list_len = 1;
			ofi_topo->max_group_size = 1;
		}
	}

	return 0;
}