in src/nccl_ofi_topo.cpp [792:875]
/*
 * Cut the NIC info list `*info_list` into at most `num_groups` consecutive
 * sub-lists ("groups") and attach each group to the user data of the
 * topology node that corresponds to the group's first info struct (the
 * group leader).
 *
 * The number of groups is capped at `num_infos`. If `num_infos` is not a
 * multiple of the number of groups, the first `num_infos % num_groups`
 * groups receive one additional member.
 *
 * For each created group, the leader node's user data is updated with the
 * group's list head, its length, and `gpu_group_node`, and
 * `topo->max_group_size` is raised to the group's size if it is larger.
 *
 * @param topo           Topology whose hwloc tree is searched and whose
 *                       `max_group_size` is updated.
 * @param info_list      In/out. On entry, head of the list to split. The
 *                       list is cut in place; after each created group,
 *                       `*info_list` points to the remaining, not yet
 *                       grouped part of the list.
 * @param num_infos      Number of info structs in `*info_list`.
 * @param gpu_group_node Node stored in the user data of every group leader.
 * @param num_groups     Requested number of groups.
 *
 * @return 0 on success (including an empty list, for which nothing is done),
 *         -EINVAL on invalid arguments, a list shorter than `num_infos`, a
 *         missing topology node or user data pointer, or an inconsistent
 *         topology state,
 *         otherwise the non-zero error returned by
 *         get_hwloc_pcidev_by_fi_info().
 */
static int create_groups_from_info_list(nccl_ofi_topo_t *topo,
					struct fi_info **info_list,
					int num_infos,
					hwloc_obj_t gpu_group_node,
					int num_groups)
{
	/* An empty list has nothing to distribute. Checking this first also
	 * keeps the divisions below from dividing by zero. */
	if (num_infos <= 0) {
		return 0;
	}

	if (num_groups <= 0) {
		NCCL_OFI_WARN("Invalid number of groups: %d", num_groups);
		return -EINVAL;
	}

	/* Adjust number of groups if input list does not provide enough members */
	num_groups = std::min(num_groups, num_infos);

	/* Number of groups with one additional member. Handles the
	 * case where list size is not a multiple of number of
	 * groups */
	const int num_large_groups = num_infos % num_groups;

	/* Start with the size of a large group; it is reduced by one once
	 * all large groups have been created. Since
	 * 1 <= num_groups <= num_infos at this point, the reduced size is
	 * still at least one, so no group is ever empty. */
	int group_size = num_infos / num_groups + 1;

	/* Sort the provider list to match network rail ordering. See
	 * the documentation comment for platform_sort_rails() for
	 * more information. We do this here so that there is
	 * consistency.
	 *
	 * NOTE(review): the size passed here is the large-group size
	 * (num_infos / num_groups + 1), even when num_large_groups is zero
	 * and no group actually has that many members. Confirm this is what
	 * platform_sort_rails() expects. */
	if (platform_sort_rails != NULL) {
		platform_sort_rails(info_list, static_cast<size_t>(num_infos),
				    static_cast<size_t>(group_size));
	}

	for (int group_idx = 0; group_idx < num_groups; ++group_idx) {
		/* If the number of NIC infos is not a multiple of the
		 * number of groups, latter groups have one member less. */
		if (group_idx == num_large_groups) {
			--group_size;
		}

		struct fi_info *leader = *info_list;
		if (leader == NULL) {
			NCCL_OFI_WARN("NIC info list ended after %d of %d groups",
				      group_idx, num_groups);
			return -EINVAL;
		}

		/* Retrieve topology node of leader */
		hwloc_obj_t obj = NULL;
		const int ret = get_hwloc_pcidev_by_fi_info(topo->topo, leader, &obj);
		if (ret != 0) {
			NCCL_OFI_WARN("Retrieval of topology node corresponding to libfabric NIC failed with error %d",
				      ret);
			return ret;
		}

		if (!obj) {
			NCCL_OFI_WARN("hwloc failed detecting PCI NIC info.");
			return -EINVAL;
		}

		nccl_ofi_topo_data_t *user_data = static_cast<nccl_ofi_topo_data_t *>(obj->userdata);
		if (!user_data) {
			NCCL_OFI_WARN("Invalid user data pointer");
			return -EINVAL;
		}

		/* The leader's node already references the current list head.
		 * This is only tolerated for the last group, in which case the
		 * remaining list is left untouched. */
		if (user_data->info_list == leader) {
			if (group_idx + 1 == num_groups) {
				break;
			}

			NCCL_OFI_WARN("Invalid state of topology. "
				      "This state should not be reached.");
			return -EINVAL;
		}

		/* Locate the last member of the group before modifying any
		 * state, so that a list shorter than num_infos is reported
		 * as an error instead of dereferencing a NULL pointer or
		 * leaving a half-updated topology node behind. */
		struct fi_info *last = leader;
		for (int i = 1; i < group_size; ++i) {
			if (last->next == NULL) {
				NCCL_OFI_WARN("NIC info list is shorter than the expected %d entries",
					      num_infos);
				return -EINVAL;
			}
			last = last->next;
		}

		/* Add list to topology node */
		user_data->info_list = leader;
		user_data->info_list_len = group_size;
		user_data->gpu_group_node = gpu_group_node;

		/* Track maximum group size */
		topo->max_group_size = std::max(topo->max_group_size, group_size);

		/* Cut the list after the last group member and move the
		 * remainder back to the input list */
		*info_list = last->next;
		last->next = NULL;
	}

	return 0;
}