static int create_groups_from_info_list()

in src/nccl_ofi_topo.cpp [792:875]


/*
 * Split a linked list of libfabric NIC infos into groups and attach
 * each group to the topology node of its first ("leader") NIC.
 *
 * The list `*info_list` is cut into at most `num_groups` consecutive
 * sub-lists. If `num_infos` is not a multiple of the number of
 * groups, the first `num_infos % num_groups` groups receive one
 * additional member. For every group, the user data of the leader's
 * topology node is updated with the sub-list, its length and
 * `gpu_group_node`, and `topo->max_group_size` is raised if needed.
 *
 * @param topo            Topology whose nodes are annotated
 * @param info_list       In/out: head of the NIC info list. On return
 *                        it points to the part of the list that was
 *                        not assigned to a group.
 * @param num_infos       Number of elements in `*info_list`
 * @param gpu_group_node  Node stored in each group leader's user data
 * @param num_groups      Requested number of groups; clamped to
 *                        `num_infos`
 *
 * @return 0 on success (including an empty list, for which nothing is
 *         done), -EINVAL on invalid arguments or inconsistent
 *         topology/list state, or the non-zero error reported by
 *         get_hwloc_pcidev_by_fi_info().
 */
static int create_groups_from_info_list(nccl_ofi_topo_t *topo,
						 struct fi_info **info_list,
						 int num_infos,
						 hwloc_obj_t gpu_group_node,
						 int num_groups)
{
	if (!topo || !info_list) {
		NCCL_OFI_WARN("Invalid topology or info list pointer");
		return -EINVAL;
	}

	/* An empty list yields no groups */
	if (num_infos == 0) {
		return 0;
	}

	/* Reject counts that would otherwise lead to a division by
	 * zero (or a meaningless split) below */
	if (num_infos < 0 || num_groups <= 0) {
		NCCL_OFI_WARN("Invalid number of NIC infos or groups");
		return -EINVAL;
	}

	/* Adjust number of groups if input list does not provide enough members */
	num_groups = std::min(num_groups, num_infos);
	/* Number of groups with one additional member. Handles the
	 * case where list size is not a multiple of number of
	 * groups */
	const int num_large_groups = num_infos % num_groups;
	int group_size = num_infos / num_groups + 1;

	/* sort the provider list to match network rail ordering.  See
	 * the documentation comment for platform_sort_rails() for
	 * more information.  We do this here so that there is
	 * consistency
	 *
	 * NOTE(review): group_size is the size of a "large" group at
	 * this point, even when num_large_groups is 0 and every group
	 * ends up one member smaller. Confirm that this is the value
	 * platform_sort_rails() expects as its third argument.
	 */
	if (platform_sort_rails != NULL) {
		platform_sort_rails(info_list, static_cast<size_t>(num_infos),
				    static_cast<size_t>(group_size));
	}

	for (int group_idx = 0; group_idx < num_groups; ++group_idx) {
		hwloc_obj_t obj = NULL;
		/* If the number of NIC infos is not a multiple of
		 * group size, latter candidates have one candidate
		 * less. */
		if (group_idx == num_large_groups) --group_size;
		if (group_size == 0) break;

		/* The list must still provide a leader for this group */
		if (!*info_list) {
			NCCL_OFI_WARN("NIC info list is shorter than expected");
			return -EINVAL;
		}

		/* Retrieve topology node of leader */
		const int ret = get_hwloc_pcidev_by_fi_info(topo->topo, *info_list, &obj);
		if (ret != 0) {
			NCCL_OFI_WARN("Retrieval of topology node corresponding to libfabric NIC failed with error");
			return ret;
		}
		if (!obj) {
			NCCL_OFI_WARN("hwloc failed detecting PCI NIC info.");
			return -EINVAL;
		}

		nccl_ofi_topo_data_t *user_data = static_cast<nccl_ofi_topo_data_t *>(obj->userdata);
		if (!user_data) {
			NCCL_OFI_WARN("Invalid user data pointer");
			return -EINVAL;
		}

		/* The leader node already holds this list. This is only
		 * tolerated for the last group; grouping stops there. */
		if (user_data->info_list == *info_list) {
			if (group_idx + 1 == num_groups) {
				break;
			}
			NCCL_OFI_WARN("Invalid state of topology. "
				      "This state should not be reached.");
			return -EINVAL;
		}

		/* Find the last element of this group before touching
		 * the topology node, so that a too-short list is
		 * reported without leaving a half-written group. */
		struct fi_info *end = *info_list;
		for (int i = 1; i < group_size; ++i) {
			if (!end->next) {
				NCCL_OFI_WARN("NIC info list is shorter than expected");
				return -EINVAL;
			}
			end = end->next;
		}

		/* Add list topology node */
		user_data->info_list = *info_list;
		user_data->info_list_len = group_size;
		user_data->gpu_group_node = gpu_group_node;

		/* Track maximum group size */
		topo->max_group_size = std::max(topo->max_group_size, group_size);

		/* Cut list into two lists after group size list
		 * elements and move list remainder to input list */
		*info_list = end->next;
		end->next = NULL;
	}

	return 0;
}