static int set_user_data()

in src/nccl_ofi_topo.cpp [466:535]


/*
 * Attach user data to every hwloc topology node that lies on a path
 * from an accelerator or a NIC up towards the topology root.
 *
 * The function first counts the relevant nodes, allocates a vector of
 * user data structs of that size and stores it in ofi_topo->data_vec.
 * It then walks all PCI devices of ofi_topo->topo. For each device that
 * is reported as an accelerator by is_accelerator_dev() or for which
 * get_info_for_node() yields a libfabric info struct,
 * set_userdata_to_root() is invoked with the shared data iterator. For
 * NIC nodes, a duplicate of the matching info struct is additionally
 * stored in the node's user data and ofi_topo->max_group_size is set
 * to 1.
 *
 * @param	ofi_topo
 *		Topology whose `topo` is traversed and whose `data_vec` and
 *		`max_group_size` fields are written.
 * @param	info_list
 *		List of libfabric NIC info structs that is matched against
 *		the PCI topology nodes.
 *
 * @return	0, on success
 *		-ENOMEM, if the user data vector could not be created
 *		-EINVAL, if user data is missing on a NIC node or the NIC
 *		info struct could not be duplicated
 *		error code of the failing helper, otherwise
 *
 * NOTE(review): on failure, ofi_topo->data_vec and any info structs
 * duplicated so far are not released here; presumably the caller tears
 * down the topology -- confirm.
 */
static int set_user_data(nccl_ofi_topo_t *ofi_topo,
				  struct fi_info *info_list)
{
	int ret = 0;
	hwloc_obj_t obj = NULL;
	nccl_ofi_topo_data_iterator_t data_iter;

	/* Retrieve number of topology nodes that have a Nvidia GPU or a NIC in their subtree */
	int num_nodes = 0;
	ret = count_nodes_with_accel_or_nic_in_subtree(ofi_topo->topo, info_list, &num_nodes);
	if (ret != 0) {
		NCCL_OFI_WARN("Failed counting number of nodes that have a Nvidia GPU or NIC in their subtree.");
		return ret;
	}

	/* Create vector that provides one user data struct for each
	 * topology node that has a Nvidia GPU or a NIC in its subtree */
	ofi_topo->data_vec = nccl_ofi_topo_data_vec_create(num_nodes);
	if (!ofi_topo->data_vec) {
		NCCL_OFI_WARN("Could not create user data vector.");
		return -ENOMEM;
	}
	nccl_ofi_topo_set_to_begin(ofi_topo, &data_iter);

	/* Iterate over all PCI topology nodes and find nodes
	 * corresponding to NICs and Nvidia GPUs. From those nodes,
	 * walk up towards the root and set user data. */
	while ((obj = hwloc_get_next_pcidev(ofi_topo->topo, obj))) {
		bool is_accel = false;
		/* Start from NULL so that the checks below are well
		 * defined even if the lookup leaves the out-parameter
		 * untouched when no NIC matches this node. */
		struct fi_info *info = NULL;

		ret = is_accelerator_dev(obj, &is_accel);
		if (ret != 0) {
			NCCL_OFI_WARN("Error while checking whether hwloc topology node is nvidia GPU");
			return ret;
		}

		ret = get_info_for_node(obj, info_list, &info);
		if (ret != 0) {
			NCCL_OFI_WARN("Error while retrieving libfabric NIC info struct corresponding to hwloc topology node");
			return ret;
		}

		if (is_accel || info) {
			ret = set_userdata_to_root(obj, &data_iter);
			if (ret != 0) {
				NCCL_OFI_WARN("Error while setting user data on path to root");
				return ret;
			}
		}

		if (info) {
			/* Copy libfabric NIC info struct and store info struct in
			 * user data of topology node */
			nccl_ofi_topo_data_t *user_data = (nccl_ofi_topo_data_t *)obj->userdata;
			struct fi_info *info_copy = NULL;

			/* The walk towards the root above is expected to have
			 * attached user data to this node; fail cleanly instead
			 * of dereferencing NULL if it did not. */
			if (!user_data) {
				NCCL_OFI_WARN("Missing user data on topology node of libfabric NIC");
				return -EINVAL;
			}

			/* Publish pointer and length together, and only after
			 * the duplication succeeded, so that the user data never
			 * claims an entry it does not hold. */
			info_copy = fi_dupinfo(info);
			if (!info_copy) {
				NCCL_OFI_WARN("Unable to duplicate libfabric NIC info");
				return -EINVAL;
			}

			user_data->info_list = info_copy;
			user_data->info_list_len = 1;

			ofi_topo->max_group_size = 1;
		}

	}

	return 0;
}