optimum/amd/topology_utils.py (52 lines of code) (raw):

from itertools import combinations

import numpy as np


try:
    import amdsmi
except ImportError:
    # Only `get_bandwidth_matrix` talks to the driver. Deferring the failure to
    # that call keeps the pure cluster-selection helper importable on hosts
    # without the AMD SMI bindings.
    amdsmi = None


def _query_direct_bandwidths(devices):
    """
    Builds the matrix of directly reported bandwidths for every ordered device pair.

    The diagonal is `float("inf")`. An entry stays `None` when the query for that
    pair raises or reports a bandwidth of 0.
    """
    num_devices = len(devices)
    bandwidth_matrix = [[None] * num_devices for _ in range(num_devices)]

    for i, src_device in enumerate(devices):
        for j, dst_device in enumerate(devices):
            if i == j:
                bandwidth_matrix[i][j] = float("inf")
                continue

            try:
                curr_bandwidth = amdsmi.amdsmi_get_minmax_bandwidth(src_device, dst_device)["max_bandwidth"]
            except Exception:
                # Deliberately best effort: a pair whose query fails is treated
                # as having no direct link and may be filled by the relay pass.
                continue

            if curr_bandwidth != 0:
                bandwidth_matrix[i][j] = curr_bandwidth

    return bandwidth_matrix


def _fill_indirect_bandwidths(bandwidth_matrix):
    """
    Fills, in place, the `None` entries that can be reached through a relay device.

    For a missing pair (i, j), every other device k with known i->k and k->j
    entries is a candidate relay whose bandwidth is the smaller of the two hops;
    the best candidate is stored. Pairs are visited in row-major order and
    entries written earlier in the sweep are visible to later pairs, so a stored
    value may describe a path through more than one relay.
    """
    num_devices = len(bandwidth_matrix)

    for i in range(num_devices):
        for j in range(num_devices):
            if bandwidth_matrix[i][j] is not None:
                continue

            best_bottleneck = 0
            for k in range(num_devices):
                if k == i or k == j:
                    continue

                first_hop = bandwidth_matrix[i][k]
                second_hop = bandwidth_matrix[k][j]
                if first_hop is None or second_hop is None:
                    continue

                best_bottleneck = max(best_bottleneck, min(first_hop, second_hop))

            # Entry (i, j) is never read while scanning its own relays, so
            # writing once after the scan is equivalent to updating it on the fly.
            if best_bottleneck > 0:
                bandwidth_matrix[i][j] = best_bottleneck


def get_bandwidth_matrix():
    """
    Returns a matrix of bandwidths between all GPU devices in the system.

    Entry [i][j] is `float("inf")` when i == j, the `max_bandwidth` reported by
    `amdsmi_get_minmax_bandwidth` when that query succeeds with a non-zero value,
    otherwise the best relayed bandwidth found through other devices, and 0 when
    no such path exists.

    Raises:
        ImportError: if the `amdsmi` package could not be imported.
    """
    if amdsmi is None:
        raise ImportError("The `amdsmi` package is required to query the bandwidth between GPU devices.")

    amdsmi.amdsmi_init()
    try:
        devices = amdsmi.amdsmi_get_device_handles()
        bandwidth_matrix = _query_direct_bandwidths(devices)
    finally:
        # Always release the library, even when enumerating the devices fails.
        amdsmi.amdsmi_shut_down()

    _fill_indirect_bandwidths(bandwidth_matrix)

    # Whatever is still unknown has neither a direct nor a relayed path.
    return [[0 if bandwidth is None else bandwidth for bandwidth in row] for row in bandwidth_matrix]


def extract_max_avg_bandwidth_cluster(bandwidth_matrix, cluster_num_devices):
    """
    Returns the cluster of a given number of devices that has the maximum average bandwidth between devices.

    Args:
        bandwidth_matrix: square matrix where entry [i][j] is the bandwidth between
            devices i and j; diagonal entries are ignored.
        cluster_num_devices: number of devices the cluster must contain.

    Returns:
        A tuple `(cluster, avg_bandwidth)` where `cluster` is the list of selected
        device indices and `avg_bandwidth` is the mean of the off-diagonal entries
        of the sub-matrix restricted to these devices. Ties are resolved in favor
        of the first cluster in `itertools.combinations` order. A single-device
        cluster is always `([0], float("inf"))`.

    Raises:
        ValueError: if `cluster_num_devices` is smaller than 1 or larger than the
            number of devices described by `bandwidth_matrix`.
    """
    num_devices = len(bandwidth_matrix)

    if num_devices < cluster_num_devices:
        raise ValueError("Number of devices in the cluster cannot be greater than the number of devices in the system")

    if cluster_num_devices < 1:
        raise ValueError("Number of devices in the cluster must be at least 1")

    if cluster_num_devices == 1:
        return [0], float("inf")

    matrix = np.asarray(bandwidth_matrix, dtype=float)
    # The diagonal holds a device's bandwidth to itself and must not enter the mean.
    off_diagonal = ~np.eye(cluster_num_devices, dtype=bool)

    # Start below every possible average so that a cluster is returned even
    # when none of the devices are connected (all averages equal to 0).
    max_avg_bandwidth = float("-inf")
    max_bandwidth_cluster = None

    for cluster in combinations(range(num_devices), cluster_num_devices):
        curr_bandwidth_matrix = matrix[np.ix_(cluster, cluster)]
        curr_avg_bandwidth = np.mean(curr_bandwidth_matrix, where=off_diagonal)

        if curr_avg_bandwidth > max_avg_bandwidth:
            max_avg_bandwidth = curr_avg_bandwidth
            max_bandwidth_cluster = list(cluster)

    return max_bandwidth_cluster, max_avg_bandwidth