func DiscoverGPUDevices()

in pkg/gpu/nvidia/metrics/devices.go [108:130]


// DiscoverGPUDevices enumerates the GPUs reported by NVML and rebuilds the
// package-level gpuDevices map, keyed by device name ("nvidia<minor>").
//
// It returns an error if the device count cannot be read or if a device
// handle cannot be obtained for some index. A device whose minor number
// cannot be read is logged and skipped; it does not fail discovery.
//
// The map is replaced with a fresh one before enumeration starts, so on an
// error return it holds only the devices registered before the failure.
func DiscoverGPUDevices() error {
	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("failed to get device count: %s", nvml.ErrorString(ret))
	}

	glog.Infof("Found %d GPU devices", count)
	gpuDevices = make(map[string]*nvml.Device)
	for i := 0; i < count; i++ {
		// device is declared anew on every iteration, so the pointer stored
		// below refers to a distinct value per map entry.
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			return fmt.Errorf("failed to read device with index %d: %v", i, nvml.ErrorString(ret))
		}
		minor, ret := device.GetMinorNumber()
		if ret != nvml.SUCCESS {
			glog.Errorf("Invalid GPU device minor number found. Skipping this device")
			// Actually skip: without this the device would be registered
			// under a name built from an invalid minor number, possibly
			// overwriting a valid entry.
			continue
		}
		deviceName := fmt.Sprintf("nvidia%d", minor)
		glog.Infof("Found device %s for metrics collection", deviceName)
		gpuDevices[deviceName] = &device
	}
	return nil
}