in pkg/gpu/nvidia/metrics/devices.go [108:130]
// DiscoverGPUDevices enumerates the GPUs reported by NVML and rebuilds the
// package-level gpuDevices map, keyed by device name ("nvidia<minor>").
//
// It returns an error if the device count cannot be read or if a device
// handle cannot be obtained for any index. A device whose minor number cannot
// be read is logged and skipped; it does not cause an error and is not added
// to gpuDevices.
func DiscoverGPUDevices() error {
	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("failed to get device count: %s", nvml.ErrorString(ret))
	}

	glog.Infof("Found %d GPU devices", count)
	// The map is replaced up front, so entries from a previous discovery are
	// dropped even if this run fails part-way through.
	gpuDevices = make(map[string]*nvml.Device, count)
	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			return fmt.Errorf("failed to read device with index %d: %v", i, nvml.ErrorString(ret))
		}

		minor, ret := device.GetMinorNumber()
		if ret != nvml.SUCCESS {
			glog.Errorf("Invalid GPU device minor number found for device index %d: %v. Skipping this device", i, nvml.ErrorString(ret))
			// Without a valid minor number the device name would be built
			// from a meaningless value and could collide with a real
			// device's entry, so do not register it.
			continue
		}

		deviceName := fmt.Sprintf("nvidia%d", minor)
		glog.Infof("Found device %s for metrics collection", deviceName)
		// device is declared inside the loop body, so each iteration stores a
		// pointer to its own distinct variable.
		gpuDevices[deviceName] = &device
	}
	return nil
}