in pkg/gpu/nvidia/health_check/health_checker.go [144:168]
// addMigEnabledDevice registers the MIG devices of a MIG-enabled GPU for
// health monitoring.
//
// It lists the MIG devices of device, derives a name for each one of the form
// "<deviceName>/gi<N>", where N is the gi value parsed from the MIG UUID
// (presumably the GPU instance ID; confirm against nvml.ParseMigDeviceUUID),
// and stores the MIG device in hc.nvmlDevices under that name. Only names
// that are already keys of hc.devices are registered; others are logged and
// skipped.
//
// It returns an error if the MIG devices cannot be listed or if a MIG UUID
// cannot be parsed. Entries added to hc.nvmlDevices before a parse failure
// are left in place.
func (hc *GPUHealthChecker) addMigEnabledDevice(deviceName string, device *nvml.Device) error {
	glog.Infof("HealthChecker detects MIG is enabled on device %s", deviceName)

	migs, err := device.GetMigDevices()
	if err != nil {
		return fmt.Errorf("error getting MIG devices on device %s. err: %v.", deviceName, err)
	}

	for _, mig := range migs {
		// Only the gi component is needed to build the device name.
		_, gi, _, err := nvml.ParseMigDeviceUUID(mig.UUID)
		if err != nil {
			// Report the parent device name: the values returned by a
			// failed parse carry no usable device identifier.
			return fmt.Errorf("error parsing MIG UUID on device %s, MIG UUID: %s, error %v", deviceName, mig.UUID, err)
		}

		migDeviceName := fmt.Sprintf("%s/gi%d", deviceName, gi)
		if _, ok := hc.devices[migDeviceName]; !ok {
			// Only monitor the devices passed in
			glog.Warningf("Ignoring device %s for health check.", migDeviceName)
			continue
		}

		glog.Infof("Found mig device %s for health monitoring. UUID: %s", migDeviceName, mig.UUID)
		hc.nvmlDevices[migDeviceName] = mig
	}
	return nil
}