in pkg/gpu/nvidia/health_check/health_checker.go [64:132]
// Start prepares the health checker and begins listening for NVML events.
//
// It enumerates the GPUs reported by NVML and hands each one to addDevice, or
// to addMigEnabledDevice when the device reports MIG as enabled. It then
// creates a new NVML event set, registers every entry of hc.nvmlDevices for
// XidCriticalError events on it, and launches a goroutine that runs
// hc.listenToEvents.
//
// A device is logged and skipped (Start still succeeds) when its path cannot
// be converted to a device name, when its MIG status cannot be read, or when
// addMigEnabledDevice fails for it.
//
// Start returns an error when the device count cannot be read, when a device
// cannot be opened by index, or when event registration fails with an error
// whose text does not end in "Not Supported".
func (hc *GPUHealthChecker) Start() error {
	glog.Info("Starting GPU Health Checker")
	for name, device := range hc.devices {
		// %+v (not "%v+") so that struct field names are included in the output.
		glog.Infof("Healthchecker receives device %s, device %+v", name, device)
	}

	// Build the mapping between device IDs and their NVML representation.
	count, err := nvml.GetDeviceCount()
	if err != nil {
		return fmt.Errorf("failed to get device count: %v", err)
	}
	glog.Infof("Found %d GPU devices", count)

	for i := uint(0); i < count; i++ {
		device, err := nvml.NewDeviceLite(i)
		if err != nil {
			return fmt.Errorf("failed to read device with index %d: %v", i, err)
		}

		deviceName, err := util.DeviceNameFromPath(device.Path)
		if err != nil {
			glog.Errorf("Invalid GPU device path found: %s. Skipping this device", device.Path)
			continue
		}

		migEnabled, err := device.IsMigEnabled()
		if err != nil {
			glog.Errorf("Error checking if MIG is enabled on device %s. Skipping this device. Error: %v", deviceName, err)
			continue
		}

		if !migEnabled {
			hc.addDevice(deviceName, device)
			continue
		}
		if err := hc.addMigEnabledDevice(deviceName, device); err != nil {
			glog.Errorf("Failed to add MIG-enabled device %s for health check. Skipping this device. Error: %v", deviceName, err)
		}
	}

	hc.eventSet = nvml.NewEventSet()
	for _, d := range hc.nvmlDevices {
		// Register with the first value returned by ParseMigDeviceUUID; when
		// the UUID does not parse as a MIG device UUID, use it unchanged.
		gpu, _, _, err := nvml.ParseMigDeviceUUID(d.UUID)
		if err != nil {
			gpu = d.UUID
		}

		glog.Infof("Registering device %v. UUID: %s", d.Path, d.UUID)
		if err := nvml.RegisterEventForDevice(hc.eventSet, nvml.XidCriticalError, gpu); err != nil {
			// A "Not Supported" error is tolerated: the device is simply left
			// unregistered instead of failing startup.
			if strings.HasSuffix(err.Error(), "Not Supported") {
				glog.Warningf("Warning: %s is too old to support healthchecking: %v. It will always be marked healthy.", d.Path, err)
				continue
			}
			return fmt.Errorf("failed to register device %s for NVML eventSet: %v", d.Path, err)
		}
	}

	// NOTE(review): nothing in this method stops or waits for this goroutine;
	// confirm its lifetime is managed elsewhere.
	go func() {
		if err := hc.listenToEvents(); err != nil {
			glog.Errorf("GPUHealthChecker listenToEvents error: %v", err)
		}
	}()

	return nil
}