func()

in pkg/gpu/nvidia/health_check/health_checker.go [64:132]


// Start discovers the GPUs visible through NVML, registers them for
// critical Xid error events and launches a goroutine that listens for those
// events.
//
// The steps are:
//  1. Log every entry of hc.devices.
//  2. Enumerate NVML devices by index and hand each one to addMigEnabledDevice
//     (when MIG is enabled on it) or addDevice. Devices whose path cannot be
//     converted to a device name, whose MIG status cannot be read, or which
//     fail addMigEnabledDevice are logged and skipped.
//  3. Create a new NVML event set and register nvml.XidCriticalError for every
//     entry of hc.nvmlDevices.
//  4. Run hc.listenToEvents in a background goroutine; an error returned by it
//     is only logged.
//
// It returns an error when the device count cannot be obtained, when a device
// cannot be read by index, or when event registration fails for a reason other
// than "Not Supported". On success it returns nil right after starting the
// goroutine, without waiting for it.
func (hc *GPUHealthChecker) Start() error {
	glog.Info("Starting GPU Health Checker")

	for name, device := range hc.devices {
		glog.Infof("Healthchecker receives device %s, device %+v", name, device)
	}

	// Build the mapping between device IDs and their NVML representation.
	count, err := nvml.GetDeviceCount()
	if err != nil {
		return fmt.Errorf("failed to get device count: %s", err)
	}

	glog.Infof("Found %d GPU devices", count)
	for i := uint(0); i < count; i++ {
		device, err := nvml.NewDeviceLite(i)
		if err != nil {
			// Unlike the per-device problems below, failing to read a device by
			// index aborts the whole start-up.
			return fmt.Errorf("failed to read device with index %d: %v", i, err)
		}

		deviceName, err := util.DeviceNameFromPath(device.Path)
		if err != nil {
			glog.Errorf("Invalid GPU device path found: %s. Skipping this device", device.Path)
			continue
		}

		migEnabled, err := device.IsMigEnabled()
		if err != nil {
			glog.Errorf("Error checking if MIG is enabled on device %s. Skipping this device. Error: %v", deviceName, err)
			continue
		}

		if !migEnabled {
			hc.addDevice(deviceName, device)
			continue
		}

		if err := hc.addMigEnabledDevice(deviceName, device); err != nil {
			glog.Errorf("Failed to add MIG-enabled device %s for health check. Skipping this device. Error: %v", deviceName, err)
		}
	}

	hc.eventSet = nvml.NewEventSet()
	// NOTE(review): hc.nvmlDevices is presumably populated by addDevice and
	// addMigEnabledDevice above - confirm against those helpers.
	for _, d := range hc.nvmlDevices {
		// When the UUID does not parse as a MIG device UUID, register with the
		// device's own UUID instead. Presumably the first value returned by
		// ParseMigDeviceUUID is the parent GPU's UUID - TODO confirm.
		gpu, _, _, err := nvml.ParseMigDeviceUUID(d.UUID)
		if err != nil {
			gpu = d.UUID
		}

		glog.Infof("Registering device %v. UUID: %s", d.Path, d.UUID)
		if err := nvml.RegisterEventForDevice(hc.eventSet, nvml.XidCriticalError, gpu); err != nil {
			// Only a "Not Supported" failure is tolerated; anything else aborts
			// start-up.
			if !strings.HasSuffix(err.Error(), "Not Supported") {
				return fmt.Errorf("failed to register device %s for NVML eventSet: %v", d.Path, err)
			}
			glog.Warningf("Warning: %s is too old to support healthchecking: %v. It will always be marked healthy.", d.Path, err)
		}
	}

	// Listen for events in the background so Start can return immediately.
	go func() {
		if err := hc.listenToEvents(); err != nil {
			glog.Errorf("GPUHealthChecker listenToEvents error: %v", err)
		}
	}()

	return nil
}