// Excerpt from pkg/gpu/nvidia/health_check/health_checker.go (lines 179-226).

// catchError inspects a single NVML event and marks the affected device(s)
// unhealthy when the event is an Xid critical error whose code is in the
// checker's health-critical set. Each device that goes unhealthy is recorded
// in hc.devices and pushed onto hc.health for the plugin to report.
//
// An event carrying no UUID cannot be attributed to one device, so every
// tracked device is marked unhealthy in that case.
func (hc *GPUHealthChecker) catchError(e nvml.Event, cd callDevice) {
	// Skip the error if it's not Xid critical
	if e.Etype != nvml.XidCriticalError {
		glog.Infof("Skip error Xid=%d as it is not Xid Critical", e.Edata)
		return
	}
	// Only marking device unhealthy on Double Bit ECC Error or customer-configured codes
	// See https://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
	if _, ok := hc.healthCriticalXid[e.Edata]; !ok {
		glog.Infof("Health checker is skipping Xid %v error", e.Edata)
		return
	}

	if e.UUID == nil || len(*e.UUID) == 0 {
		// All devices are unhealthy
		glog.Errorf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata)
		for id, d := range hc.devices {
			d.Health = pluginapi.Unhealthy
			hc.devices[id] = d
			hc.health <- d
		}
		return
	}

	// Guard against a malformed event: the UUID is present but the instance
	// ids are not. Dereferencing them below would panic, so log and bail out
	// instead of crashing the health checker.
	if e.GpuInstanceId == nil || e.ComputeInstanceId == nil {
		glog.Errorf("XidCriticalError: Xid=%d on device uuid=%s has no GPU/compute instance id, skipping.", e.Edata, *e.UUID)
		return
	}

	foundErrorDevice := false
	for _, d := range hc.devices {
		// Devices tracked by the plugin but unknown to NVML cannot be
		// matched against the event; skip them rather than dereferencing a
		// missing map entry.
		nd, ok := hc.nvmlDevices[d.ID]
		if !ok {
			glog.Warningf("Device %s is not in the NVML device map, skipping.", d.ID)
			continue
		}
		// Please see https://github.com/NVIDIA/gpu-monitoring-tools/blob/148415f505c96052cb3b7fdf443b34ac853139ec/bindings/go/nvml/nvml.h#L1424
		// for the rationale why gi and ci can be set as such when the UUID is a full GPU UUID and not a MIG device UUID.
		uuid := nd.UUID
		gpu, gi, ci, err := cd.parseMigDeviceUUID(uuid)
		if err != nil {
			// Not a MIG device UUID: the event addresses the whole GPU.
			// 0xFFFFFFFF is NVML's sentinel for "no instance" (see nvml.h).
			gpu = uuid
			gi = 0xFFFFFFFF
			ci = 0xFFFFFFFF
		}

		if gpu == *e.UUID && gi == *e.GpuInstanceId && ci == *e.ComputeInstanceId {
			glog.Errorf("XidCriticalError: Xid=%d on Device=%s, uuid=%s, the device will go unhealthy.", e.Edata, d.ID, uuid)
			d.Health = pluginapi.Unhealthy
			hc.devices[d.ID] = d
			hc.health <- d
			foundErrorDevice = true
		}
	}
	if !foundErrorDevice {
		glog.Errorf("XidCriticalError: Xid=%d on unknown device.", e.Edata)
	}
}