in pkg/gpu/nvidia/health_check/health_checker.go [179:226]
// catchError inspects one NVML event and, when it is an Xid-critical error
// whose code is in the checker's configured critical set, marks the affected
// device(s) Unhealthy in hc.devices and publishes each one on hc.health.
// An event carrying no UUID is treated as affecting every tracked device.
func (hc *GPUHealthChecker) catchError(e nvml.Event, cd callDevice) {
	// Only Xid-critical events matter for health reporting.
	if e.Etype != nvml.XidCriticalError {
		glog.Infof("Skip error Xid=%d as it is not Xid Critical", e.Edata)
		return
	}
	// Only marking device unhealthy on Double Bit ECC Error or customer-configured codes
	// See https://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
	if _, critical := hc.healthCriticalXid[e.Edata]; !critical {
		glog.Infof("Health checker is skipping Xid %v error", e.Edata)
		return
	}
	// No UUID on the event means the error is not attributable to a single
	// device, so every device goes unhealthy.
	if e.UUID == nil || len(*e.UUID) == 0 {
		glog.Errorf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata)
		for id, dev := range hc.devices {
			dev.Health = pluginapi.Unhealthy
			hc.devices[id] = dev
			hc.health <- dev
		}
		return
	}
	// NOTE(review): the comparisons below assume e.GpuInstanceId and
	// e.ComputeInstanceId are always non-nil on Xid-critical events — confirm
	// against the nvml bindings.
	matched := false
	for _, dev := range hc.devices {
		// Please see https://github.com/NVIDIA/gpu-monitoring-tools/blob/148415f505c96052cb3b7fdf443b34ac853139ec/bindings/go/nvml/nvml.h#L1424
		// for the rationale why gi and ci can be set as such when the UUID is a full GPU UUID and not a MIG device UUID.
		uuid := hc.nvmlDevices[dev.ID].UUID
		gpu, gi, ci, err := cd.parseMigDeviceUUID(uuid)
		if err != nil {
			// Not a MIG UUID: treat it as a full-GPU UUID with wildcard
			// instance IDs, matching the event's full-GPU encoding.
			gpu, gi, ci = uuid, 0xFFFFFFFF, 0xFFFFFFFF
		}
		if gpu != *e.UUID || gi != *e.GpuInstanceId || ci != *e.ComputeInstanceId {
			continue
		}
		glog.Errorf("XidCriticalError: Xid=%d on Device=%s, uuid=%s, the device will go unhealthy.", e.Edata, dev.ID, uuid)
		dev.Health = pluginapi.Unhealthy
		hc.devices[dev.ID] = dev
		hc.health <- dev
		matched = true
	}
	if !matched {
		glog.Errorf("XidCriticalError: Xid=%d on unknown device.", e.Edata)
	}
}