in pkg/gpu/nvidia/nvidia.go [105:167]
// watchXIDs subscribes to NVML XID critical-error events for the physical GPUs
// backing devs and sends every device that should be considered unhealthy on
// xids. It runs until ctx is cancelled.
//
// devs may contain several virtual devices that share one physical GPU; each
// physical GPU is registered with NVML only once, and a fault on it is reported
// for every device in devs that is backed by it.
//
// The function panics if registering a GPU for events fails for any reason
// other than the GPU not supporting event-based health checking.
func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
	eventSet := nvml.NewEventSet()
	defer nvml.DeleteEventSet(eventSet)

	// report delivers d on xids but gives up as soon as ctx is cancelled, so
	// this goroutine cannot block forever once the consumer has stopped
	// reading. It returns false when the watcher should stop.
	report := func(d *pluginapi.Device) bool {
		select {
		case xids <- d:
			return true
		case <-ctx.Done():
			return false
		}
	}

	var physicalDeviceIDs []string
	// We don't have to loop all virtual GPUs here. Only need to check physical GPUs.
	for _, d := range devs {
		physicalDeviceID := getPhysicalDeviceID(d.ID)
		if physicialDeviceExists(physicalDeviceIDs, physicalDeviceID) {
			continue
		}
		physicalDeviceIDs = append(physicalDeviceIDs, physicalDeviceID)
		log.Printf("virtual id %s physical id %s", d.ID, physicalDeviceID)

		err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, physicalDeviceID)
		if err == nil {
			continue
		}
		if !strings.HasSuffix(err.Error(), "Not Supported") {
			log.Panicln("Fatal:", err)
		}

		log.Printf("Warning: %s is too old to support healthchecking: %s. Marking it unhealthy.", physicalDeviceID, err)
		// The GPU cannot be monitored, so every device backed by it (not just
		// the first one encountered) has to be reported.
		for _, v := range devs {
			if getPhysicalDeviceID(v.ID) != physicalDeviceID {
				continue
			}
			if !report(v) {
				return
			}
		}
	}

	for {
		select {
		case <-ctx.Done():
			return
		default:
		}

		// The wait is bounded (5000, presumably milliseconds as in NVML's
		// nvmlEventSetWait) so that ctx cancellation is noticed periodically.
		e, err := nvml.WaitForEvent(eventSet, 5000)
		// Skip timeouts/errors as well as any event that is not an XID
		// critical error; only a successfully received XID event is examined.
		if err != nil || e.Etype != nvml.XidCriticalError {
			continue
		}

		// FIXME: formalize the full list and document it.
		// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
		// Application errors: the GPU should still be healthy
		switch e.Edata {
		case 31, 43, 45:
			continue
		}

		if e.UUID == nil || len(*e.UUID) == 0 {
			// The event cannot be attributed to a GPU: all devices are unhealthy.
			log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata)
			for _, d := range devs {
				if !report(d) {
					return
				}
			}
			continue
		}

		// Events were registered with physical GPU IDs, so the reported UUID
		// identifies a physical GPU. Match devices either directly by ID or
		// through the physical GPU that backs them.
		uuid := *e.UUID
		for _, d := range devs {
			if d.ID != uuid && getPhysicalDeviceID(d.ID) != uuid {
				continue
			}
			log.Printf("XidCriticalError: Xid=%d on GPU=%s, the device will go unhealthy.", e.Edata, d.ID)
			if !report(d) {
				return
			}
		}
	}
}