func watchXIDs()

in pkg/gpu/nvidia/nvidia.go [105:167]


// watchXIDs subscribes to XID critical-error events for the physical GPU behind
// every entry of devs and reports affected devices on xids until ctx is
// cancelled.
//
// devs may contain several virtual devices that map to the same physical GPU
// (as resolved by getPhysicalDeviceID); each physical GPU is registered with
// NVML only once. Whenever a physical GPU is considered unhealthy, every
// virtual device in devs that maps to it is sent on xids.
//
// Sends on xids block until the receiver accepts them. A registration error
// other than "Not Supported" is fatal and panics via log.Panicln.
func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
	eventSet := nvml.NewEventSet()
	defer nvml.DeleteEventSet(eventSet)

	var physicalDeviceIDs []string
	// unsupported records physical GPUs whose driver rejected event
	// registration, so that later virtual devices backed by the same GPU are
	// reported unhealthy as well instead of being silently skipped.
	unsupported := make(map[string]bool)

	// We don't have to register every virtual GPU here; one registration per
	// physical GPU is enough.
	for _, d := range devs {
		physicalDeviceID := getPhysicalDeviceID(d.ID)
		if physicialDeviceExists(physicalDeviceIDs, physicalDeviceID) {
			if unsupported[physicalDeviceID] {
				xids <- d
			}
			continue
		}
		physicalDeviceIDs = append(physicalDeviceIDs, physicalDeviceID)

		log.Printf("virtual id %s physical id %s", d.ID, physicalDeviceID)
		err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, physicalDeviceID)
		if err != nil && strings.HasSuffix(err.Error(), "Not Supported") {
			log.Printf("Warning: %s is too old to support healthchecking: %s. Marking it unhealthy.", physicalDeviceID, err)

			unsupported[physicalDeviceID] = true
			xids <- d
			continue
		}

		if err != nil {
			log.Panicln("Fatal:", err)
		}
	}

	for {
		// Non-blocking cancellation check between waits.
		select {
		case <-ctx.Done():
			return
		default:
		}

		// NOTE(review): 5000 is presumably a timeout in milliseconds — confirm
		// against the nvml binding in use.
		e, err := nvml.WaitForEvent(eventSet, 5000)
		// Skip the iteration when the wait failed (e is not meaningful then)
		// or when the event is not a critical XID error.
		if err != nil || e.Etype != nvml.XidCriticalError {
			continue
		}

		// FIXME: formalize the full list and document it.
		// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
		// Application errors: the GPU should still be healthy
		if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 {
			continue
		}

		if e.UUID == nil || len(*e.UUID) == 0 {
			// The event is not tied to a specific GPU: all devices are unhealthy.
			log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata)
			for _, d := range devs {
				xids <- d
			}
			continue
		}

		// Events were registered with physical IDs, so compare the event UUID
		// against the physical ID of each (possibly virtual) device.
		for _, d := range devs {
			if getPhysicalDeviceID(d.ID) == *e.UUID {
				log.Printf("XidCriticalError: Xid=%d on GPU=%s, the device will go unhealthy.", e.Edata, d.ID)
				xids <- d
			}
		}
	}
}