pkg/gpu/nvidia/health_check/health_checker.go
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package healthcheck
import (
"fmt"
"strings"
"github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia/util"
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"github.com/golang/glog"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
// GPUHealthChecker checks the health of NVIDIA GPUs. Note that with the current
// device naming pattern in the device manager, GPUHealthChecker will not work with
// MIG devices.
type GPUHealthChecker struct {
devices map[string]pluginapi.Device // devices to monitor, keyed by device name
nvmlDevices map[string]*nvml.Device // NVML handles for the monitored devices
health chan pluginapi.Device // channel on which unhealthy devices are reported
eventSet nvml.EventSet // NVML event set used to receive Xid critical error events
stop chan bool // signals the event-listening goroutine to exit
healthCriticalXid map[uint64]bool // Xid error codes that mark a device unhealthy
}
// NewGPUHealthChecker returns a GPUHealthChecker for the given devices. Unhealthy devices are
// reported on the health channel, and codes lists additional Xid error codes to treat as critical.
func NewGPUHealthChecker(devices map[string]pluginapi.Device, health chan pluginapi.Device, codes []int) *GPUHealthChecker {
hc := &GPUHealthChecker{
devices: make(map[string]pluginapi.Device),
nvmlDevices: make(map[string]*nvml.Device),
health: health,
stop: make(chan bool),
healthCriticalXid: make(map[uint64]bool),
}
// Cloning the device map to avoid interfering with the device manager
for id, d := range devices {
hc.devices[id] = d
}
for _, c := range codes {
glog.Infof("reading code %v", c)
hc.healthCriticalXid[uint64(c)] = true
}
// By default, we check for the Double Bit ECC Error (Xid 48)
hc.healthCriticalXid[48] = true
return hc
}
// Start registers NVML events and starts listening to them
func (hc *GPUHealthChecker) Start() error {
glog.Info("Starting GPU Health Checker")
for name, device := range hc.devices {
glog.Infof("Healthchecker receives device %s, device %v+", name, device)
}
// Build a mapping between device IDs and their NVML representations
count, err := nvml.GetDeviceCount()
if err != nil {
return fmt.Errorf("failed to get device count: %s", err)
}
glog.Infof("Found %d GPU devices", count)
for i := uint(0); i < count; i++ {
device, err := nvml.NewDeviceLite(i)
if err != nil {
return fmt.Errorf("failed to read device with index %d: %v", i, err)
}
deviceName, err := util.DeviceNameFromPath(device.Path)
if err != nil {
glog.Errorf("Invalid GPU device path found: %s. Skipping this device", device.Path)
continue
}
migEnabled, err := device.IsMigEnabled()
if err != nil {
glog.Errorf("Error checking if MIG is enabled on device %s. Skipping this device. Error: %v", deviceName, err)
continue
}
if migEnabled {
if err := hc.addMigEnabledDevice(deviceName, device); err != nil {
glog.Errorf("Failed to add MIG-enabled device %s for health check. Skipping this device. Error: %v", deviceName, err)
continue
}
} else {
hc.addDevice(deviceName, device)
}
}
hc.eventSet = nvml.NewEventSet()
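// Register each monitored device for Xid critical error events. For MIG devices,
// events are registered on the parent GPU's UUID.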
for _, d := range hc.nvmlDevices {
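// If the UUID does not parse as a MIG device UUID, fall back to using it as a full GPU UUID.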
gpu, _, _, err := nvml.ParseMigDeviceUUID(d.UUID)
if err != nil {
gpu = d.UUID
}
glog.Infof("Registering device %v. UUID: %s", d.Path, d.UUID)
err = nvml.RegisterEventForDevice(hc.eventSet, nvml.XidCriticalError, gpu)
if err != nil {
if strings.HasSuffix(err.Error(), "Not Supported") {
glog.Warningf("Warning: %s is too old to support healthchecking: %v. It will always be marked healthy.", d.Path, err)
continue
} else {
return fmt.Errorf("failed to register device %s for NVML eventSet: %v", d.Path, err)
}
}
}
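// Listen for NVML events in the background; errors from the listener are logged rather than propagated.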
go func() {
if err := hc.listenToEvents(); err != nil {
glog.Errorf("GPUHealthChecker listenToEvents error: %v", err)
}
}()
return nil
}
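// addDevice records a non-MIG device for health monitoring, provided it is one of the
// devices this checker was created with.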
func (hc *GPUHealthChecker) addDevice(deviceName string, device *nvml.Device) {
if _, ok := hc.devices[deviceName]; !ok {
// Only monitor the devices passed in
glog.Warningf("Ignoring device %s for health check.", deviceName)
return
}
glog.Infof("Found non-mig device %s for health monitoring. UUID: %s", deviceName, device.UUID)
hc.nvmlDevices[deviceName] = device
}
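// addMigEnabledDevice enumerates the MIG devices on a MIG-enabled GPU and records each one
// that this checker was created with for health monitoring.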
func (hc *GPUHealthChecker) addMigEnabledDevice(deviceName string, device *nvml.Device) error {
glog.Infof("HealthChecker detects MIG is enabled on device %s", deviceName)
migs, err := device.GetMigDevices()
if err != nil {
return fmt.Errorf("error getting MIG devices on device %s. err: %v.", deviceName, err)
}
for _, mig := range migs {
gpu, gi, _, err := nvml.ParseMigDeviceUUID(mig.UUID)
if err != nil {
return fmt.Errorf("error parsing MIG UUID on device %s, MIG UUID: %s, error %v", gpu, mig.UUID, err)
}
migDeviceName := fmt.Sprintf("%s/gi%d", deviceName, gi)
if _, ok := hc.devices[migDeviceName]; !ok {
// Only monitor the devices passed in
glog.Warningf("Ignoring device %s for health check.", migDeviceName)
continue
}
glog.Infof("Found mig device %s for health monitoring. UUID: %s", migDeviceName, mig.UUID)
hc.nvmlDevices[migDeviceName] = mig
}
return nil
}
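// callDevice abstracts nvml.ParseMigDeviceUUID behind an interface so catchError can be
// exercised with a fake implementation in tests.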
type callDevice interface {
parseMigDeviceUUID(UUID string) (string, uint, uint, error)
}
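// GPUDevice is the NVML-backed implementation of callDevice.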
type GPUDevice struct{}
func (gd *GPUDevice) parseMigDeviceUUID(UUID string) (string, uint, uint, error) {
return nvml.ParseMigDeviceUUID(UUID)
}
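// catchError inspects a single NVML event and, if it carries a critical Xid error code, marks
// the affected device (or all devices when no UUID is attached to the event) unhealthy.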
func (hc *GPUHealthChecker) catchError(e nvml.Event, cd callDevice) {
// Skip the error if it's not Xid critical
if e.Etype != nvml.XidCriticalError {
glog.Infof("Skip error Xid=%d as it is not Xid Critical", e.Edata)
return
}
// Only marking device unhealthy on Double Bit ECC Error or customer-configured codes
// See https://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
if _, ok := hc.healthCriticalXid[e.Edata]; !ok {
glog.Infof("Health checker is skipping Xid %v error", e.Edata)
return
}
if e.UUID == nil || len(*e.UUID) == 0 {
// All devices are unhealthy
glog.Errorf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata)
for id, d := range hc.devices {
d.Health = pluginapi.Unhealthy
hc.devices[id] = d
hc.health <- d
}
return
}
foundErrorDevice := false
for _, d := range hc.devices {
// Please see https://github.com/NVIDIA/gpu-monitoring-tools/blob/148415f505c96052cb3b7fdf443b34ac853139ec/bindings/go/nvml/nvml.h#L1424
// for the rationale why gi and ci can be set as such when the UUID is a full GPU UUID and not a MIG device UUID.
nd, ok := hc.nvmlDevices[d.ID]
if !ok {
// Guard against devices that were skipped during Start and never added for NVML monitoring.
continue
}
uuid := nd.UUID
gpu, gi, ci, err := cd.parseMigDeviceUUID(uuid)
if err != nil {
gpu = uuid
gi = 0xFFFFFFFF
ci = 0xFFFFFFFF
}
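// The event matches this device when the GPU UUID and both instance IDs agree; for non-MIG
// devices the instance IDs are 0xFFFFFFFF on both sides, per the nvml.h comment above.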
if gpu == *e.UUID && gi == *e.GpuInstanceId && ci == *e.ComputeInstanceId {
glog.Errorf("XidCriticalError: Xid=%d on Device=%s, uuid=%s, the device will go unhealthy.", e.Edata, d.ID, uuid)
d.Health = pluginapi.Unhealthy
hc.devices[d.ID] = d
hc.health <- d
foundErrorDevice = true
}
}
if !foundErrorDevice {
glog.Errorf("XidCriticalError: Xid=%d on unknown device.", e.Edata)
}
}
// listenToEvents listens to events from NVML to detect GPU critical errors
func (hc *GPUHealthChecker) listenToEvents() error {
for {
select {
case <-hc.stop:
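// Closing the stop channel lets the final receive in Stop() return, completing the shutdown handshake.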
close(hc.stop)
return nil
default:
}
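// Wait up to 5000ms for the next NVML event; on timeout or error, loop back and re-check the stop channel.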
e, err := nvml.WaitForEvent(hc.eventSet, 5000)
if err != nil {
continue
}
gd := GPUDevice{}
hc.catchError(e, &gd)
}
}
// Stop deletes the NVML event set and stops the listening goroutine
func (hc *GPUHealthChecker) Stop() {
nvml.DeleteEventSet(hc.eventSet)
hc.stop <- true
<-hc.stop
}
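// The following is a minimal usage sketch, not part of this package's API: it assumes the caller
// (for example, a device plugin) already owns a device map keyed the same way as the device
// manager and a channel on which it consumes health updates. The device name "nvidia0" and the
// extra Xid code 31 are illustrative assumptions only.
//
//	devices := map[string]pluginapi.Device{
//		"nvidia0": {ID: "nvidia0", Health: pluginapi.Healthy},
//	}
//	health := make(chan pluginapi.Device)
//	// Treat Xid 31 (GPU memory page fault) as critical in addition to the default Xid 48.
//	hc := NewGPUHealthChecker(devices, health, []int{31})
//	if err := hc.Start(); err != nil {
//		glog.Errorf("failed to start GPU health checker: %v", err)
//	}
//	go func() {
//		for d := range health {
//			glog.Infof("device %s is now %s", d.ID, d.Health)
//		}
//	}()
//	// On plugin shutdown:
//	hc.Stop()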