in components/otelopscol/receiver/nvmlreceiver/client.go [160:209]
// discoverDevices enumerates the Nvidia devices reported by NVML and returns
// three parallel slices: the device handles, their names, and their UUIDs
// (in that order). Entry k of each slice describes the same device.
//
// A device whose handle, UUID or name cannot be queried is logged at warning
// level and left out of all three slices, so the slices may be shorter than
// the NVML device count and their positions need not match NVML indices.
//
// An error is returned when the device count cannot be obtained or when no
// device could be fully queried; in both cases all slices are nil.
func discoverDevices(logger *zap.Logger) ([]nvml.Device, []string, []string, error) {
	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return nil, nil, nil, fmt.Errorf("Unable to get Nvidia device count on '%v'", nvml.ErrorString(ret))
	}

	devices := make([]nvml.Device, 0, count)
	names := make([]string, 0, count)
	UUIDs := make([]string, 0, count)

	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			logger.Sugar().Warnf("Unable to get Nvidia device at index %d on '%v'; ignoring device.", i, nvml.ErrorString(ret))
			continue
		}

		// Note: UUID and Name query should not fail under normal circumstances.
		UUID, ret := device.GetUUID()
		if ret != nvml.SUCCESS {
			logger.Sugar().Warnf("Unable to get UUID of Nvidia device %d on '%v'; ignoring device.", i, nvml.ErrorString(ret))
			continue
		}
		name, ret := device.GetName()
		if ret != nvml.SUCCESS {
			logger.Sugar().Warnf("Unable to get name of Nvidia device %d on '%v'; ignoring device.", i, nvml.ErrorString(ret))
			continue
		}

		// Append to all three slices together so they stay index-aligned.
		devices = append(devices, device)
		UUIDs = append(UUIDs, UUID)
		names = append(names, name)
		logger.Sugar().Infof("Discovered Nvidia device %d of model %s with UUID %s.", i, name, UUID)

		// The MIG check is purely informational: the device has already been
		// accepted above, so no outcome here removes it from the result.
		currMode, _, ret := device.GetMigMode()
		switch {
		case ret == nvml.ERROR_NOT_SUPPORTED:
			// Expected on GPUs without MIG capability; not worth a warning.
			logger.Sugar().Debugf("Nvidia device %d does not support MIG.", i)
		case ret != nvml.SUCCESS:
			logger.Sugar().Warnf("Unable to query MIG mode for Nvidia device %d: %v.", i, nvml.ErrorString(ret))
		case currMode == nvml.DEVICE_MIG_ENABLE:
			logger.Sugar().Warnf("Nvidia device %d has MIG enabled. GPU utilization queries may not be supported.", i)
		}
	}

	if len(devices) == 0 {
		return nil, nil, nil, fmt.Errorf("No supported NVIDIA devices found")
	}
	return devices, names, UUIDs, nil
}