in pkg/gpu/nvidia/metrics/metrics.go [208:239]
func (m *MetricServer) updateMetrics(containerDevices map[ContainerID][]string, gpuDevices map[string]*nvml.Device) {
	m.resetMetricsIfNeeded()
	// Per-container metrics: record how many GPUs each container requested and,
	// for every device allocated to it, the device's duty cycle and memory usage.
	for container, devices := range containerDevices {
		AcceleratorRequests.WithLabelValues(container.namespace, container.pod, container.container, gpuResourceName).Set(float64(len(devices)))
		for _, device := range devices {
			d, err := gmc.collectGPUDevice(device)
			if err != nil {
				glog.Errorf("Failed to get device for %s: %v", device, err)
				continue
			}
			mi, err := gmc.collectGpuMetricsInfo(device, d)
			if err != nil {
				glog.Infof("Error calculating duty cycle for device: %s: %v. Skipping this device", device, err)
				continue
			}
			DutyCycle.WithLabelValues(container.namespace, container.pod, container.container, "nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.dutyCycle))
			MemoryTotal.WithLabelValues(container.namespace, container.pod, container.container, "nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.totalMemory)) // memory reported in bytes
			MemoryUsed.WithLabelValues(container.namespace, container.pod, container.container, "nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.usedMemory))   // memory reported in bytes
		}
	}
	// Node-level metrics: report duty cycle and memory usage for every GPU on the
	// node, whether or not it is currently allocated to a container.
	for device, d := range gpuDevices {
		mi, err := gmc.collectGpuMetricsInfo(device, d)
		if err != nil {
			glog.Infof("Error calculating duty cycle for device: %s: %v. Skipping this device", device, err)
			continue
		}
		DutyCycleNodeGpu.WithLabelValues("nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.dutyCycle))
		MemoryTotalNodeGpu.WithLabelValues("nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.totalMemory)) // memory reported in bytes
		MemoryUsedNodeGpu.WithLabelValues("nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.usedMemory))   // memory reported in bytes
	}
}
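
The gauge vectors used above (AcceleratorRequests, DutyCycle, MemoryTotal, their NodeGpu counterparts, and so on) are defined elsewhere in this package. As a minimal sketch of what such declarations could look like, assuming prometheus/client_golang and label sets shaped to match the WithLabelValues calls above (the actual metric names, help strings, and label names in the package may differ):

package metrics

import "github.com/prometheus/client_golang/prometheus"

// Hypothetical declarations mirroring the WithLabelValues arity used in
// updateMetrics; the real definitions live elsewhere in this package.
var (
	AcceleratorRequests = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "request", Help: "Number of accelerator devices requested by the container."},
		[]string{"namespace", "pod", "container", "resource_name"},
	)
	DutyCycle = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "duty_cycle", Help: "Percent of time the accelerator was actively processing, per container."},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"},
	)
	DutyCycleNodeGpu = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "duty_cycle_gpu_node", Help: "Percent of time the accelerator was actively processing, per GPU on the node."},
		[]string{"make", "accelerator_id", "model"},
	)
)

func init() {
	// Registration would normally happen once, alongside the other gauge vectors.
	prometheus.MustRegister(AcceleratorRequests, DutyCycle, DutyCycleNodeGpu)
}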
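
For context on how this method might be driven, here is a minimal sketch of a periodic refresh loop, assuming the standard library's time package and this package's existing types. getDevicePods and getAllGpuDevices are hypothetical helpers standing in for however the server actually learns about container-to-device assignments and the node's GPUs; the real MetricServer's scheduling may differ.

// Sketch only: periodically rebuild the device maps and refresh the gauges.
func (m *MetricServer) run(interval time.Duration, stop <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			containerDevices := getDevicePods()  // hypothetical: map[ContainerID][]string
			gpuDevices := getAllGpuDevices()     // hypothetical: map[string]*nvml.Device
			m.updateMetrics(containerDevices, gpuDevices)
		case <-stop:
			return
		}
	}
}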