func updateMetrics()

in pkg/gpu/nvidia/metrics/metrics.go [208:239]


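// updateMetrics refreshes the exported gauges: per-container accelerator
// request counts, duty cycle, and memory usage for each allocated device,
// plus node-level duty cycle and memory usage for every GPU on the node.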
func (m *MetricServer) updateMetrics(containerDevices map[ContainerID][]string, gpuDevices map[string]*nvml.Device) {
	m.resetMetricsIfNeeded()
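	// Per-container metrics: record how many GPUs each container requested and,
	// for each device allocated to it, the device's duty cycle and memory usage.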
	for container, devices := range containerDevices {
		AcceleratorRequests.WithLabelValues(container.namespace, container.pod, container.container, gpuResourceName).Set(float64(len(devices)))
		for _, device := range devices {
			d, err := gmc.collectGPUDevice(device)
			if err != nil {
				glog.Errorf("Failed to get device for %s: %v", device, err)
				continue
			}
			mi, err := gmc.collectGpuMetricsInfo(device, d)
			if err != nil {
				glog.Infof("Error calculating duty cycle for device: %s: %v. Skipping this device", device, err)
				continue
			}
			DutyCycle.WithLabelValues(container.namespace, container.pod, container.container, "nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.dutyCycle))
			MemoryTotal.WithLabelValues(container.namespace, container.pod, container.container, "nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.totalMemory)) // memory reported in bytes
			MemoryUsed.WithLabelValues(container.namespace, container.pod, container.container, "nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.usedMemory))   // memory reported in bytes
		}
	}
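	// Node-level metrics: record duty cycle and memory usage for every GPU on the
	// node, whether or not it is currently allocated to a container.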
	for device, d := range gpuDevices {
		mi, err := gmc.collectGpuMetricsInfo(device, d)
		if err != nil {
			glog.Infof("Error calculating duty cycle for device: %s: %v. Skipping this device", device, err)
			continue
		}

		DutyCycleNodeGpu.WithLabelValues("nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.dutyCycle))
		MemoryTotalNodeGpu.WithLabelValues("nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.totalMemory)) // memory reported in bytes
		MemoryUsedNodeGpu.WithLabelValues("nvidia", mi.uuid, mi.deviceModel).Set(float64(mi.usedMemory))   // memory reported in bytes
	}
}