in components/otelopscol/receiver/nvmlreceiver/client.go [354:409]
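// collectProcessMetrics gathers per-process accounting metrics from every
// NVML device that has accounting mode enabled. It returns nil when the
// client has been disabled.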
func (client *nvmlClient) collectProcessMetrics() []processMetric {
	if client.disable {
		return nil
	}
	processMetrics := make([]processMetric, 0)
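	// Walk every detected GPU; skip devices where accounting mode is not enabled.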
	for gpuIndex, device := range client.devices {
		if !client.deviceToAccountingIsEnabled[device] {
			continue
		}
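		// NVML returns the PIDs it is tracking for accounting, which can include
		// recently terminated processes as well as running ones.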
		pids, ret := nvmlDeviceGetAccountingPids(device)
		if ret != nvml.SUCCESS {
			msg := fmt.Sprintf("Unable to query cached PIDs on '%v'", nvml.ErrorString(ret))
			client.issueWarningForFailedQueryUptoThreshold(gpuIndex, "nvml.processes", msg)
			continue
		}
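		// Query per-process accounting stats for each tracked PID.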
		for _, pid := range pids {
			metricName := fmt.Sprintf("nvml.processes{pid=%d}", pid)
			stats, ret := nvml.DeviceGetAccountingStats(device, uint32(pid))
			if ret != nvml.SUCCESS {
				msg := fmt.Sprintf("Unable to query pid %d accounting statistics on '%v'", pid, nvml.ErrorString(ret))
				client.issueWarningForFailedQueryUptoThreshold(gpuIndex, metricName, msg)
				continue
			}
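			// Only report processes that are still running; stats for exited
			// processes stay in NVML's accounting cache.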
			if stats.IsRunning != 1 {
				continue
			}
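			// GpuUtilization and MaxMemoryUsage are lifetime aggregates for the
			// process, hence the lifetime* field names on processMetric.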
			metric := processMetric{
				time:                   time.Now(),
				processPid:             pid,
				gpuIndex:               uint(gpuIndex),
				lifetimeGpuUtilization: uint64(stats.GpuUtilization),
				lifetimeGpuMaxMemory:   stats.MaxMemoryUsage,
			}
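			// Attach process metadata (owner, command line). A failure here is
			// reported as a warning but the metric is still emitted.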
			err := metric.setMetadataLabels()
			if err != nil {
				metricName := fmt.Sprintf("nvml.processes{pid=%d}.metadata", metric.processPid)
				client.issueWarningForFailedQueryUptoThreshold(int(metric.gpuIndex), metricName, err.Error())
			}
			processMetrics = append(processMetrics, metric)
			client.logger.Debugf("Found pid %d (owner %s, command %s) has used Nvidia device %d\n",
				metric.processPid, metric.owner, metric.commandLine, metric.gpuIndex)
		}
	}
	return processMetrics
}