func()

in components/otelopscol/receiver/nvmlreceiver/client.go [354:409]


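// collectProcessMetrics gathers per-process GPU accounting metrics for every
// device that has accounting mode enabled. It returns nil when the client is
// disabled.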
func (client *nvmlClient) collectProcessMetrics() []processMetric {
	if client.disable {
		return nil
	}

	processMetrics := make([]processMetric, 0)

	for gpuIndex, device := range client.devices {
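		// Per-process statistics are only available on devices with accounting mode enabled.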
		if !client.deviceToAccountingIsEnabled[device] {
			continue
		}

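		// Ask the driver for the PIDs it currently has accounting statistics cached for.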
		pids, ret := nvmlDeviceGetAccountingPids(device)
		if ret != nvml.SUCCESS {
			msg := fmt.Sprintf("Unable to query cached PIDs on '%v", nvml.ErrorString(ret))
			client.issueWarningForFailedQueryUptoThreshold(gpuIndex, "nvml.processes", msg)
			continue
		}

		for _, pid := range pids {
			metricName := fmt.Sprintf("nvml.processes{pid=%d}", pid)

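			// Fetch the lifetime accounting statistics for this PID on this device.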
			stats, ret := nvml.DeviceGetAccountingStats(device, uint32(pid))
			if ret != nvml.SUCCESS {
				msg := fmt.Sprintf("Unable to query pid %d account statistics on '%v", pid, nvml.ErrorString(ret))
				client.issueWarningForFailedQueryUptoThreshold(gpuIndex, metricName, msg)
				continue
			}

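			// Skip processes that are no longer running on the GPU.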
			if stats.IsRunning != 1 {
				continue
			}

			metric := processMetric{
				time:                   time.Now(),
				processPid:             pid,
				gpuIndex:               uint(gpuIndex),
				lifetimeGpuUtilization: uint64(stats.GpuUtilization),
				lifetimeGpuMaxMemory:   stats.MaxMemoryUsage,
			}

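			// Attach process metadata (owner, command line); a failure here is
			// reported as a warning but does not drop the metric.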
			err := metric.setMetadataLabels()
			if err != nil {
				metricName := fmt.Sprintf("nvml.processes{pid=%d}.metadata", metric.processPid)
				client.issueWarningForFailedQueryUptoThreshold(int(metric.gpuIndex), metricName, err.Error())
			}

			processMetrics = append(processMetrics, metric)

			client.logger.Debugf("Found pid %d (owner %s command %s) has used Nvidia device %d\n",
				metric.processPid, metric.owner, metric.commandLine, metric.gpuIndex)
		}
	}

	return processMetrics
}
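
For reference, the shape of the processMetric value assembled above can be sketched as follows. This is a reconstruction from the assignments and the Debugf call, not the receiver's actual declaration; in particular the owner and commandLine fields and the processPid type are assumptions.

package nvmlreceiver

import "time"

// processMetric is a minimal sketch of the per-process sample produced by
// collectProcessMetrics; field names and types are inferred from usage.
type processMetric struct {
	time                   time.Time
	processPid             int    // assumed: PID as returned by the accounting query
	gpuIndex               uint
	lifetimeGpuUtilization uint64
	lifetimeGpuMaxMemory   uint64
	owner                  string // assumed: populated by setMetadataLabels
	commandLine            string // assumed: populated by setMetadataLabels
}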