func()

in components/otelopscol/receiver/dcgmreceiver/client.go [298:355]


// collect polls the DCGM daemon for the field values recorded since
// client.lastSuccessfulPoll and folds every valid GPU sample into the
// per-device statistics kept in client.devices.
//
// Returns:
//   - (0, nil) without polling when no field IDs are enabled;
//   - (0, err) when the poll itself fails (the failure is also reported via
//     issueWarningForFailedQueryUptoThreshold);
//   - otherwise the larger of the time span covered by the accepted samples
//     and client.pollingInterval*maxKeepSamples.
func (client *dcgmClient) collect() (time.Duration, error) {
	client.logger.Debugf("Polling DCGM daemon for field values")
	if len(client.enabledFieldIDs) == 0 {
		// Make sure we don't try to scrape without a device group (since we don't construct one when there are no enabled fields).
		return 0, nil
	}
	fieldValues, pollTime, err := dcgmGetValuesSince(client.deviceGroup, client.enabledFieldGroup, client.lastSuccessfulPoll)
	if err != nil {
		msg := fmt.Sprintf("Unable to poll DCGM daemon for metrics: %s", err)
		client.issueWarningForFailedQueryUptoThreshold("all-profiling-metrics", maxWarningsForFailedDeviceMetricQuery, msg)
		return 0, err
	}
	client.logger.Debugf("Got %d field values over %s", len(fieldValues), pollTime.Sub(client.lastSuccessfulPoll))
	client.lastSuccessfulPoll = pollTime

	// Track the timestamp range of the samples that pass validation. The
	// sentinels below mean "no sample accepted yet".
	oldestTs := int64(math.MaxInt64)
	newestTs := int64(0)
	for _, fieldValue := range fieldValues {
		// Only GPU-level entities are aggregated here.
		if fieldValue.EntityGroupId != dcgm.FE_GPU {
			continue
		}
		gpuIndex := fieldValue.EntityId
		device, known := client.devices[gpuIndex]
		if !known {
			// First sample seen for this GPU: create its metrics container.
			created, err := newDeviceMetrics(client.logger, gpuIndex)
			if err != nil {
				// NOTE(review): the error is dropped here and the sample is
				// skipped; presumably newDeviceMetrics reports the failure
				// through the logger it is given — confirm.
				continue
			}
			client.devices[gpuIndex] = created
			device = created
		}
		dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)]
		switch err := isValidValue(fieldValue); {
		case err == errBlankValue:
			// Blank values are expected at startup.
			continue
		case err == errNotSupported:
			client.issueWarningForFailedQueryUptoThreshold(dcgmName, 1, fmt.Sprintf("Field '%s' is not supported", dcgmName))
			continue
		case err != nil:
			msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err)
			client.issueWarningForFailedQueryUptoThreshold(fmt.Sprintf("device%d.%s", gpuIndex, dcgmName), maxWarningsForFailedDeviceMetricQuery, msg)
			continue
		}
		if fieldValue.Ts < oldestTs {
			oldestTs = fieldValue.Ts
		}
		if fieldValue.Ts > newestTs {
			newestTs = fieldValue.Ts
		}
		stats, ok := device.Metrics[dcgmName]
		if !ok {
			stats = &metricStats{}
			device.Metrics[dcgmName] = stats
		}
		stats.Update(fieldValue)
	}

	// Timestamps are treated as microseconds. Only compute the span when at
	// least one sample was accepted: otherwise oldestTs is still
	// math.MaxInt64 and the subtraction followed by the multiplication would
	// overflow int64 and yield a meaningless duration.
	var duration time.Duration
	if newestTs >= oldestTs {
		duration = time.Duration(newestTs-oldestTs) * time.Microsecond
	}
	client.logger.Debugf("Successful poll of DCGM daemon returned %v of data", duration)
	// If we did a partial poll, there should be more room in the buffer.
	duration = max(duration, client.pollingInterval*maxKeepSamples)
	return duration, nil
}