in components/otelopscol/receiver/dcgmreceiver/client.go [298:355]
// collect polls the DCGM daemon for every field value recorded since the last
// successful poll and folds the valid GPU samples into the per-device metric
// stats held in client.devices.
//
// When no fields are enabled it returns (0, nil) without polling. When the
// poll itself fails, a warning is issued through
// issueWarningForFailedQueryUptoThreshold and the error is returned with a
// zero duration.
//
// On success the returned duration is the larger of (a) the time span covered
// by the accepted samples and (b) client.pollingInterval*maxKeepSamples.
func (client *dcgmClient) collect() (time.Duration, error) {
	client.logger.Debugf("Polling DCGM daemon for field values")
	if len(client.enabledFieldIDs) == 0 {
		// Make sure we don't try to scrape without a device group (since we don't construct one when there are no enabled fields).
		return 0, nil
	}
	fieldValues, pollTime, err := dcgmGetValuesSince(client.deviceGroup, client.enabledFieldGroup, client.lastSuccessfulPoll)
	if err != nil {
		msg := fmt.Sprintf("Unable to poll DCGM daemon for metrics: %s", err)
		client.issueWarningForFailedQueryUptoThreshold("all-profiling-metrics", maxWarningsForFailedDeviceMetricQuery, msg)
		return 0, err
	}
	client.logger.Debugf("Got %d field values over %s", len(fieldValues), pollTime.Sub(client.lastSuccessfulPoll))
	client.lastSuccessfulPoll = pollTime

	// Track the timestamp range of the samples that pass validation.
	// sawSample records whether the range was ever updated, so that the
	// sentinel start values are never subtracted from each other.
	oldestTs := int64(math.MaxInt64)
	newestTs := int64(0)
	sawSample := false
	for _, fieldValue := range fieldValues {
		// Only GPU-level entities are recorded; everything else is skipped.
		if fieldValue.EntityGroupId != dcgm.FE_GPU {
			continue
		}
		gpuIndex := fieldValue.EntityId
		device, known := client.devices[gpuIndex]
		if !known {
			newDevice, err := newDeviceMetrics(client.logger, gpuIndex)
			if err != nil {
				// Nothing is cached on failure, so creation is attempted again
				// for the next value seen for this GPU.
				// NOTE(review): newDeviceMetrics receives the logger, so it
				// presumably reports its own failure — confirm.
				continue
			}
			client.devices[gpuIndex] = newDevice
			device = newDevice
		}
		dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)]
		switch err := isValidValue(fieldValue); {
		case err == errBlankValue:
			// Blank values are expected at startup.
			continue
		case err == errNotSupported:
			client.issueWarningForFailedQueryUptoThreshold(dcgmName, 1, fmt.Sprintf("Field '%s' is not supported", dcgmName))
			continue
		case err != nil:
			msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err)
			client.issueWarningForFailedQueryUptoThreshold(fmt.Sprintf("device%d.%s", gpuIndex, dcgmName), maxWarningsForFailedDeviceMetricQuery, msg)
			continue
		}
		sawSample = true
		oldestTs = min(oldestTs, fieldValue.Ts)
		newestTs = max(newestTs, fieldValue.Ts)
		if _, ok := device.Metrics[dcgmName]; !ok {
			device.Metrics[dcgmName] = &metricStats{}
		}
		device.Metrics[dcgmName].Update(fieldValue)
	}

	// Timestamps are converted as microseconds. Without any accepted sample
	// the range is empty: computing newestTs-oldestTs from the sentinels would
	// overflow int64 once scaled, so report a zero span instead.
	var duration time.Duration
	if sawSample {
		duration = time.Duration(newestTs-oldestTs) * time.Microsecond
	}
	client.logger.Debugf("Successful poll of DCGM daemon returned %v of data", duration)
	// If we did a partial poll, there should be more room in the buffer.
	duration = max(duration, client.pollingInterval*maxKeepSamples)
	return duration, nil
}