in components/otelopscol/receiver/dcgmreceiver/client.go [73:121]
// newClient initializes DCGM for settings.endpoint and returns a dcgmClient
// configured to watch the requested fields.
//
// The requested fields (settings.fields) are filtered against the profiling
// fields reported by getSupportedProfilingFields; every field that does not
// survive the filter is logged as a warning. When at least one field remains,
// the supported GPU devices are discovered, placed in a device group, and
// watches are set on the enabled fields at settings.pollingInterval. When no
// field remains, the client is returned with zero-value group handles.
//
// Errors:
//   - ErrDcgmInitialization joined with the underlying error when
//     initializeDcgm fails.
//   - A wrapped error when device discovery, device group creation, or
//     setting the field watches fails.
func newClient(settings *dcgmClientSettings, logger *zap.Logger) (*dcgmClient, error) {
	dcgmCleanup, err := initializeDcgm(settings.endpoint, logger)
	if err != nil {
		return nil, errors.Join(ErrDcgmInitialization, err)
	}
	// NOTE(review): the error returns below never invoke dcgmCleanup, so a
	// successfully initialized DCGM handle is presumably leaked on those
	// paths. Confirm dcgmCleanup's type (it is stored in handleCleanup) and
	// call it before returning an error.

	sugar := logger.Sugar()

	enabledFieldGroup := dcgm.FieldHandle{}
	requestedFieldIDs := toFieldIDs(settings.fields)
	supportedProfilingFieldIDs, err := getSupportedProfilingFields()
	if err != nil {
		// If there is error querying the supported fields at all, let the
		// receiver collect basic metrics: (GPU utilization, used/free memory).
		// %v rather than %w: the wrapping verb is only understood by
		// fmt.Errorf and would otherwise be rendered as "%!w(...)".
		sugar.Warnf("Error querying supported profiling fields on '%v'. GPU profiling metrics will not be collected.", err)
	}
	enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedProfilingFieldIDs)
	for _, f := range unavailableFields {
		sugar.Warnf("Field '%s' is not supported", dcgmIDToName[f])
	}

	var deviceGroup dcgm.GroupHandle
	// Device discovery and watches are only needed when there is something
	// to watch.
	if len(enabledFields) != 0 {
		supportedDeviceIndices, err := dcgm.GetSupportedDevices()
		if err != nil {
			return nil, fmt.Errorf("Unable to discover supported GPUs on %w", err)
		}
		sugar.Infof("Discovered %d supported GPU devices", len(supportedDeviceIndices))

		deviceGroup, err = createDeviceGroup(logger, supportedDeviceIndices)
		if err != nil {
			return nil, err
		}

		enabledFieldGroup, err = setWatchesOnEnabledFields(settings.pollingInterval, logger, deviceGroup, enabledFields)
		if err != nil {
			// Best-effort teardown of the field group; the destroy error is
			// deliberately dropped because the watch error is the one reported.
			// NOTE(review): deviceGroup is not destroyed here — confirm
			// whether it also needs to be released on this path.
			_ = dcgm.FieldGroupDestroy(enabledFieldGroup)
			return nil, fmt.Errorf("Unable to set field watches on %w", err)
		}
	}

	return &dcgmClient{
		logger:                         sugar,
		handleCleanup:                  dcgmCleanup,
		enabledFieldIDs:                enabledFields,
		enabledFieldGroup:              enabledFieldGroup,
		deviceGroup:                    deviceGroup,
		devices:                        map[uint]deviceMetrics{},
		lastSuccessfulPoll:             time.Now(),
		deviceMetricToFailedQueryCount: make(map[string]int),
		pollingInterval:                settings.pollingInterval,
		retryBlankValues:               settings.retryBlankValues,
		maxRetries:                     settings.maxRetries,
	}, nil
}