components/otelopscol/receiver/dcgmreceiver/util.go (123 lines of code) (raw):

// Copyright 2023 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build gpu // +build gpu package dcgmreceiver import ( "fmt" "github.com/NVIDIA/go-dcgm/pkg/dcgm" ) // For each metric, we need to track: type metricStats struct { // Timestamp (µs) // Last value (for gauge metrics), as int64 or double lastFieldValue *dcgm.FieldValue_v2 // Integrated rate (always int), as {unit-seconds,unit-microseconds} // This is intended for metrics that have a per-second unit, such as By/s. // The metric value is multiplied by the timestamp delta, producing us.By/s in integratedRateMicroseconds // When that overflows past 1e6, the overflow is put in integratedRateSeconds, which is in units of s.By/s, or just By. integratedRateSeconds int64 integratedRateMicroseconds int64 // Cumulative value (always int) initialCumulativeValue int64 cumulativeValue int64 } func asInt64(fieldValue dcgm.FieldValue_v2) (int64, bool) { // TODO: dcgm's Float64 and Int64 use undefined behavior switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: return int64(fieldValue.Float64()), true case dcgm.DCGM_FT_INT64: return fieldValue.Int64(), true } return 0, false } func asFloat64(fieldValue dcgm.FieldValue_v2) (float64, bool) { switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: return fieldValue.Float64(), true case dcgm.DCGM_FT_INT64: return float64(fieldValue.Int64()), true } return 0, false } func (m *metricStats) Update(fieldValue dcgm.FieldValue_v2) { ts := fieldValue.Ts intValue, intOk := asInt64(fieldValue) if !intOk { return } if m.lastFieldValue == nil { m.initialCumulativeValue = intValue } else { if m.lastFieldValue.Ts >= ts { return } m.cumulativeValue = intValue - m.initialCumulativeValue tsDelta := ts - m.lastFieldValue.Ts if fieldValue.FieldType == dcgm.DCGM_FT_DOUBLE { m.integratedRateMicroseconds += int64(float64(tsDelta) * fieldValue.Float64()) } else { m.integratedRateMicroseconds += tsDelta * intValue } m.integratedRateSeconds += m.integratedRateMicroseconds / 1000000 m.integratedRateMicroseconds %= 1000000 } m.lastFieldValue = &fieldValue } type MetricsMap map[string]*metricStats func (m MetricsMap) LastFloat64(name string) (float64, bool) { if metric, ok := m[name]; ok && metric.lastFieldValue != nil { return asFloat64(*metric.lastFieldValue) } return 0, false } func (m MetricsMap) LastInt64(name string) (int64, bool) { if metric, ok := m[name]; ok && metric.lastFieldValue != nil { return asInt64(*metric.lastFieldValue) } return 0, false } func (m MetricsMap) IntegratedRate(name string) (int64, bool) { if metric, ok := m[name]; ok { return metric.integratedRateSeconds, true } return 0, false } func (m MetricsMap) CumulativeTotal(name string) (int64, bool) { if metric, ok := m[name]; ok { return metric.cumulativeValue, true } return 0, false } var ( errBlankValue = fmt.Errorf("unspecified blank value") errDataNotFound = fmt.Errorf("data not found") errNotSupported = fmt.Errorf("field not supported") errPermissionDenied = fmt.Errorf("no permission to fetch value") errUnexpectedType = fmt.Errorf("unexpected data type") ) func isValidValue(fieldValue dcgm.FieldValue_v2) error { switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: switch v := fieldValue.Float64(); v { case dcgm.DCGM_FT_FP64_BLANK: return errBlankValue case dcgm.DCGM_FT_FP64_NOT_FOUND: return errDataNotFound case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: return errNotSupported case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: return errPermissionDenied } case dcgm.DCGM_FT_INT64: switch v := fieldValue.Int64(); v { case dcgm.DCGM_FT_INT32_BLANK: return errBlankValue case dcgm.DCGM_FT_INT32_NOT_FOUND: return errDataNotFound case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: return errNotSupported case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: return errPermissionDenied case dcgm.DCGM_FT_INT64_BLANK: return errBlankValue case dcgm.DCGM_FT_INT64_NOT_FOUND: return errDataNotFound case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: return errNotSupported case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: return errPermissionDenied } // dcgm.DCGM_FT_STRING also exists but we don't expect it default: return errUnexpectedType } return nil }