# components/otelopscol/receiver/dcgmreceiver/metadata.yaml
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Component identity: a component of type "dcgm".
type: dcgm

# Component class and the stability level declared per signal.
status:
  class: receiver
  stability:
    # Metrics is the only signal listed, at beta stability.
    beta:
      - metrics
# Resource attributes describing the GPU. All three are string-typed and
# enabled.
resource_attributes:
  gpu.number:
    description: GPU index starting at 0.
    type: string
    enabled: true

  gpu.uuid:
    description: GPU universally unique identifier.
    type: string
    enabled: true

  gpu.model:
    description: GPU model name.
    type: string
    enabled: true
# Metric attributes. Each one is referenced by name from an `attributes` list
# of at least one entry under `metrics`.
attributes:
  # Used by gpu.dcgm.memory.bytes_used.
  gpu.memory.state:
    description: GPU memory state, one of [free, used, reserved].
    type: string
    enum:
      - used
      - free
      - reserved

  # Used by gpu.dcgm.pipe.utilization.
  gpu.pipe:
    description: GPU pipe in use, one of [tensor, fp64, fp32, fp16].
    type: string
    enum:
      - tensor
      - fp64
      - fp32
      - fp16

  # Used by gpu.dcgm.pcie.io and gpu.dcgm.nvlink.io.
  network.io.direction:
    description: Direction of the link traffic, one of [transmit, receive].
    type: string
    enum:
      - transmit
      - receive

  # Used by gpu.dcgm.clock.throttle_duration.time.
  gpu.clock.violation:
    # Folded scalar: the lines below join with single spaces into one line.
    description: >-
      Reason for throttling, one of [power, thermal, sync_boost, board_limit,
      low_util, reliability, app_clock, base_clock].
    type: string
    enum:
      - power
      - thermal
      - sync_boost
      - board_limit
      - low_util
      - reliability
      - app_clock
      - base_clock

  # Used by gpu.dcgm.ecc_errors.
  gpu.error.type:
    description: The type of error, one of [sbe, dbe].
    type: string
    enum:
      - sbe
      - dbe

  # Used by gpu.dcgm.xid_errors. The only int-typed attribute; it has no enum.
  gpu.error.xid:
    description: The XID code for the error, 1..143.
    type: int
# Metric definitions. Every entry states whether it is enabled, its
# description and unit, its instrument (gauge, or cumulative monotonic sum)
# with the value type, and any attributes it is broken down by.
# Descriptions written with `>-` are folded scalars: their lines join with
# single spaces into one line with no trailing newline.
metrics:
  # --- Utilization and occupancy ratios: double gauges with unit "1". ---
  gpu.dcgm.utilization:
    enabled: true
    description: Ratio of time the graphics engine is active.
    unit: "1"
    gauge:
      value_type: double

  gpu.dcgm.sm.utilization:
    enabled: true
    description: >-
      Fraction of time at least one warp was active on a multiprocessor,
      averaged over all multiprocessors.
    unit: "1"
    gauge:
      value_type: double

  # Not enabled, unlike the other ratios in this group.
  gpu.dcgm.sm.occupancy:
    enabled: false
    description: >-
      Fraction of the number of warps resident on a multiprocessor, averaged
      over all multiprocessors.
    unit: "1"
    gauge:
      value_type: double

  # One series per gpu.pipe value (tensor, fp64, fp32, fp16).
  gpu.dcgm.pipe.utilization:
    enabled: true
    description: >-
      Fraction of cycles the corresponding GPU pipe was active, averaged over
      time and all multiprocessors.
    unit: "1"
    gauge:
      value_type: double
    attributes:
      - gpu.pipe

  gpu.dcgm.codec.encoder.utilization:
    enabled: true
    description: Encoder utilization.
    unit: "1"
    gauge:
      value_type: double

  gpu.dcgm.codec.decoder.utilization:
    enabled: true
    description: Decoder utilization.
    unit: "1"
    gauge:
      value_type: double

  # --- Memory. ---
  # Integer gauge in bytes, one series per gpu.memory.state value.
  gpu.dcgm.memory.bytes_used:
    enabled: true
    description: >-
      Current number of GPU memory bytes used by state. Summing the values of
      all states yields the total GPU memory space.
    unit: By
    gauge:
      value_type: int
    attributes:
      - gpu.memory.state

  gpu.dcgm.memory.bandwidth_utilization:
    enabled: true
    description: >-
      Fraction of cycles data was being sent or received from GPU memory.
    unit: "1"
    gauge:
      value_type: double

  # --- Link traffic: cumulative monotonic integer byte counters, split by
  # network.io.direction. ---
  # NOTE(review): the description says "sent", yet the metric is split by
  # network.io.direction (transmit/receive) - confirm the intended wording.
  gpu.dcgm.pcie.io:
    enabled: true
    description: >-
      The number of bytes sent over the PCIe bus, including both protocol
      headers and data payloads.
    unit: By
    sum:
      value_type: int
      aggregation_temporality: cumulative
      monotonic: true
    attributes:
      - network.io.direction

  # NOTE(review): same "sent" vs. transmit/receive wording question as
  # gpu.dcgm.pcie.io - confirm.
  gpu.dcgm.nvlink.io:
    enabled: true
    description: >-
      The number of bytes sent over NVLink, not including protocol headers.
    unit: By
    sum:
      value_type: int
      aggregation_temporality: cumulative
      monotonic: true
    attributes:
      - network.io.direction

  # --- Power, thermal and clock. ---
  # Cumulative monotonic double counter in joules.
  gpu.dcgm.energy_consumption:
    enabled: true
    description: >-
      Total energy consumption for the GPU in J since the driver was last
      reloaded.
    unit: J
    sum:
      value_type: double
      aggregation_temporality: cumulative
      monotonic: true

  # NOTE(review): the degree symbol in the description looks like U+02DA
  # (ring above) rather than U+00B0 (degree sign) - verify before changing.
  gpu.dcgm.temperature:
    enabled: true
    description: Current temperature readings for the device, in ˚C.
    unit: Cel
    gauge:
      value_type: double

  gpu.dcgm.clock.frequency:
    enabled: true
    description: Multiprocessor clock frequency.
    unit: Hz
    gauge:
      value_type: double

  # Cumulative monotonic seconds, one series per gpu.clock.violation value.
  gpu.dcgm.clock.throttle_duration.time:
    enabled: true
    description: Clock throttle total duration.
    unit: s
    sum:
      value_type: double
      aggregation_temporality: cumulative
      monotonic: true
    attributes:
      - gpu.clock.violation

  # --- Error counters: cumulative monotonic integer sums with unit "1". ---
  # One series per gpu.error.type value (sbe, dbe).
  gpu.dcgm.ecc_errors:
    enabled: true
    description: Data corruption errors.
    unit: "1"
    sum:
      value_type: int
      aggregation_temporality: cumulative
      monotonic: true
    attributes:
      - gpu.error.type

  # Not enabled. Broken down by the integer gpu.error.xid attribute.
  gpu.dcgm.xid_errors:
    enabled: false
    description: XID errors.
    unit: "1"
    sum:
      value_type: int
      aggregation_temporality: cumulative
      monotonic: true
    attributes:
      - gpu.error.xid