# integrations/dcgm/ops_agent_metadata.yaml
# Ops Agent metadata for the NVIDIA DCGM integration: one list entry per
# platform type and metadata version.
platforms:
# GCE entry, metadata version '1' (metric names under dcgm.gpu.profiling.*).
- type: GCE
  launch_stage: GA
  # Quoted so the value stays a string instead of being typed as an integer.
  version: '1'
  install_documentation_url: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia
  agent_requirement:
    # Minimum agent version listed for these metrics: 2.38.0
    # (major.minor.patch).
    metrics_minimum_supported_version:
      major: 2
      minor: 38
      patch: 0
  detections:
  # NOTE(review): presumably the metric whose presence identifies this
  # integration -- confirm against the consumer's schema. The same metric
  # is also the first entry of default_metrics below.
  - characteristic_metric:
      metric_type: workload.googleapis.com/dcgm.gpu.profiling.sm_utilization
  # Metrics listed for this version. Every entry carries the gpu_number,
  # model and uuid labels; some add one extra label, called out inline.
  default_metrics:
  - name: workload.googleapis.com/dcgm.gpu.profiling.sm_utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/dcgm.gpu.profiling.sm_occupancy
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  # Extra label: pipe.
  - name: workload.googleapis.com/dcgm.gpu.profiling.pipe_utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - pipe
    - uuid
  - name: workload.googleapis.com/dcgm.gpu.profiling.dram_utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  # Extra label: direction. Integer-valued gauge.
  - name: workload.googleapis.com/dcgm.gpu.profiling.pcie_traffic_rate
    value_type: INT64
    kind: GAUGE
    labels:
    - direction
    - gpu_number
    - model
    - uuid
  # Extra label: direction. Integer-valued gauge.
  - name: workload.googleapis.com/dcgm.gpu.profiling.nvlink_traffic_rate
    value_type: INT64
    kind: GAUGE
    labels:
    - direction
    - gpu_number
    - model
    - uuid
# GCE entry, metadata version '2' (metric names under gpu.dcgm.*).
- type: GCE
  launch_stage: GA
  # Quoted so the value stays a string instead of being typed as an integer.
  version: '2'
  install_documentation_url: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia
  agent_requirement:
    # Minimum agent version listed for these metrics: 2.51.0
    # (major.minor.patch).
    metrics_minimum_supported_version:
      major: 2
      minor: 51
      patch: 0
  detections:
  # NOTE(review): presumably the metric whose presence identifies this
  # integration -- confirm against the consumer's schema. The same metric
  # also appears in default_metrics below.
  - characteristic_metric:
      metric_type: workload.googleapis.com/gpu.dcgm.memory.bytes_used
  # Metrics listed for this version. Every entry carries the gpu_number,
  # model and uuid labels; some add one extra label, called out inline.
  default_metrics:
  # --- Utilization gauges ---
  - name: workload.googleapis.com/gpu.dcgm.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.sm.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  # Extra label: pipe.
  - name: workload.googleapis.com/gpu.dcgm.pipe.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - pipe
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.codec.encoder.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.codec.decoder.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  # --- Memory ---
  # Extra label: state. Integer-valued gauge.
  - name: workload.googleapis.com/gpu.dcgm.memory.bytes_used
    value_type: INT64
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - state
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.memory.bandwidth_utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  # --- I/O counters (CUMULATIVE, unlike the GAUGE entries above) ---
  # Extra label: direction.
  - name: workload.googleapis.com/gpu.dcgm.pcie.io
    value_type: INT64
    kind: CUMULATIVE
    labels:
    - direction
    - gpu_number
    - model
    - uuid
  # Extra label: direction.
  - name: workload.googleapis.com/gpu.dcgm.nvlink.io
    value_type: INT64
    kind: CUMULATIVE
    labels:
    - direction
    - gpu_number
    - model
    - uuid
  # --- Energy, temperature and clocks ---
  - name: workload.googleapis.com/gpu.dcgm.energy_consumption
    value_type: DOUBLE
    kind: CUMULATIVE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.temperature
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.clock.frequency
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  # Extra label: violation. Cumulative, double-valued.
  - name: workload.googleapis.com/gpu.dcgm.clock.throttle_duration.time
    value_type: DOUBLE
    kind: CUMULATIVE
    labels:
    - gpu_number
    - model
    - uuid
    - violation
  # --- Errors ---
  # Extra label: error_type. Cumulative, integer-valued.
  - name: workload.googleapis.com/gpu.dcgm.ecc_errors
    value_type: INT64
    kind: CUMULATIVE
    labels:
    - error_type
    - gpu_number
    - model
    - uuid