integration_test/third_party_apps_test/applications/dcgmv1/metadata.yaml (147 lines of code) (raw):
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Upstream product page for the monitored application.
app_url: 'https://developer.nvidia.com/dcgm'
short_name: 'NVIDIA DCGM'
long_name: 'NVIDIA Data Center GPU Manager (DCGM)'
# Logo asset; supplied by google technical writer.
logo_path: '/stackdriver/images/integrations/nvidia.png'
# Literal block scalar: the line break between the two lines is part of the
# value, and the "-" indicator strips the trailing newline.
description: |-
  The NVIDIA Data Center GPU Manager (DCGM) integration collects advanced GPU metrics,
  including SM block utilization, Pipe utilization, PCIe and NVLink traffic.
configure_integration: 'You must install DCGM and run the DCGM daemon service.'
supported_operating_systems: linux
# Kept as a quoted string so the version is not parsed as a float.
supported_app_version:
  - '3.1'
# Platforms listed per GPU model.
# p4 and p100 are not listed: they don't emit DCGM profiling metrics.
gpu_platforms:
  - model: a100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: v100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: t4
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: l4
    platforms:
      - debian-cloud:debian-11
      - ml-images:common-gpu-debian-11-py310
      - rocky-linux-cloud:rocky-linux-8
      - rocky-linux-cloud:rocky-linux-9
      - suse-cloud:sles-15
      - ubuntu-os-cloud:ubuntu-2004-lts
      - ubuntu-os-cloud:ubuntu-2204-lts
  - model: h100
    platforms:
      # Due to H100 quota, choose an image from the exhaustive list to skip
      # presubmits.
      - ubuntu-os-cloud:ubuntu-minimal-2004-lts
# Metrics expected from the integration. Label definitions and the shared
# profiling note are anchored on first use (in the sm_utilization entry) and
# aliased by the entries that follow, so all of them must stay in this one
# YAML document.
expected_metrics:
  - type: workload.googleapis.com/dcgm.gpu.profiling.sm_utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.
    monitored_resources: [gce_instance]
    labels:
      - &model_label
        name: model
        value_regex: '.*'
        description: GPU model name.
      - &uuid_label
        name: uuid
        value_regex: '.*'
        description: GPU universally unique identifier.
      - &gpu_number_label
        name: gpu_number
        value_regex: '[0-9]*'
        description: GPU index starting at 0.
      - &instrumentation_source_label
        name: instrumentation_source
        value_regex: agent.googleapis.com/dcgm
    notes:
      - &dcgm_profiling_note Not available on GPU models P100 and P4.
    representative: true
  - type: workload.googleapis.com/dcgm.gpu.profiling.sm_occupancy
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - *instrumentation_source_label
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/dcgm.gpu.profiling.pipe_utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - name: pipe
        value_regex: 'tensor|fp64|fp32|fp16'
        description: 'GPU pipe in use, one of [tensor, fp64, fp32, fp16].'
        # Label-level note; it belongs to the `pipe` label, not to the metric.
        notes:
          - For L4, the `pipe` value `fp64` is not supported.
      - *instrumentation_source_label
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/dcgm.gpu.profiling.dram_utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of cycles data was being sent or received from GPU memory.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - *instrumentation_source_label
    # This is a dcgm.gpu.profiling.* metric like its siblings, so it carries
    # the same P100/P4 availability note as every other profiling metric.
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/dcgm.gpu.profiling.pcie_traffic_rate
    value_type: INT64
    kind: GAUGE
    description: The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.
    unit: By/s
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - &direction_label
        name: direction
        value_regex: 'tx|rx'
        description: 'Direction of the link traffic, one of [tx, rx].'
      - *instrumentation_source_label
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/dcgm.gpu.profiling.nvlink_traffic_rate
    value_type: INT64
    kind: GAUGE
    description: The number of bytes sent over NVLink, not including protocol headers.
    unit: By/s
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - *direction_label
      - *instrumentation_source_label
    notes:
      - *dcgm_profiling_note
# User-facing configuration fields of the `dcgm` metrics receiver.
configuration_options:
  metrics:
    - type: dcgm
      fields:
        - name: type
          # A real YAML null (no default), deliberately left unquoted.
          default: null
          description: 'This value must be `dcgm`.'
        - name: collection_interval
          default: '60s'
          description: 'A [time duration](https://pkg.go.dev/time#ParseDuration) value, such as `30s` or `5m`.'
        - name: receiver_version
          # Quoted so the default stays a string rather than an integer.
          default: '1'
          description: 'Either 1 or 2. Version 2 has many more metrics available.'
        - name: endpoint
          default: 'localhost:5555'
          description: 'The DCGM service endpoint specified as `hostname:port`.'
# Minimum agent version supporting this integration, keyed by telemetry type.
minimum_supported_agent_version:
  metrics: '2.38.0'
# Public documentation page for this integration.
public_url: 'https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia'