integration_test/third_party_apps_test/applications/dcgm/metadata.yaml

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Metadata for the NVIDIA Data Center GPU Manager (DCGM) integration:
# display fields, the GPU models/images listed per platform, the metrics
# that are expected, and the configuration fields of the `dcgm` metrics type.

app_url: "https://developer.nvidia.com/dcgm"
short_name: NVIDIA DCGM
long_name: NVIDIA Data Center GPU Manager (DCGM)
logo_path: /stackdriver/images/integrations/nvidia.png  # supplied by google technical writer
# NOTE(review): the line breaks inside this literal block could not be
# recovered from the collapsed source; the text is kept on one line — confirm.
description: |-
  The NVIDIA Data Center GPU Manager (DCGM) integration collects advanced GPU metrics, including SM block utilization, Pipe utilization, PCIe and NVLink traffic.
configure_integration: |-
  You must install DCGM and run the DCGM daemon service.
supported_operating_systems: linux
supported_app_version: ["3.1"]

# p4, p100 don't emit DCGM profiling metrics
gpu_platforms:
  - model: a100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: v100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: t4
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: l4
    platforms:
      - debian-cloud:debian-11
      - ml-images:common-gpu-debian-11-py310
      - rocky-linux-cloud:rocky-linux-8
      - rocky-linux-cloud:rocky-linux-9
      - suse-cloud:sles-15
      - ubuntu-os-cloud:ubuntu-2004-lts
      - ubuntu-os-cloud:ubuntu-2204-lts
  - model: h100
    platforms:
      # due to H100 quota, choose an image from the exhaustive list to skip presubmits
      - ubuntu-os-cloud:ubuntu-minimal-2004-lts

# Label definitions and the profiling note are anchored where they first
# appear (&model_label, &uuid_label, &gpu_number_label, &direction_label,
# &dcgm_profiling_note) and re-used by later metrics through aliases.
expected_metrics:
  - type: workload.googleapis.com/gpu.dcgm.utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Ratio of time the graphics engine is active.
    monitored_resources: [gce_instance]
    labels:
      - &model_label
        name: model
        value_regex: .*
        description: GPU model name.
      - &uuid_label
        name: uuid
        value_regex: .*
        description: GPU universally unique identifier.
      - &gpu_number_label
        name: gpu_number
        value_regex: "[0-9]*"
        description: GPU index starting at 0.
  - type: workload.googleapis.com/gpu.dcgm.sm.utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
    notes:
      - &dcgm_profiling_note Not available on GPU models P100 and P4.
  - type: workload.googleapis.com/gpu.dcgm.pipe.utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - name: pipe
        value_regex: tensor|fp64|fp32|fp16
        description: "GPU pipe in use, one of [tensor, fp64, fp32, fp16]."
        # Label-level note; the metric-level `notes` follows below.
        notes:
          - For L4, the `pipe` value `fp64` is not supported.
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/gpu.dcgm.codec.encoder.utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Encoder utilization.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
  - type: workload.googleapis.com/gpu.dcgm.codec.decoder.utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Decoder utilization.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
  - type: workload.googleapis.com/gpu.dcgm.memory.bytes_used
    value_type: INT64
    kind: GAUGE
    description: Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space.
    unit: By
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - name: state
        value_regex: free|used|reserved
        description: "GPU memory state, one of [free, used, reserved]."
    representative: true
  - type: workload.googleapis.com/gpu.dcgm.memory.bandwidth_utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of cycles data was being sent or received from GPU memory.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
  - type: workload.googleapis.com/gpu.dcgm.pcie.io
    value_type: INT64
    kind: CUMULATIVE
    description: The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.
    unit: By
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - &direction_label
        name: direction
        value_regex: transmit|receive
        description: "Direction of the link traffic, one of [transmit, receive]."
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/gpu.dcgm.nvlink.io
    value_type: INT64
    kind: CUMULATIVE
    description: The number of bytes sent over NVLink, not including protocol headers.
    unit: By
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - *direction_label
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/gpu.dcgm.energy_consumption
    value_type: DOUBLE
    kind: CUMULATIVE
    description: Total energy consumption for the GPU in J since the driver was last reloaded.
    unit: J
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
  - type: workload.googleapis.com/gpu.dcgm.temperature
    value_type: DOUBLE
    kind: GAUGE
    description: Current temperature readings for the device, in ˚C.
    unit: Cel
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
  - type: workload.googleapis.com/gpu.dcgm.clock.frequency
    value_type: DOUBLE
    kind: GAUGE
    description: Multiprocessor clock frequency.
    unit: Hz
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
  - type: workload.googleapis.com/gpu.dcgm.clock.throttle_duration.time
    value_type: DOUBLE
    kind: CUMULATIVE
    description: Clock throttle total duration.
    unit: s
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - name: violation
        value_regex: power|thermal|sync_boost|board_limit|low_util|reliability|app_clock|base_clock
        description: "Reason for throttling, one of [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock]."
        # NOTE(review): nested under the label by analogy with the `pipe`
        # label's note (it describes `violation` values) — confirm it is not
        # meant to be a metric-level `notes`.
        notes:
          - For P100 and P4, only `violation` values `power`, `thermal`, and `sync_boost` are supported.
  - type: workload.googleapis.com/gpu.dcgm.ecc_errors
    value_type: INT64
    kind: CUMULATIVE
    description: Data corruption errors.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - name: error_type
        value_regex: sbe|dbe
        description: "The type of error, one of [sbe, dbe]."

# Configuration fields for the `dcgm` metrics type, with their defaults.
configuration_options:
  metrics:
    - type: dcgm
      fields:
        - name: type
          default: null
          description: This value must be `dcgm`.
        - name: collection_interval
          default: 60s
          description: A [time duration](https://pkg.go.dev/time#ParseDuration) value, such as `30s` or `5m`.
        - name: receiver_version
          default: "1"
          description: Either 1 or 2. Version 2 has many more metrics available.
        - name: endpoint
          default: localhost:5555
          description: The DCGM service endpoint specified as `hostname:port`.

minimum_supported_agent_version:
  metrics: 2.38.0
public_url: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia

integration_test/third_party_apps_test/applications/dcgm/metadata.yaml (225 lines of code) (raw):