# integration_test/third_party_apps_test/applications/dcgmv1/metadata.yaml (147 lines of code) (raw)

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Metadata for the NVIDIA DCGM integration (receiver version 1): display
# fields, GPU model/image matrix, expected metrics, and configuration options.

app_url: "https://developer.nvidia.com/dcgm"
short_name: NVIDIA DCGM
long_name: NVIDIA Data Center GPU Manager (DCGM)
logo_path: /stackdriver/images/integrations/nvidia.png  # supplied by google technical writer
description: |-
  The NVIDIA Data Center GPU Manager (DCGM) integration collects advanced GPU metrics, including SM block utilization, Pipe utilization, PCIe and NVLink traffic.
configure_integration: |-
  You must install DCGM and run the DCGM daemon service.
supported_operating_systems: linux
supported_app_version: ["3.1"]

# GPU models paired with the OS images listed for each of them.
gpu_platforms:  # p4, p100 don't emit DCGM profiling metrics
  - model: a100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: v100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: t4
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: l4
    platforms:
      - debian-cloud:debian-11
      - ml-images:common-gpu-debian-11-py310
      - rocky-linux-cloud:rocky-linux-8
      - rocky-linux-cloud:rocky-linux-9
      - suse-cloud:sles-15
      - ubuntu-os-cloud:ubuntu-2004-lts
      - ubuntu-os-cloud:ubuntu-2204-lts
  - model: h100
    platforms:
      - ubuntu-os-cloud:ubuntu-minimal-2004-lts  # due to H100 quota, choose an image from the exhaustive list to skip presubmits

# Metrics expected from the integration. The shared label definitions and the
# profiling note are anchored on their first use and reused through aliases
# (*model_label, *uuid_label, ...) in the later entries.
expected_metrics:
  - type: workload.googleapis.com/dcgm.gpu.profiling.sm_utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.
    monitored_resources: [gce_instance]
    labels:
      - &model_label
        name: model
        value_regex: .*
        description: GPU model name.
      - &uuid_label
        name: uuid
        value_regex: .*
        description: GPU universally unique identifier.
      - &gpu_number_label
        name: gpu_number
        value_regex: "[0-9]*"
        description: GPU index starting at 0.
      - &instrumentation_source_label
        name: instrumentation_source
        value_regex: agent.googleapis.com/dcgm
    notes:
      - &dcgm_profiling_note Not available on GPU models P100 and P4.
    representative: true
  - type: workload.googleapis.com/dcgm.gpu.profiling.sm_occupancy
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - *instrumentation_source_label
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/dcgm.gpu.profiling.pipe_utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      # Label-level note: applies to the `pipe` label only.
      - name: pipe
        value_regex: tensor|fp64|fp32|fp16
        description: "GPU pipe in use, one of [tensor, fp64, fp32, fp16]."
        notes:
          - For L4, the `pipe` value `fp64` is not supported.
      - *instrumentation_source_label
    notes:
      - *dcgm_profiling_note
  # NOTE(review): this is the only profiling metric here without a `notes`
  # entry referencing *dcgm_profiling_note - confirm the omission is intended.
  - type: workload.googleapis.com/dcgm.gpu.profiling.dram_utilization
    value_type: DOUBLE
    kind: GAUGE
    description: Fraction of cycles data was being sent or received from GPU memory.
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - *instrumentation_source_label
  - type: workload.googleapis.com/dcgm.gpu.profiling.pcie_traffic_rate
    value_type: INT64
    kind: GAUGE
    description: The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.
    unit: By/s
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      # Anchored here and reused by nvlink_traffic_rate below.
      - &direction_label
        name: direction
        value_regex: tx|rx
        description: "Direction of the link traffic, one of [tx, rx]."
      - *instrumentation_source_label
    notes:
      - *dcgm_profiling_note
  - type: workload.googleapis.com/dcgm.gpu.profiling.nvlink_traffic_rate
    value_type: INT64
    kind: GAUGE
    description: The number of bytes sent over NVLink, not including protocol headers.
    unit: By/s
    monitored_resources: [gce_instance]
    labels:
      - *model_label
      - *uuid_label
      - *gpu_number_label
      - *direction_label
      - *instrumentation_source_label
    notes:
      - *dcgm_profiling_note

# Configuration fields of the `dcgm` metrics receiver, with their defaults.
configuration_options:
  metrics:
    - type: dcgm
      fields:
        - name: type
          default: null
          description: This value must be `dcgm`.
        - name: collection_interval
          default: 60s
          description: A [time duration](https://pkg.go.dev/time#ParseDuration) value, such as `30s` or `5m`.
        - name: receiver_version
          default: "1"
          description: Either 1 or 2. Version 2 has many more metrics available.
        - name: endpoint
          default: localhost:5555
          description: The DCGM service endpoint specified as `hostname:port`.

minimum_supported_agent_version:
  metrics: 2.38.0
public_url: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia