integration_test/third_party_apps_test/applications/nvml/metadata.yaml (120 lines of code) (raw):

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Metadata for the NVML (NVIDIA Management Library) integration.

# Identification and display fields.
app_url: 'https://developer.nvidia.com/nvidia-management-library-nvml'
short_name: nvml
long_name: nvml
# supplied by google technical writer
logo_path: /images/partners/todo.png

# Free-text fields; `|-` keeps the text literal and strips the trailing newline.
description: |-
  The NVIDIA Management Library (or NVML) integration collects GPU utilization and GPU used memory metrics from NVIDIA GPUs.
configure_integration: |-
  You must install the NVIDIA driver on a host with NVIDIA GPUs.

supported_operating_systems: linux
# Quoted so the version is always read as a string.
supported_app_version:
  - '515.65.01'

# GPU models, each with the list of platform images given for it.
gpu_platforms:
  - model: a100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: v100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: p4
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: t4
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
      - ml-images:common-gpu-debian-11-py310
  - model: p100
    platforms:
      - ubuntu-os-cloud:ubuntu-2004-lts
  - model: l4
    platforms:
      - debian-cloud:debian-11
      - ml-images:common-gpu-debian-11-py310
      - rocky-linux-cloud:rocky-linux-8
      - rocky-linux-cloud:rocky-linux-9
      - suse-cloud:sles-15
      - ubuntu-os-cloud:ubuntu-2004-lts
      - ubuntu-os-cloud:ubuntu-2204-lts
  - model: h100
    platforms:
      # due to H100 quota, choose an image from the exhaustive list to skip presubmits
      - ubuntu-os-cloud:ubuntu-minimal-2004-lts

# Metric descriptors: type, value type, kind, monitored resources and the
# labels (name plus a regex for the label value) attached to each metric.
expected_metrics:
  - type: agent.googleapis.com/gpu/utilization
    value_type: DOUBLE
    kind: GAUGE
    monitored_resources:
      - gce_instance
    labels:
      - name: model
        value_regex: '.*'
      - name: uuid
        value_regex: '.*'
      - name: gpu_number
        value_regex: '[0-9]*'
    representative: true

  - type: agent.googleapis.com/gpu/memory/bytes_used
    value_type: INT64
    kind: GAUGE
    monitored_resources:
      - gce_instance
    labels:
      - name: model
        value_regex: '.*'
      - name: uuid
        value_regex: '.*'
      - name: gpu_number
        value_regex: '[0-9]*'
      - name: memory_state
        value_regex: 'free|used'

  - type: agent.googleapis.com/gpu/processes/utilization
    value_type: DOUBLE
    kind: GAUGE
    monitored_resources:
      - gce_instance
    labels:
      - name: model
        value_regex: '.*'
      - name: uuid
        value_regex: '.*'
      - name: gpu_number
        value_regex: '[0-9]*'
      - name: pid
        value_regex: '[0-9]*'
      - name: process
        value_regex: '.*'
      - name: command
        value_regex: '.*'
      - name: command_line
        value_regex: '.*'
      - name: owner
        value_regex: '.*'

  - type: agent.googleapis.com/gpu/processes/max_bytes_used
    value_type: INT64
    kind: GAUGE
    monitored_resources:
      - gce_instance
    labels:
      - name: model
        value_regex: '.*'
      - name: uuid
        value_regex: '.*'
      - name: gpu_number
        value_regex: '[0-9]*'
      - name: pid
        value_regex: '[0-9]*'
      - name: process
        value_regex: '.*'
      - name: command
        value_regex: '.*'
      - name: command_line
        value_regex: '.*'
      - name: owner
        value_regex: '.*'

# Configuration fields of the `hostmetrics` metrics entry, with their defaults.
configuration_options:
  metrics:
    - type: hostmetrics
      fields:
        - name: type
          default: null
          description: This value must be `hostmetrics`.
        - name: collection_interval
          default: 60s
          description: A [time duration](https://pkg.go.dev/time#ParseDuration) value, such as `30s` or `5m`.

# Quoted so the version is always read as a string.
minimum_supported_agent_version:
  metrics: '2.38.0'