dlvm/gcp-gpu-utilization-metrics/report_gpu_metrics.py

# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Report GPU metrics.

Installs a monitoring agent that monitors GPU usage on the instance.
Reporting auto-creates the custom GPU metrics.
"""

from enum import Enum
import argparse
import csv
import subprocess
import time

import requests

from google.cloud import monitoring_v3

METADATA_SERVER = 'http://metadata/computeMetadata/v1/instance/'
METADATA_FLAVOR = {'Metadata-Flavor': 'Google'}


class GpuMetrics(Enum):
    """Types of metrics."""
    TIMESTAMP = 'timestamp'
    NAME = 'name'
    PCI_BUS_ID = 'pci.bus_id'
    DRIVER_VERSION = 'driver_version'
    PSTATE = 'pstate'
    PCIE_LINK_GEN_MAX = 'pcie.link.gen.max'
    PCIE_LINK_GEN_CURRENT = 'pcie.link.gen.current'
    TEMPERATURE_GPU = 'temperature.gpu'
    UTILIZATION_GPU = 'utilization.gpu'
    UTILIZATION_MEMORY = 'utilization.memory'
    MEMORY_TOTAL = 'memory.total'
    MEMORY_FREE = 'memory.free'
    MEMORY_USED = 'memory.used'

    @classmethod
    def all(cls):
        # Includes DRIVER_VERSION, which the original tuple omitted even
        # though it is a defined metric; validate() would otherwise reject it.
        return (
            cls.TIMESTAMP, cls.NAME, cls.PCI_BUS_ID, cls.DRIVER_VERSION,
            cls.PSTATE, cls.PCIE_LINK_GEN_MAX, cls.PCIE_LINK_GEN_CURRENT,
            cls.TEMPERATURE_GPU, cls.UTILIZATION_GPU, cls.UTILIZATION_MEMORY,
            cls.MEMORY_TOTAL, cls.MEMORY_FREE, cls.MEMORY_USED)

    @classmethod
    def validate(cls, key):
        if key not in cls.all():
            raise ValueError('Invalid metric key provided: %s.' % key)


def get_args():
    """Argument parser.

    Returns:
        Parsed argument namespace.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--sleep',
        type=int,
        default=15,
        help='number of seconds to wait between metric reports, default=15')
    args, _ = parser.parse_known_args()
    return args


def report_metric(value, metric_type, resource_values):
    """Create and write a time series point for one metric.

    Args:
        value: (int) Reported metric value.
        metric_type: (str) Custom metric type suffix.
        resource_values: (dict) Monitored-resource information
            (client, project_id, instance_id, zone).
    """
    client = resource_values.get('client')
    project_id = resource_values.get('project_id')
    instance_id = resource_values.get('instance_id')
    zone = resource_values.get('zone')
    project_name = client.common_project_path(project_id)
    # TimeSeries definition.
    series = monitoring_v3.types.TimeSeries()
    series.metric.type = 'custom.googleapis.com/{type}'.format(type=metric_type)
    series.resource.type = 'gce_instance'
    series.resource.labels['instance_id'] = instance_id
    series.resource.labels['zone'] = zone
    series.resource.labels['project_id'] = project_id
    now = time.time()
    seconds = int(now)
    nanos = int((now - seconds) * 10 ** 9)
    interval = monitoring_v3.TimeInterval(
        {'end_time': {'seconds': seconds, 'nanos': nanos}})
    point = monitoring_v3.Point(
        {'interval': interval, 'value': {'int64_value': value}})
    series.points = [point]
    client.create_time_series(name=project_name, time_series=[series])
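
# Illustrative sketch only (not executed): a single report_metric call with
# hypothetical resource values would look like the following. The project,
# instance and zone values here are placeholders, not values from this repo.
#
#   report_metric(
#       value=42,
#       metric_type='utilization_gpu',
#       resource_values={
#           'client': monitoring_v3.MetricServiceClient(),
#           'project_id': 'my-project',       # hypothetical
#           'instance_id': '1234567890123',   # hypothetical
#           'zone': 'us-central1-a',          # hypothetical
#       })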


def get_nvidia_smi_utilization(gpu_query_metric):
    """Obtain NVIDIA SMI utilization.

    Args:
        gpu_query_metric: (GpuMetrics) GPU query metric.

    Returns:
        An `int` average of the queried SMI value across all GPUs in the file.
    """
    csv_file_path = '/tmp/nvidia_smi_metrics.csv'
    lines = 0
    usage = 0
    subprocess.check_call([
        '/bin/bash', '-c',
        'nvidia-smi --query-gpu={gpu_query_metric} -u --format=csv'
        ' > {csv_file_path}'.format(
            gpu_query_metric=gpu_query_metric.value,
            csv_file_path=csv_file_path)
    ])
    with open(csv_file_path) as csvfile:
        rows = csv.reader(csvfile, delimiter=' ')
        for row in rows:
            lines += 1
            # Skip the CSV header row; each data row starts with the value.
            if lines > 1:
                usage += int(row[0])
    # Calculate the average across all GPUs.
    return int(usage / (lines - 1))


def get_metric_value(metric_name=''):
    """Query a single metric value via nvidia-smi.

    Supported metric names:
        timestamp, name, pci.bus_id, driver_version, pstate,
        pcie.link.gen.max, temperature.gpu, utilization.gpu,
        utilization.memory, memory.total, memory.free, memory.used

    https://nvidia.custhelp.com/app/answers/detail/a_id/3751/~/useful-nvidia-smi-queries

    Args:
        metric_name: (GpuMetrics) Metric to query.

    Returns:
        An `int` of smi utilization.
    """
    return get_nvidia_smi_utilization(metric_name)


def report_metrics(resource_values, sleep_time, metrics):
    """Collect and report metrics in an endless loop.

    Args:
        resource_values: (dict) Resource information to pass to Stackdriver.
        sleep_time: (int) Seconds to wait between reports.
        metrics: (dict) Metrics to collect.
    """
    while True:
        report_metric(
            value=get_metric_value(metrics.get('utilization_memory')),
            metric_type='utilization_memory',
            resource_values=resource_values)
        report_metric(
            value=get_metric_value(metrics.get('utilization_gpu')),
            metric_type='utilization_gpu',
            resource_values=resource_values)
        report_metric(
            value=get_metric_value(metrics.get('memory_used')),
            metric_type='memory_used',
            resource_values=resource_values)
        time.sleep(sleep_time)


def _get_resource_values():
    """Get monitored-resource values from the metadata server.

    Returns:
        A dict with client, instance_id, zone and project_id.
    """
    # Get instance information. The zone metadata has the form
    # projects/<project>/zones/<zone>.
    data = requests.get(
        '{}zone'.format(METADATA_SERVER), headers=METADATA_FLAVOR).text
    instance_id = requests.get(
        '{}id'.format(METADATA_SERVER), headers=METADATA_FLAVOR).text
    client = monitoring_v3.MetricServiceClient()
    # Collect zone.
    zone = data.split('/')[3]
    # Collect project id.
    project_id = data.split('/')[1]
    resource_values = {
        'client': client,
        'instance_id': instance_id,
        'zone': zone,
        'project_id': project_id
    }
    return resource_values


def main(args):
    resource_values = _get_resource_values()
    # Dictionary with default metrics.
    metrics = {
        'utilization_memory': GpuMetrics.UTILIZATION_MEMORY,
        'utilization_gpu': GpuMetrics.UTILIZATION_GPU,
        'memory_used': GpuMetrics.MEMORY_USED
    }
    report_metrics(resource_values, args.sleep, metrics)


if __name__ == '__main__':
    args = get_args()
    main(args)
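
# Usage sketch (assumes a GCE instance with attached GPUs, the NVIDIA driver
# installed, and `nvidia-smi` on PATH; the flag value is an example):
#
#   python report_gpu_metrics.py --sleep 30
#
# Each cycle writes utilization_memory, utilization_gpu and memory_used as
# custom.googleapis.com/* metrics against the gce_instance resource.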