pkg/gpu/nvidia/metrics/devices.go (103 lines of code) (raw):
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package metrics
import (
"context"
"fmt"
"net"
"regexp"
"time"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia/gpusharing"
"github.com/golang/glog"
"google.golang.org/grpc"
podresources "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
)
var (
socketPath = "/var/lib/kubelet/pod-resources/kubelet.sock"
gpuResourceName = "nvidia.com/gpu"
gpuPathRegex = regexp.MustCompile("/dev/(nvidia[0-9]+)$")
connectionTimeout = 10 * time.Second
gpuDevices map[string]*nvml.Device
)
// ContainerID uniquely identifies a container.
type ContainerID struct {
namespace string
pod string
container string
}
// GetDevicesForAllContainers returns a map with container as the key and the list of devices allocated to that container as the value.
// It will skip time-shared GPU devices when time-sharing solution is enabled.
func GetDevicesForAllContainers() (map[ContainerID][]string, error) {
containerDevices := make(map[ContainerID][]string)
conn, err := grpc.Dial(
socketPath,
grpc.WithInsecure(),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
}))
defer func() {
err := conn.Close()
if err != nil {
glog.Warningf("Failed to close grpc connection to kubelet PodResourceLister endpoint: %v", err)
}
}()
if err != nil {
return containerDevices, fmt.Errorf("error connecting to kubelet PodResourceLister service: %v", err)
}
client := podresources.NewPodResourcesListerClient(conn)
resp, err := client.List(context.Background(), &podresources.ListPodResourcesRequest{})
if err != nil {
return containerDevices, fmt.Errorf("error listing pod resources: %v", err)
}
for _, pod := range resp.PodResources {
container := ContainerID{
namespace: pod.Namespace,
pod: pod.Name,
}
for _, c := range pod.Containers {
container.container = c.Name
for _, d := range c.Devices {
if len(d.DeviceIds) == 0 || d.ResourceName != gpuResourceName {
continue
}
var devices []string
for _, deviceID := range d.DeviceIds {
if gpusharing.IsVirtualDeviceID(deviceID) {
continue
}
devices = append(devices, deviceID)
}
containerDevices[container] = append(containerDevices[container], devices...)
}
}
}
return containerDevices, nil
}
func GetAllGpuDevices() map[string]*nvml.Device {
return gpuDevices
}
// DiscoverGPUDevices discovers GPUs attached to the node, and updates `gpuDevices` map.
func DiscoverGPUDevices() error {
count, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device count: %s", nvml.ErrorString(ret))
}
glog.Infof("Found %d GPU devices", count)
gpuDevices = make(map[string]*nvml.Device)
for i := int(0); i < count; i++ {
device, ret := nvml.DeviceGetHandleByIndex(i)
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to read device with index %d: %v", i, nvml.ErrorString(ret))
}
minor, ret := device.GetMinorNumber()
if ret != nvml.SUCCESS {
glog.Errorf("Invalid GPU device minor number found. Skipping this device")
}
deviceName := fmt.Sprintf("nvidia%d", minor)
glog.Infof("Found device %s for metrics collection", deviceName)
gpuDevices[deviceName] = &device
}
return nil
}
// DeviceFromName returns the device object for a given device name.
func DeviceFromName(deviceName string) (*nvml.Device, error) {
device, ok := gpuDevices[deviceName]
if !ok {
return &nvml.Device{}, fmt.Errorf("device %s not found", deviceName)
}
return device, nil
}