cmd/nvidia_gpu/nvidia_gpu.go (114 lines of code) (raw):
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"encoding/json"
"flag"
"fmt"
"io/ioutil"
"time"
gpumanager "github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia"
healthcheck "github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia/health_check"
"github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia/metrics"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/golang/glog"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
// Fixed (non-flag) settings of the device plugin.
const (
	// devDirectory is the device directory handed to the GPU manager.
	devDirectory = "/dev"
	// procDirectory is used to look up the access files for each GPU
	// partition.
	procDirectory = "/proc"

	// kubeletEndpoint is the kubelet socket name passed to Serve.
	kubeletEndpoint = "kubelet.sock"
	// pluginEndpointPrefix is the leading part of the plugin's own socket
	// file name ("<prefix>-<unix time>.sock").
	pluginEndpointPrefix = "nvidiaGPU"
)
// Command-line flags of the device plugin. Each variable holds the pointer
// returned by the flag package and is valid to dereference after flag.Parse.
var (
	// Host/container path pairs that main turns into read-only mounts.
	hostPathPrefix = flag.String(
		"host-path",
		"/home/kubernetes/bin/nvidia",
		"Path on the host that contains nvidia libraries. This will be mounted inside the container as '-container-path'",
	)
	containerPathPrefix = flag.String(
		"container-path",
		"/usr/local/nvidia",
		"Path on the container that mounts '-host-path'",
	)
	hostVulkanICDPathPrefix = flag.String(
		"host-vulkan-icd-path",
		"/home/kubernetes/bin/nvidia/vulkan/icd.d",
		"Path on the host that contains the Nvidia Vulkan installable client driver. This will be mounted inside the container as '-container-vulkan-icd-path'",
	)
	containerVulkanICDPathPrefix = flag.String(
		"container-vulkan-icd-path",
		"/etc/vulkan/icd.d",
		"Path on the container that mounts '-host-vulkan-icd-path'",
	)

	// Plugin socket location and GPU configuration file.
	pluginMountPath = flag.String(
		"plugin-directory",
		"/device-plugin",
		"The directory path to create plugin socket",
	)
	gpuConfigFile = flag.String(
		"gpu-config",
		"/etc/nvidia/gpu_config.json",
		"File with GPU configurations for device plugin",
	)

	// Container GPU metrics.
	enableContainerGPUMetrics = flag.Bool(
		"enable-container-gpu-metrics",
		false,
		"If true, the device plugin will expose GPU metrics for containers with allocated GPU",
	)
	gpuMetricsPort = flag.Int(
		"gpu-metrics-port",
		2112,
		"Port on which GPU metrics for containers are exposed",
	)
	gpuMetricsCollectionIntervalMs = flag.Int(
		"gpu-metrics-collection-interval",
		30000,
		"Collection interval (in milli seconds) for container GPU metrics",
	)

	// GPU health monitoring.
	enableHealthMonitoring = flag.Bool(
		"enable-health-monitoring",
		false,
		"If true, the device plugin will detect critical Xid errors and mark the GPUs unallocatable",
	)
)
// parseGPUConfig reads the file at gpuConfigFile, decodes its JSON content
// into a gpumanager.GPUConfig and then calls AddDefaultsAndValidate on it.
//
// On success the resulting config is returned with a nil error. On any
// failure the zero-value GPUConfig is returned together with the error; read
// and decode errors wrap the underlying cause (%w) so callers can inspect it
// with errors.Is / errors.As.
func parseGPUConfig(gpuConfigFile string) (gpumanager.GPUConfig, error) {
	gpuConfigContent, err := ioutil.ReadFile(gpuConfigFile)
	if err != nil {
		return gpumanager.GPUConfig{}, fmt.Errorf("unable to read gpu config file %s: %w", gpuConfigFile, err)
	}

	var gpuConfig gpumanager.GPUConfig
	if err = json.Unmarshal(gpuConfigContent, &gpuConfig); err != nil {
		// Unmarshal may already have filled some fields; never hand a
		// half-populated config back alongside an error.
		return gpumanager.GPUConfig{}, fmt.Errorf("failed to parse GPU config file contents: %s, error: %w", gpuConfigContent, err)
	}

	if err = gpuConfig.AddDefaultsAndValidate(); err != nil {
		return gpumanager.GPUConfig{}, err
	}
	return gpuConfig, nil
}
// main is the entry point of the NVIDIA GPU device plugin.
//
// In order, it:
//  1. parses the command-line flags and builds the read-only mounts for the
//     NVIDIA libraries and the Vulkan ICD directory;
//  2. loads the GPU config file (falling back to an empty config when it
//     cannot be parsed) and adds the health-critical Xid settings;
//  3. blocks until CheckDevicePaths succeeds, then initializes NVML;
//  4. retries Start on the GPU manager until it succeeds;
//  5. optionally starts the container GPU metrics server and the GPU health
//     checker;
//  6. calls Serve on the GPU manager with the plugin directory, the kubelet
//     socket name and a timestamped plugin socket name.
func main() {
	flag.Parse()
	// glog buffers its output. Flush on every return path so the last
	// messages (notably the reason for an early exit) are not lost. This is
	// deferred first so that it runs after all other deferred cleanup.
	defer glog.Flush()

	glog.Infoln("device-plugin started")
	mountPaths := []pluginapi.Mount{
		{HostPath: *hostPathPrefix, ContainerPath: *containerPathPrefix, ReadOnly: true},
		{HostPath: *hostVulkanICDPathPrefix, ContainerPath: *containerVulkanICDPathPrefix, ReadOnly: true},
	}

	var gpuConfig gpumanager.GPUConfig
	if *gpuConfigFile != "" {
		glog.Infof("Reading GPU config file: %s", *gpuConfigFile)
		var err error
		gpuConfig, err = parseGPUConfig(*gpuConfigFile)
		if err != nil {
			// A broken config file is not fatal: continue with defaults.
			glog.Warningf("Failed to parse GPU config file %s: %v", *gpuConfigFile, err)
			glog.Warningf("Falling back to default GPU config.")
			gpuConfig = gpumanager.GPUConfig{}
		}
	}
	if err := gpuConfig.AddHealthCriticalXid(); err != nil {
		// Best effort: the plugin keeps running with whatever was configured.
		glog.Warningf("Failed to Add HealthCriticalXid : %v", err)
	}
	glog.Infof("Using gpu config: %v", gpuConfig)

	ngm := gpumanager.NewNvidiaGPUManager(devDirectory, procDirectory, mountPaths, gpuConfig)

	// Retry until nvidiactl and nvidia-uvm are detected. This is required
	// because Nvidia drivers may not be installed initially.
	for {
		err := ngm.CheckDevicePaths()
		if err == nil {
			break
		}
		// Use non-default level to avoid log spam.
		glog.V(3).Infof("nvidiaGPUManager.CheckDevicePaths() failed: %v", err)
		time.Sleep(5 * time.Second)
	}

	if ret := nvml.Init(); ret != nvml.SUCCESS {
		glog.Fatalf("failed to initialize nvml: %v", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	// Keep retrying every 5 seconds until the GPU manager starts.
	for {
		err := ngm.Start()
		if err == nil {
			break
		}
		glog.Errorf("failed to start GPU device manager: %v", err)
		time.Sleep(5 * time.Second)
	}

	if *enableContainerGPUMetrics {
		if gpuConfig.GPUPartitionSize != "" {
			glog.Info("Using multi-instance GPU, metrics are not supported.")
		} else {
			glog.Infof("Starting metrics server on port: %d, endpoint path: %s, collection frequency: %d", *gpuMetricsPort, "/metrics", *gpuMetricsCollectionIntervalMs)
			metricServer := metrics.NewMetricServer(*gpuMetricsCollectionIntervalMs, *gpuMetricsPort, "/metrics")
			if err := metricServer.Start(); err != nil {
				// Return (rather than Fatalf) so the deferred cleanup above
				// still runs before the process exits.
				glog.Errorf("Failed to start metric server: %v", err)
				return
			}
			defer metricServer.Stop()
		}
	}

	if *enableHealthMonitoring {
		hc := healthcheck.NewGPUHealthChecker(ngm.ListPhysicalDevices(), ngm.Health, ngm.ListHealthCriticalXid())
		if err := hc.Start(); err != nil {
			// Return (rather than Fatalf) so the deferred cleanup above
			// still runs before the process exits.
			glog.Errorf("Failed to start GPU Health Checker: %v", err)
			return
		}
		defer hc.Stop()
	}

	// The plugin socket name carries the current Unix time, so each run of
	// the process uses a differently named socket file.
	ngm.Serve(*pluginMountPath, kubeletEndpoint, fmt.Sprintf("%s-%d.sock", pluginEndpointPrefix, time.Now().Unix()))
}