nvidia-persistenced-installer/nvidia_persistenced_installer.go
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
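
// Command nvidia_persistenced_installer enables NVIDIA persistence mode on
// Confidential GPU (TDX) nodes. It starts nvidia-persistenced from the host
// driver installation mounted into the container, marks the GPU as ready via
// nvidia-smi, and then blocks so the daemon keeps running in the sidecar.
//
// Usage (the flag values shown are the defaults):
//
//	nvidia_persistenced_installer --container-path=/usr/local/nvidia \
//		--cgpu-config=/etc/nvidia/confidential_node_type.txt --ready-delay-ms=1000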
package main

import (
"context"
"flag"
"fmt"
"os"
"os/exec"
"os/signal"
"regexp"
"strconv"
"strings"
"syscall"
"time"
"github.com/golang/glog"
)
const (
minUVMSupportedVersion = 550
)

var (
	readFile            = os.ReadFile
	containerPathPrefix = flag.String("container-path", "/usr/local/nvidia", "Path in the container where the host's NVIDIA install directory is mounted.")
	cgpuConfigFile      = flag.String("cgpu-config", "/etc/nvidia/confidential_node_type.txt", "File containing the Confidential Node Type of the node.")
	readyDelay          = flag.Int64("ready-delay-ms", 1000, "Time in milliseconds to wait before setting the GPU to the ready state; a small delay reduces the chance of a startup error.")
)

func main() {
flag.Parse()
ctx := context.Background()
	// Only run the persistence daemon on Confidential GPU nodes.
enabled, err := checkConfidentialGPUEnablement(ctx)
if err != nil {
glog.ExitContextf(ctx, "parseCGPUConfig failed: %v", err)
}
if enabled {
		// Update the container's ld cache so that nvidia-smi can be run from the container to set the GPU to a ready state.
if err := updateContainerLdCache(); err != nil {
glog.ExitContextf(ctx, "updateContainerLdCache failed: %v", err)
}
if err := enablePersistenceMode(ctx); err != nil {
glog.ExitContextf(ctx, "failed to start persistence mode: %v", err)
}
		// Wait briefly before setting the ready state for consistency.
		// If the workload starts too soon after the persistence daemon, startup errors can sometimes occur.
time.Sleep(time.Duration(*readyDelay) * time.Millisecond)
if err := setGPUReadyState(ctx); err != nil {
glog.ExitContextf(ctx, "failed to set gpu to ready state: %v", err)
}
} else {
glog.InfoContext(ctx, "Confidential GPU is NOT enabled, skipping nvidia persistenced enablement.")
		// Don't exit: this binary is intended to run as a sidecar, and exiting would cause the container to restart indefinitely.
}
	// Keep the container alive so that the NVIDIA persistence daemon keeps running.
	// Create a channel to listen for termination signals (SIGINT and SIGTERM).
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	// Block until a termination signal is received.
	sig := <-sigChan
	glog.InfoContextf(ctx, "Received signal: %v. Shutting down...", sig)
}
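
// enablePersistenceMode starts the NVIDIA persistence daemon from the mounted
// host install directory. UVM persistence mode is requested only when the
// driver major version supports it (R550+).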
func enablePersistenceMode(ctx context.Context) error {
glog.InfoContext(ctx, "Starting NVIDIA persistence daemon.")
cmdArgs := []string{}
if versionMajor, err := nvidiaVersionMajor(ctx); err != nil {
return err
} else if versionMajor >= minUVMSupportedVersion {
// UVM persistence mode is only available starting at R550.
cmdArgs = append(cmdArgs, "--uvm-persistence-mode")
glog.InfoContext(ctx, "using --uvm-persistence-mode")
}
cmdArgs = append(cmdArgs, "--nvidia-cfg-path="+*containerPathPrefix+"/lib64")
persistencedCMD := exec.Command(*containerPathPrefix+"/bin/nvidia-persistenced", cmdArgs...)
if err := persistencedCMD.Run(); err != nil {
return err
}
glog.InfoContext(ctx, "NVIDIA Persistence Mode Enabled.")
return nil
}
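
// setGPUReadyState runs `nvidia-smi conf-compute -srs 1` to mark the
// Confidential GPU as ready to accept workloads.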
func setGPUReadyState(ctx context.Context) error {
gpuReadyCMD := exec.Command(*containerPathPrefix+"/bin/nvidia-smi", "conf-compute", "-srs", "1")
if err := gpuReadyCMD.Run(); err != nil {
return err
}
glog.InfoContext(ctx, "Confidential GPU is ready.")
return nil
}
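
// updateContainerLdCache adds the mounted NVIDIA lib64 directory to the
// container's dynamic linker configuration and rebuilds the linker cache so
// that nvidia-smi and nvidia-persistenced can find the driver libraries.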
func updateContainerLdCache() error {
	f, err := os.Create("/etc/ld.so.conf.d/nvidia.conf")
	if err != nil {
		// Don't touch f on error: it is nil when os.Create fails.
		return fmt.Errorf("failed to update ld cache: %w", err)
	}
	if _, err := f.WriteString(*containerPathPrefix + "/lib64"); err != nil {
		f.Close()
		return fmt.Errorf("failed to update ld cache: %w", err)
	}
	if err := f.Close(); err != nil {
		return fmt.Errorf("failed to update ld cache: %w", err)
	}
	if err := exec.Command("ldconfig").Run(); err != nil {
		return fmt.Errorf("failed to update ld cache: %w", err)
	}
	return nil
}
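
// getLoadedNVIDIAKernelModuleVersion reads the version of the loaded NVIDIA
// kernel module (e.g. "535.230.02") from versionFilePath, typically
// /proc/driver/nvidia/version, which usually begins with a line like:
//
//	NVRM version: NVIDIA UNIX x86_64 Kernel Module  535.230.02  ...
//
// It returns the empty string if the file cannot be read or no version is found.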
func getLoadedNVIDIAKernelModuleVersion(ctx context.Context, versionFilePath string) string {
glog.InfoContextf(ctx, "Attempting to read nvidia gpu driver version from: %s", versionFilePath)
content, err := os.ReadFile(versionFilePath)
if err != nil {
glog.ErrorContextf(ctx, "Failed to read version file: %v", err)
return ""
}
contentStr := string(content)
kernelModuleVersionPattern := regexp.MustCompile(`\d+\.\d+\.\d+`)
kernelModuleVersion := kernelModuleVersionPattern.FindString(contentStr)
glog.InfoContextf(ctx, "nvidia gpu driver version: %s", kernelModuleVersion)
return kernelModuleVersion
}
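
// nvidiaVersionMajor returns the major version of the loaded NVIDIA GPU
// driver, e.g. 535 for driver version 535.230.02.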
func nvidiaVersionMajor(ctx context.Context) (int, error) {
version := getLoadedNVIDIAKernelModuleVersion(ctx, "/proc/driver/nvidia/version")
if version == "" {
return 0, fmt.Errorf("failed to read nvidia gpu driver version at /proc/driver/nvidia/version")
}
	// The regex above guarantees a version of the form 535.230.02; current
	// NVIDIA driver major versions are three digits.
	before, _, found := strings.Cut(version, ".")
	if !found || len(before) != 3 {
return 0, fmt.Errorf("invalid nvidia gpu driver version: %v", version)
}
versionMajor, err := strconv.Atoi(before)
if err != nil {
return 0, fmt.Errorf("invalid nvidia gpu driver version(%v), %w", version, err)
}
return versionMajor, nil
}
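
// checkConfidentialGPUEnablement reports whether the node is a Confidential
// GPU (TDX) node according to the confidential node type file. A missing
// file is treated as disabled.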
func checkConfidentialGPUEnablement(ctx context.Context) (bool, error) {
file, err := readFile(*cgpuConfigFile)
if err != nil {
		// Treat a missing file as disabled.
if os.IsNotExist(err) {
glog.InfoContextf(ctx, "confidential node type file not found at %v, skipping persistenced installation", *cgpuConfigFile)
return false, nil
}
return false, err
}
	// Trim surrounding whitespace and NUL bytes to avoid comparison issues.
confidentialNodeType := strings.ToLower(strings.Trim(string(file), " \r\n\x00"))
return confidentialNodeType == "tdx", nil
}