pkg/gpu/nvidia/mig/mig.go:
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mig
import (
"fmt"
"io/ioutil"
"os"
"path"
"regexp"
"strconv"
"github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia/nvmlutil"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/golang/glog"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
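// nvidiaDeviceRE matches the device nodes of whole GPUs under devDirectory,
// e.g. "nvidia0" or "nvidia7"; control nodes such as "nvidiactl" or "nvidia-uvm"
// do not match.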
const nvidiaDeviceRE = `^nvidia[0-9]*$`
// Max number of GPU partitions that can be created for each partition size.
// Source: https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning
var (
gpuPartitionSizeMaxCount = map[string]int{
//nvidia-tesla-a100
"1g.5gb": 7,
"2g.10gb": 3,
"3g.20gb": 2,
"7g.40gb": 1,
//nvidia-a100-80gb, nvidia-h100-80gb
"1g.10gb": 7,
"2g.20gb": 3,
"3g.40gb": 2,
"7g.80gb": 1,
//nvidia-h100-80gb
"1g.20gb": 4,
//nvidia-h200-141gb
"1g.18gb": 7,
"1g.35gb": 4,
"2g.35gb": 3,
"3g.71gb": 2,
"4g.71gb": 1,
"7g.141gb": 1,
//nvidia-b200, nvidia-gb200
"1g.23gb": 7,
//nvidia-b200
"1g.45gb": 4,
"2g.45gb": 3,
"3g.90gb": 2,
"4g.90gb": 1,
"7g.180gb": 1,
//nvidia-gb200
"1g.47gb": 4,
"2g.47gb": 3,
"3g.93gb": 2,
"4g.93gb": 1,
"7g.186gb": 1,
}
pciDevicesRoot = "/sys/bus/pci/devices"
)
// DeviceManager performs various management operations on MIG devices.
type DeviceManager struct {
devDirectory string
procDirectory string
gpuPartitionSpecs map[string][]pluginapi.DeviceSpec
gpuPartitions map[string]pluginapi.Device
}
// NewDeviceManager creates a new DeviceManager to handle MIG devices on the node.
func NewDeviceManager(devDirectory, procDirectory string) DeviceManager {
return DeviceManager{
devDirectory: devDirectory,
procDirectory: procDirectory,
gpuPartitionSpecs: make(map[string][]pluginapi.DeviceSpec),
gpuPartitions: make(map[string]pluginapi.Device),
}
}
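
// A minimal usage sketch (illustrative only; the device and proc paths below are
// assumptions, and the real wiring lives elsewhere in the plugin):
//
//	dm := NewDeviceManager("/dev", "/proc")
//	if err := dm.Start("1g.5gb"); err != nil {
//		glog.Fatalf("failed to start MIG device manager: %v", err)
//	}
//	for id := range dm.ListGPUPartitionDevices() {
//		specs, _ := dm.DeviceSpec(id)
//		glog.Infof("partition %s maps to %d device nodes", id, len(specs))
//	}
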
// ListGPUPartitionDevices lists all the GPU partitions as devices that can be advertised as
// resources available on the node.
func (d *DeviceManager) ListGPUPartitionDevices() map[string]pluginapi.Device {
return d.gpuPartitions
}
// DeviceSpec returns the device spec that includes the list of devices to allocate for a deviceID.
func (d *DeviceManager) DeviceSpec(deviceID string) ([]pluginapi.DeviceSpec, error) {
deviceSpecs, ok := d.gpuPartitionSpecs[deviceID]
if !ok {
return []pluginapi.DeviceSpec{}, fmt.Errorf("invalid allocation request with non-existing GPU partition: %s", deviceID)
}
return deviceSpecs, nil
}
// Start discovers the MIG partitions created on the node for the given partition size,
// validates that each GPU exposes the expected number of partitions, and records the
// device specs needed to allocate them. It is a no-op when partitionSize is empty.
func (d *DeviceManager) Start(partitionSize string) error {
if partitionSize == "" {
return nil
}
maxPartitionCount, ok := gpuPartitionSizeMaxCount[partitionSize]
if !ok {
return fmt.Errorf("%s is not a valid GPU partition size", partitionSize)
}
d.gpuPartitionSpecs = make(map[string][]pluginapi.DeviceSpec)
nvidiaCapDir := path.Join(d.procDirectory, "driver/nvidia/capabilities")
capFiles, err := ioutil.ReadDir(nvidiaCapDir)
if err != nil {
return fmt.Errorf("failed to read capabilities directory (%s): %v", nvidiaCapDir, err)
}
gpuFileRegexp := regexp.MustCompile("gpu([0-9]+)")
giFileRegexp := regexp.MustCompile("gi([0-9]+)")
deviceRegexp := regexp.MustCompile("DeviceFileMinor: ([0-9]+)")
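	// The loop below expects the NVIDIA driver to expose MIG capabilities under
	// <procDirectory>/driver/nvidia/capabilities with a layout of
	//
	//	gpu<N>/mig/gi<M>/access
	//	gpu<N>/mig/gi<M>/ci0/access
	//
	// where each access file contains a "DeviceFileMinor: <minor>" line naming the
	// corresponding node under <devDirectory>/nvidia-caps.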
numPartitionedGPUs := 0
for _, capFile := range capFiles {
m := gpuFileRegexp.FindStringSubmatch(capFile.Name())
if len(m) != 2 {
			// Not a GPU capability directory; skip it.
continue
}
gpuID := m[1]
numPartitionedGPUs++
giBasePath := path.Join(nvidiaCapDir, capFile.Name(), "mig")
giFiles, err := ioutil.ReadDir(giBasePath)
if err != nil {
return fmt.Errorf("failed to read GPU instance capabilities dir (%s): %v", giBasePath, err)
}
numPartitions := 0
for _, giFile := range giFiles {
if !giFileRegexp.MatchString(giFile.Name()) {
continue
}
numPartitions++
gpuInstanceID := "nvidia" + gpuID + "/" + giFile.Name()
giAccessFile := path.Join(giBasePath, giFile.Name(), "access")
content, err := ioutil.ReadFile(giAccessFile)
if err != nil {
return fmt.Errorf("failed to read GPU Instance access file (%s): %v", giAccessFile, err)
}
m := deviceRegexp.FindStringSubmatch(string(content))
if len(m) != 2 {
return fmt.Errorf("unexpected contents in GPU instance access file(%s): %v", giAccessFile, err)
}
giMinorDevice, err := strconv.Atoi(m[1])
if err != nil {
return fmt.Errorf("failed to find minor device from GPU instance access file(%s): %v", giAccessFile, err)
}
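			// Only the first compute instance (ci0) is read here: each GPU instance is
			// assumed to expose exactly one compute instance.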
ciAccessFile := path.Join(giBasePath, giFile.Name(), "ci0", "access")
content, err = ioutil.ReadFile(ciAccessFile)
if err != nil {
return fmt.Errorf("unable to read Compute Instance access file (%s): %v", ciAccessFile, err)
}
m = deviceRegexp.FindStringSubmatch(string(content))
if len(m) != 2 {
return fmt.Errorf("unexpected contents in compute instance access file(%s): %v", ciAccessFile, err)
}
ciMinorDevice, err := strconv.Atoi(m[1])
if err != nil {
return fmt.Errorf("failed to find minor device from compute instance access file(%s): %v", ciAccessFile, err)
}
gpuDevice := path.Join(d.devDirectory, "nvidia"+gpuID)
if _, err := os.Stat(gpuDevice); err != nil {
return fmt.Errorf("GPU device (%s) not fount: %v", gpuDevice, err)
}
giDevice := path.Join(d.devDirectory, "nvidia-caps", "nvidia-cap"+strconv.Itoa(giMinorDevice))
if _, err := os.Stat(giDevice); err != nil {
return fmt.Errorf("GPU instance device (%s) not fount: %v", giDevice, err)
}
ciDevice := path.Join(d.devDirectory, "nvidia-caps", "nvidia-cap"+strconv.Itoa(ciMinorDevice))
if _, err := os.Stat(ciDevice); err != nil {
return fmt.Errorf("Compute instance device (%s) not fount: %v", ciDevice, err)
}
glog.Infof("Discovered GPU partition: %s", gpuInstanceID)
d.gpuPartitionSpecs[gpuInstanceID] = []pluginapi.DeviceSpec{
{
ContainerPath: gpuDevice,
HostPath: gpuDevice,
Permissions: "mrw",
},
{
ContainerPath: giDevice,
HostPath: giDevice,
Permissions: "mrw",
},
{
ContainerPath: ciDevice,
HostPath: ciDevice,
Permissions: "mrw",
},
}
topologyInfo, err := d.topology(gpuID)
if err != nil {
glog.Errorf("unable to get topology for device with index %d: %v", gpuID, err)
}
d.gpuPartitions[gpuInstanceID] = pluginapi.Device{ID: gpuInstanceID, Health: pluginapi.Healthy, Topology: topologyInfo}
}
if numPartitions != maxPartitionCount {
return fmt.Errorf("Number of partitions (%d) for GPU %s does not match expected partition count (%d)", numPartitions, gpuID, maxPartitionCount)
}
}
numGPUs, err := d.discoverNumGPUs()
if err != nil {
return err
}
if numPartitionedGPUs != numGPUs {
return fmt.Errorf("Not all GPUs are partitioned as expected. Total number of GPUs: %d, number of partitioned GPUs: %d", numGPUs, numPartitionedGPUs)
}
return nil
}
// SetDeviceHealth sets the health status for a GPU partition.
func (d *DeviceManager) SetDeviceHealth(name string, health string, topology *pluginapi.TopologyInfo) {
d.gpuPartitions[name] = pluginapi.Device{ID: name, Health: health, Topology: topology}
}
// discoverNumGPUs counts the NVIDIA GPU devices available on the local node by walking the DeviceManager's devDirectory.
func (d *DeviceManager) discoverNumGPUs() (int, error) {
numGPUs := 0
reg := regexp.MustCompile(nvidiaDeviceRE)
files, err := ioutil.ReadDir(d.devDirectory)
if err != nil {
return 0, fmt.Errorf("failed to read devices on node: %v", err)
}
for _, f := range files {
if f.IsDir() {
continue
}
if reg.MatchString(f.Name()) {
numGPUs++
}
}
return numGPUs, nil
}
func (d *DeviceManager) topology(deviceIndex string) (*pluginapi.TopologyInfo, error) {
index, err := strconv.Atoi(deviceIndex)
if err != nil {
return nil, fmt.Errorf("unable to convert deviceIndex %q string to int: %v", deviceIndex, err)
}
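	// nvmlutil.NvmlDeviceInfo may already be populated (e.g. with a stub in tests);
	// otherwise fall back to the default NVML-backed implementation.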
if nvmlutil.NvmlDeviceInfo == nil {
nvmlutil.NvmlDeviceInfo = &nvmlutil.DeviceInfo{}
}
device, ret := nvmlutil.NvmlDeviceInfo.DeviceHandleByIndex(index)
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to get mig device handle: %v", nvml.ErrorString(ret))
}
return nvmlutil.Topology(device, pciDevicesRoot)
}