in pkg/gpu/nvidia/manager.go [376:410]
// Start prepares the manager for use. In order, it:
//
//  1. rebuilds the default device list from the control and UVM device paths,
//     adding the modeset and UVM-tools devices when they exist under
//     devDirectory;
//  2. discovers GPUs;
//  3. starts the MIG device manager when a GPU partition size is configured;
//  4. when the GPU sharing strategy is "mps", checks MPS health, adds a
//     read-write mount for the MPS directory, and records the total memory
//     per GPU.
//
// It returns the first error encountered. Errors from the MIG, MPS and memory
// steps are wrapped with %w so callers can inspect the cause with errors.Is
// and errors.As.
func (ngm *nvidiaGPUManager) Start() error {
	ngm.defaultDevices = []string{ngm.nvidiaCtlDevicePath, ngm.nvidiaUVMDevicePath}

	// These device nodes are optional: any Stat failure is treated as
	// "not present" and the device is simply left out of the default list.
	for _, deviceName := range []string{nvidiaModesetDevice, nvidiaUVMToolsDevice} {
		devicePath := path.Join(ngm.devDirectory, deviceName)
		if _, err := os.Stat(devicePath); err == nil {
			ngm.defaultDevices = append(ngm.defaultDevices, devicePath)
		}
	}

	if err := ngm.discoverGPUs(); err != nil {
		return err
	}

	if partitionSize := ngm.gpuConfig.GPUPartitionSize; partitionSize != "" {
		if err := ngm.migDeviceManager.Start(partitionSize); err != nil {
			return fmt.Errorf("failed to start mig device manager: %w", err)
		}
	}

	// Everything below applies only to the MPS sharing strategy.
	if ngm.gpuConfig.GPUSharingConfig.GPUSharingStrategy != "mps" {
		return nil
	}

	if err := ngm.isMpsHealthy(); err != nil {
		return fmt.Errorf("NVIDIA MPS is not running on this node: %w", err)
	}

	// Expose the MPS directory to containers at the same path, writable.
	ngm.mountPaths = append(ngm.mountPaths, pluginapi.Mount{
		HostPath:      nvidiaMpsDir,
		ContainerPath: nvidiaMpsDir,
		ReadOnly:      false,
	})

	var err error
	ngm.totalMemPerGPU, err = totalMemPerGPU()
	if err != nil {
		return fmt.Errorf("failed to query total memory available per GPU: %w", err)
	}
	return nil
}