func()

in pkg/gpu/nvidia/manager.go [376:410]


// Start initializes the GPU manager's device and mount state.
//
// It performs the following steps in order:
//  1. Rebuilds ngm.defaultDevices from the control and UVM device paths,
//     adding the modeset and UVM-tools device nodes when they can be stat'ed
//     under ngm.devDirectory.
//  2. Runs GPU discovery.
//  3. Starts the MIG device manager when a GPU partition size is configured.
//  4. When the GPU sharing strategy is "mps", verifies MPS health, records the
//     total memory per GPU, and adds a read-write mount for nvidiaMpsDir.
//
// It returns the first error encountered. Errors from the MIG and MPS steps
// are wrapped with %w so callers can inspect the cause with errors.Is/As.
func (ngm *nvidiaGPUManager) Start() error {
	// Reset rather than append, so repeated calls do not accumulate entries.
	ngm.defaultDevices = []string{ngm.nvidiaCtlDevicePath, ngm.nvidiaUVMDevicePath}

	// These device nodes are optional: they are included only when os.Stat
	// succeeds; any stat error is deliberately treated as "not available".
	for _, device := range []string{nvidiaModesetDevice, nvidiaUVMToolsDevice} {
		devicePath := path.Join(ngm.devDirectory, device)
		if _, err := os.Stat(devicePath); err == nil {
			ngm.defaultDevices = append(ngm.defaultDevices, devicePath)
		}
	}

	if err := ngm.discoverGPUs(); err != nil {
		return err
	}

	if ngm.gpuConfig.GPUPartitionSize != "" {
		if err := ngm.migDeviceManager.Start(ngm.gpuConfig.GPUPartitionSize); err != nil {
			return fmt.Errorf("failed to start mig device manager: %w", err)
		}
	}

	if ngm.gpuConfig.GPUSharingConfig.GPUSharingStrategy == "mps" {
		if err := ngm.isMpsHealthy(); err != nil {
			return fmt.Errorf("NVIDIA MPS is not running on this node: %w", err)
		}
		memPerGPU, err := totalMemPerGPU()
		if err != nil {
			return fmt.Errorf("failed to query total memory available per GPU: %w", err)
		}
		// Mutate manager state only after every fallible MPS step has
		// succeeded, so a failed Start that is later retried does not leave a
		// duplicate MPS mount behind.
		ngm.totalMemPerGPU = memPerGPU
		ngm.mountPaths = append(ngm.mountPaths, pluginapi.Mount{HostPath: nvidiaMpsDir, ContainerPath: nvidiaMpsDir, ReadOnly: false})
	}
	return nil
}