in partition_gpu/partition_gpu.go [129:204]
func main() {
flag.Parse()
if _, err := os.Stat(*gpuConfigFile); os.IsNotExist(err) {
glog.Infof("No GPU config file given, nothing to do.")
return
}
gpuConfig, err := parseGPUConfig(*gpuConfigFile)
if err != nil {
glog.Infof("failed to parse GPU config file, taking no action.")
return
}
glog.Infof("Using gpu config: %v", gpuConfig)
if gpuConfig.GPUPartitionSize == "" {
glog.Infof("No GPU partitions are required, exiting")
return
}
if _, err := os.Stat(*nvidiaSmiPath); os.IsNotExist(err) {
glog.Errorf("nvidia-smi path %s not found: %v", *nvidiaSmiPath, err)
os.Exit(1)
}
migModeEnabled, err := currentMigMode()
if err != nil {
glog.Errorf("Failed to check if MIG mode is enabled: %v", err)
os.Exit(1)
}
if !migModeEnabled {
glog.Infof("MIG mode is not enabled. Enabling now.")
glog.Infof("Checking the GPU type now.")
gpuType, err := checkGpuType()
if err != nil {
glog.Errorf("Failed to check GPU Type: %v", err)
os.Exit(1)
}
glog.Infof("Got GPU type used: %s", gpuType)
if err := enableMigMode(); err != nil {
glog.Errorf("Failed to enable MIG mode: %v", err)
os.Exit(1)
}
// On NVIDIA Ampere GPUs, when MIG mode is enabled, the driver will attempt to reset the GPU so that MIG mode can take effect.
// Starting with the Hopper generation of GPUs, enabling MIG mode no longer requires a GPU reset to take effect.
// See https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#enable-mig-mode for more information
if gpuType == Nvidia40gbA100 || gpuType == Nvidia80gbA100 {
glog.Infof("Rebooting node to enable MIG mode")
if err := rebootNode(); err != nil {
glog.Errorf("Failed to trigger node reboot after enabling MIG mode: %v", err)
}
// Exit, since we cannot proceed until node has rebooted, for MIG changes to take effect on NVIDIA Ampere GPUs.
os.Exit(1)
}
}
glog.Infof("MIG mode is enabled on all GPUs, proceeding to create GPU partitions.")
glog.Infof("Cleaning up any existing GPU partitions")
if err := cleanupAllGPUPartitions(); err != nil {
glog.Errorf("Failed to cleanup GPU partitions: %v", err)
os.Exit(1)
}
glog.Infof("Creating new GPU partitions")
if err := createGPUPartitions(gpuConfig.GPUPartitionSize); err != nil {
glog.Errorf("Failed to create GPU partitions: %v", err)
os.Exit(1)
}
glog.Infof("Running %s", *nvidiaSmiPath)
out, err := exec.Command(*nvidiaSmiPath).Output()
if err != nil {
glog.Errorf("Failed to run nvidia-smi, output: %s, error: %v", string(out), err)
}
glog.Infof("Output:\n %s", string(out))
}