in cmd/nvidia_gpu/nvidia_gpu.go [73:151]
// main is the entry point of the NVIDIA GPU device plugin.
//
// It performs, in order:
//  1. flag parsing and construction of the container mount list,
//  2. loading of the optional GPU config file (falling back to the zero-value
//     config when the file cannot be parsed),
//  3. construction of the GPU manager and a wait loop until its device paths
//     check succeeds,
//  4. NVML initialization and start of the GPU manager (retried until it
//     succeeds),
//  5. optional start of the metrics server and of the GPU health checker,
//  6. serving of the device plugin on a timestamped socket name.
//
// A failure to initialize NVML is fatal. A failure to start the metrics
// server or the health checker is logged and makes main return.
func main() {
	flag.Parse()
	// glog buffers its output. Flush on every return path so that the error
	// logged right before an early return below is not lost on exit.
	defer glog.Flush()
	glog.Infoln("device-plugin started")

	mountPaths := []pluginapi.Mount{
		{HostPath: *hostPathPrefix, ContainerPath: *containerPathPrefix, ReadOnly: true},
		{HostPath: *hostVulkanICDPathPrefix, ContainerPath: *containerVulkanICDPathPrefix, ReadOnly: true},
	}

	// The zero-value config is used when no config file is given or when the
	// given file cannot be parsed.
	var gpuConfig gpumanager.GPUConfig
	if *gpuConfigFile != "" {
		glog.Infof("Reading GPU config file: %s", *gpuConfigFile)
		var err error
		gpuConfig, err = parseGPUConfig(*gpuConfigFile)
		if err != nil {
			glog.Infof("Failed to parse GPU config file %s: %v", *gpuConfigFile, err)
			glog.Infof("Falling back to default GPU config.")
			// Discard whatever parseGPUConfig returned alongside the error.
			gpuConfig = gpumanager.GPUConfig{}
		}
	}
	// Best effort: a failure here is logged and startup continues.
	if err := gpuConfig.AddHealthCriticalXid(); err != nil {
		glog.Infof("Failed to Add HealthCriticalXid : %v", err)
	}
	glog.Infof("Using gpu config: %v", gpuConfig)

	ngm := gpumanager.NewNvidiaGPUManager(devDirectory, procDirectory, mountPaths, gpuConfig)

	// Retry until nvidiactl and nvidia-uvm are detected. This is required
	// because Nvidia drivers may not be installed initially.
	for {
		err := ngm.CheckDevicePaths()
		if err == nil {
			break
		}
		// Use non-default level to avoid log spam.
		glog.V(3).Infof("nvidiaGPUManager.CheckDevicePaths() failed: %v", err)
		time.Sleep(5 * time.Second)
	}

	// glog.Fatalf terminates the process, so the Shutdown defer is only
	// registered once Init has succeeded.
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		glog.Fatalf("failed to initialize nvml: %v", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	// Keep retrying every 5 seconds until the GPU manager starts.
	for {
		err := ngm.Start()
		if err == nil {
			break
		}
		glog.Errorf("failed to start GPU device manager: %v", err)
		time.Sleep(5 * time.Second)
	}

	if *enableContainerGPUMetrics {
		if gpuConfig.GPUPartitionSize != "" {
			// A non-empty partition size means multi-instance GPU is in use;
			// the metrics server is not started in that mode.
			glog.Info("Using multi-instance GPU, metrics are not supported.")
		} else {
			glog.Infof("Starting metrics server on port: %d, endpoint path: %s, collection frequency: %d", *gpuMetricsPort, "/metrics", *gpuMetricsCollectionIntervalMs)
			metricServer := metrics.NewMetricServer(*gpuMetricsCollectionIntervalMs, *gpuMetricsPort, "/metrics")
			if err := metricServer.Start(); err != nil {
				// Logged at error level, like the GPU manager start failure
				// above, because this failure ends the process.
				glog.Errorf("Failed to start metric server: %v", err)
				return
			}
			defer metricServer.Stop()
		}
	}

	if *enableHealthMonitoring {
		hc := healthcheck.NewGPUHealthChecker(ngm.ListPhysicalDevices(), ngm.Health, ngm.ListHealthCriticalXid())
		if err := hc.Start(); err != nil {
			glog.Errorf("Failed to start GPU Health Checker: %v", err)
			return
		}
		defer hc.Stop()
	}

	// The socket name carries the current Unix timestamp.
	// NOTE(review): presumably this makes each run use a fresh socket and
	// Serve blocks for the lifetime of the plugin — confirm against
	// gpumanager.
	ngm.Serve(*pluginMountPath, kubeletEndpoint, fmt.Sprintf("%s-%d.sock", pluginEndpointPrefix, time.Now().Unix()))
}