func main()

in cmd/nvidia_gpu/nvidia_gpu.go [73:151]


// main is the entry point of the NVIDIA GPU device plugin.
//
// It performs the following steps in order:
//  1. Parses command-line flags and builds the two read-only host-to-container
//     mounts (driver install path and Vulkan ICD path).
//  2. Loads the optional GPU config file, falling back to the zero-value
//     config when the file cannot be parsed, then calls AddHealthCriticalXid
//     on the resulting config.
//  3. Polls every 5 seconds until ngm.CheckDevicePaths succeeds.
//  4. Initializes NVML (fatal on failure) and calls ngm.Start, retrying every
//     5 seconds until it succeeds.
//  5. Optionally starts the container GPU metrics server and the GPU health
//     checker, depending on the corresponding flags.
//  6. Hands control to ngm.Serve.
//
// Recoverable failures are logged at warning severity and failures that end
// the process at error severity, so they are visible at glog's default stderr
// threshold instead of being buried in the info log.
func main() {
	flag.Parse()
	// glog buffers file output; flush on every return path so that the message
	// explaining an early exit is not lost. Deferred first so it runs last,
	// after the other deferred cleanups have had a chance to log.
	defer glog.Flush()

	glog.Infoln("device-plugin started")
	mountPaths := []pluginapi.Mount{
		{HostPath: *hostPathPrefix, ContainerPath: *containerPathPrefix, ReadOnly: true},
		{HostPath: *hostVulkanICDPathPrefix, ContainerPath: *containerVulkanICDPathPrefix, ReadOnly: true},
	}

	var gpuConfig gpumanager.GPUConfig
	if *gpuConfigFile != "" {
		glog.Infof("Reading GPU config file: %s", *gpuConfigFile)
		var err error
		gpuConfig, err = parseGPUConfig(*gpuConfigFile)
		if err != nil {
			// A broken config file is not fatal: discard whatever was
			// returned and continue with the zero-value config.
			glog.Warningf("Failed to parse GPU config file %s: %v", *gpuConfigFile, err)
			glog.Warningf("Falling back to default GPU config.")
			gpuConfig = gpumanager.GPUConfig{}
		}
	}
	// Best effort: a failure here is reported but does not stop startup.
	if err := gpuConfig.AddHealthCriticalXid(); err != nil {
		glog.Warningf("Failed to Add HealthCriticalXid : %v", err)
	}

	glog.Infof("Using gpu config: %v", gpuConfig)
	ngm := gpumanager.NewNvidiaGPUManager(devDirectory, procDirectory, mountPaths, gpuConfig)

	// Retry until nvidiactl and nvidia-uvm are detected. This is required
	// because Nvidia drivers may not be installed initially.
	for {
		err := ngm.CheckDevicePaths()
		if err == nil {
			break
		}
		// Use non-default level to avoid log spam.
		glog.V(3).Infof("nvidiaGPUManager.CheckDevicePaths() failed: %v", err)
		time.Sleep(5 * time.Second)
	}

	if ret := nvml.Init(); ret != nvml.SUCCESS {
		glog.Fatalf("failed to initialize nvml: %v", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	// Keep retrying until the GPU device manager starts successfully.
	for {
		err := ngm.Start()
		if err == nil {
			break
		}

		glog.Errorf("failed to start GPU device manager: %v", err)
		time.Sleep(5 * time.Second)
	}

	if *enableContainerGPUMetrics {
		if gpuConfig.GPUPartitionSize != "" {
			// A non-empty partition size means multi-instance GPU is in use;
			// the metrics server is skipped in that mode.
			glog.Info("Using multi-instance GPU, metrics are not supported.")
		} else {
			glog.Infof("Starting metrics server on port: %d, endpoint path: %s, collection frequency: %d", *gpuMetricsPort, "/metrics", *gpuMetricsCollectionIntervalMs)
			metricServer := metrics.NewMetricServer(*gpuMetricsCollectionIntervalMs, *gpuMetricsPort, "/metrics")
			if err := metricServer.Start(); err != nil {
				glog.Errorf("Failed to start metric server: %v", err)
				// Return (rather than Fatalf) so the deferred cleanups run.
				// NOTE(review): this exits with status 0 — confirm that is intended.
				return
			}
			defer metricServer.Stop()
		}
	}

	if *enableHealthMonitoring {
		hc := healthcheck.NewGPUHealthChecker(ngm.ListPhysicalDevices(), ngm.Health, ngm.ListHealthCriticalXid())
		if err := hc.Start(); err != nil {
			glog.Errorf("Failed to start GPU Health Checker: %v", err)
			// Return (rather than Fatalf) so the deferred cleanups run.
			// NOTE(review): this exits with status 0 — confirm that is intended.
			return
		}
		defer hc.Stop()
	}

	// The socket name carries the current Unix timestamp as a suffix.
	// NOTE(review): Serve presumably blocks for the lifetime of the plugin — confirm.
	ngm.Serve(*pluginMountPath, kubeletEndpoint, fmt.Sprintf("%s-%d.sock", pluginEndpointPrefix, time.Now().Unix()))
}