in pkg/gpu/nvidia/metrics/metrics.go [137:161]
// Start queries the NVML driver version, discovers GPU devices, and then
// launches the metrics HTTP server and the metric collection loop in
// background goroutines.
//
// It returns an error if the driver version query or GPU device discovery
// fails; in either case no goroutine is started. A failure of the HTTP
// listener happens asynchronously after Start has returned, so it cannot be
// reported to the caller and is logged at error severity instead.
func (m *MetricServer) Start() error {
	glog.Infoln("Starting metrics server")

	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("failed to query nvml: %v", nvml.ErrorString(ret))
	}
	glog.Infof("nvml initialized successfully. Driver version: %s", driverVersion)

	if err := DiscoverGPUDevices(); err != nil {
		// %w leaves the message text unchanged while preserving the error
		// chain for errors.Is / errors.As in callers.
		return fmt.Errorf("failed to discover GPU devices: %w", err)
	}

	go func() {
		// The handler is registered on http.DefaultServeMux, which is what
		// ListenAndServe serves from when it is given a nil handler.
		http.Handle(m.metricsEndpointPath, promhttp.Handler())

		// ListenAndServe blocks while serving and only returns on failure
		// (for example when the port is already in use). Log that at error
		// severity so an unavailable metrics endpoint is not buried among
		// informational messages.
		if err := http.ListenAndServe(fmt.Sprintf(":%d", m.port), nil); err != nil {
			glog.Errorf("Failed to start metric server: %v", err)
		}
	}()

	go m.collectMetrics()
	return nil
}