func InitSchedulerMetrics()

in pkg/metrics/scheduler.go [72:166]


// InitSchedulerMetrics creates a SchedulerMetrics, builds every scheduler-level
// Prometheus collector it owns and registers those collectors through
// prometheus.Register.
//
// A failed registration is logged as a warning and otherwise ignored, so the
// function always returns a non-nil, fully populated *SchedulerMetrics.
//
// NOTE(review): the latency histogram names end in "_milliseconds" while their
// Help text and the bucket comment describe seconds. The names are kept as-is
// because renaming a metric breaks existing dashboards and alerts; confirm the
// unit actually passed to Observe at the call sites before changing either.
func InitSchedulerMetrics() *SchedulerMetrics {
	s := &SchedulerMetrics{
		lock: locking.RWMutex{},
	}

	// Per-node gauges are not created here; the map starts empty.
	s.nodeResourceUsage = make(map[string]*prometheus.GaugeVec) // Note: This map might be updated at runtime

	// latencyBuckets returns the bucket layout shared by all latency
	// histograms: 8 buckets growing by a factor of 10 from 0.0001 up to 1000.
	// A fresh slice is built on every call so no two histograms share one.
	latencyBuckets := func() []float64 {
		return prometheus.ExponentialBuckets(0.0001, 10, 8) // start from 0.1ms
	}

	s.containerAllocation = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "container_allocation_attempt_total",
			Help:      "Total number of attempts to allocate containers. State of the attempt includes `allocated`, `rejected`, `error`, `released`",
		}, []string{"state"})

	// The label of this counter is "result", so the Help text talks about the
	// result of a submission rather than the state of an attempt.
	s.applicationSubmission = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "application_submission_total",
			Help:      "Total number of application submissions. Result of the submission includes `new`, `accepted` and `rejected`.",
		}, []string{"result"})

	s.application = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "application_total",
			Help:      "Total number of applications. State of the application includes `running`, `resuming`, `failing`, `completing`, `completed` and `failed`.",
		}, []string{"state"})

	s.node = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "node",
			Help:      "Total number of nodes. State of the node includes `active` and `failed`.",
		}, []string{"state"})

	s.schedulingLatency = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "scheduling_latency_milliseconds",
			Help:      "Latency of the main scheduling routine, in seconds.",
			Buckets:   latencyBuckets(),
		},
	)

	s.sortingLatency = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "node_sorting_latency_milliseconds",
			Help:      "Latency of all nodes sorting, in seconds.",
			Buckets:   latencyBuckets(),
		}, []string{"level"})

	s.tryNodeLatency = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "trynode_latency_milliseconds",
			Help:      "Latency of node condition checks for container allocations, such as placement constraints, in seconds.",
			Buckets:   latencyBuckets(),
		},
	)

	s.tryPreemptionLatency = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "trypreemption_latency_milliseconds",
			Help:      "Latency of preemption condition checks for container allocations, in seconds.",
			Buckets:   latencyBuckets(),
		},
	)

	// Register the metrics. Registration is best-effort: an error (for example
	// a collector that is already registered) is logged and the remaining
	// collectors are still attempted.
	collectors := []prometheus.Collector{
		s.containerAllocation,
		s.applicationSubmission,
		s.application,
		s.node,
		s.schedulingLatency,
		s.sortingLatency,
		s.tryNodeLatency,
		s.tryPreemptionLatency,
	}
	for _, collector := range collectors {
		if err := prometheus.Register(collector); err != nil {
			log.Log(log.Metrics).Warn("failed to register metrics collector", zap.Error(err))
		}
	}
	return s
}