in pkg/metrics/scheduler.go [72:166]
// InitSchedulerMetrics creates a SchedulerMetrics instance, builds its
// allocation/submission counters, application/node gauges and latency
// histograms, and registers each collector through prometheus.Register.
//
// A failed registration is logged as a warning and otherwise ignored, so the
// function always returns a non-nil *SchedulerMetrics.
func InitSchedulerMetrics() *SchedulerMetrics {
	sm := &SchedulerMetrics{lock: locking.RWMutex{}}
	// Note: This map might be updated at runtime
	sm.nodeResourceUsage = make(map[string]*prometheus.GaugeVec)

	// latencyOpts builds the options shared by every latency histogram; only
	// the metric name and help text differ between them. A fresh bucket slice
	// is created per call so no two histograms share backing storage.
	//
	// NOTE(review): the metric names end in "_milliseconds" while the help
	// texts say "in seconds" — confirm the unit against the Observe call sites.
	latencyOpts := func(name, help string) prometheus.HistogramOpts {
		return prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      name,
			Help:      help,
			// 8 buckets growing by a factor of 10 from 0.0001 (0.1ms if values are seconds).
			Buckets: prometheus.ExponentialBuckets(0.0001, 10, 8),
		}
	}

	// Counters, partitioned by a single label.
	sm.containerAllocation = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "container_allocation_attempt_total",
			Help:      "Total number of attempts to allocate containers. State of the attempt includes `allocated`, `rejected`, `error`, `released`",
		},
		[]string{"state"},
	)
	sm.applicationSubmission = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "application_submission_total",
			Help:      "Total number of application submissions. State of the attempt includes `new`, `accepted` and `rejected`.",
		},
		[]string{"result"},
	)

	// Gauges, partitioned by the "state" label.
	sm.application = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "application_total",
			Help:      "Total number of applications. State of the application includes `running`, `resuming`, `failing`, `completing`, `completed` and `failed`.",
		},
		[]string{"state"},
	)
	sm.node = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: SchedulerSubsystem,
			Name:      "node",
			Help:      "Total number of nodes. State of the node includes `active` and `failed`.",
		},
		[]string{"state"},
	)

	// Latency histograms.
	sm.schedulingLatency = prometheus.NewHistogram(latencyOpts(
		"scheduling_latency_milliseconds",
		"Latency of the main scheduling routine, in seconds.",
	))
	sm.sortingLatency = prometheus.NewHistogramVec(
		latencyOpts(
			"node_sorting_latency_milliseconds",
			"Latency of all nodes sorting, in seconds.",
		),
		[]string{"level"},
	)
	sm.tryNodeLatency = prometheus.NewHistogram(latencyOpts(
		"trynode_latency_milliseconds",
		"Latency of node condition checks for container allocations, such as placement constraints, in seconds.",
	))
	sm.tryPreemptionLatency = prometheus.NewHistogram(latencyOpts(
		"trypreemption_latency_milliseconds",
		"Latency of preemption condition checks for container allocations, in seconds.",
	))

	// Register every collector; a failure is reported but does not stop the
	// remaining collectors from being registered.
	collectors := []prometheus.Collector{
		sm.containerAllocation,
		sm.applicationSubmission,
		sm.application,
		sm.node,
		sm.schedulingLatency,
		sm.sortingLatency,
		sm.tryNodeLatency,
		sm.tryPreemptionLatency,
	}
	for i := range collectors {
		err := prometheus.Register(collectors[i])
		if err == nil {
			continue
		}
		log.Log(log.Metrics).Warn("failed to register metrics collector", zap.Error(err))
	}

	return sm
}