pkg/metrics/queue.go (275 lines of code) (raw):
/*
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"go.uber.org/zap"
"github.com/apache/yunikorn-core/pkg/common/resources"
"github.com/apache/yunikorn-core/pkg/locking"
"github.com/apache/yunikorn-core/pkg/log"
)
// Application state names. The ones referenced in this file are passed as the
// `state` label value of the queue application gauges.
const (
	AppNew        = "new"
	AppAccepted   = "accepted"
	AppRunning    = "running"
	AppFailing    = "failing"
	AppFailed     = "failed"
	AppRejected   = "rejected"
	AppResuming   = "resuming"
	AppCompleting = "completing"
	AppCompleted  = "completed"
	AppExpired    = "expired"
)

// Container state names. ContainerAllocated and ContainerReleased are used as
// the `state` label value of the queue container counter; ContainerAllocated
// is also used as a `state` of the queue resource gauges.
const (
	ContainerReleased  = "released"
	ContainerAllocated = "allocated"
	ContainerRejected  = "rejected"
)

// Queue resource state names, used as the `state` label value of the queue
// resource gauges.
const (
	QueueGuaranteed     = "guaranteed"
	QueueMax            = "max"
	QueuePending        = "pending"
	QueuePreempting     = "preempting"
	QueueMaxRunningApps = "maxRunningApps"
)
// QueueMetrics holds the Prometheus collectors that expose the metrics of a
// single queue, plus the bookkeeping needed to zero out resource types that
// are no longer reported.
type QueueMetrics struct {
// appMetricsLabel is the per-state application gauge; the queue is carried
// in a constant `queue` label.
appMetricsLabel *prometheus.GaugeVec
// Deprecated: to be removed in 1.7.0. Replaced with the queue label metrics.
// Per-state application gauge with the formatted queue name as subsystem.
appMetricsSubsystem *prometheus.GaugeVec
// containerMetrics is the per-state container counter; the formatted queue
// name is used as the metric subsystem.
containerMetrics *prometheus.CounterVec
// resourceMetricsLabel is the gauge keyed by `state` and `resource`; the
// queue is carried in a constant `queue` label.
resourceMetricsLabel *prometheus.GaugeVec
// Deprecated: to be removed in 1.7.0. Replaced with the queue label metrics.
// Resource gauge with the formatted queue name as subsystem.
resourceMetricsSubsystem *prometheus.GaugeVec
// knownResourceTypes is the set of resource names seen so far by
// UpdateQueueResourceMetrics; names missing from a later update are
// reported as zero.
knownResourceTypes map[string]struct{}
// lock is held by Reset and UpdateQueueResourceMetrics, the two methods
// that read or write knownResourceTypes.
lock locking.Mutex
}
// InitQueueMetrics creates the Prometheus collectors for the queue called name
// and registers each of them through prometheus.Register.
//
// Two flavours of the application and resource gauges are created: one that
// identifies the queue with a constant `queue` label, and a deprecated one that
// embeds the formatted queue name in the metric subsystem. The container
// counter only exists in the subsystem flavour.
//
// A registration failure is logged as a warning, including the queue name, and
// does not abort initialisation: the returned QueueMetrics is always non-nil
// and usable, even if some of its collectors are not exported.
func InitQueueMetrics(name string) *QueueMetrics {
	const (
		appHelp      = "Queue application metrics. State of the application includes `new`, `accepted`, `rejected`, `running`, `failing`, `failed`, `resuming`, `completing`, `completed`."
		resourceHelp = "Queue resource metrics. State of the resource includes `guaranteed`, `max`, `allocated`, `pending`, `preempting`, `maxRunningApps`."
	)

	// The raw queue name goes into the `queue` label; the formatted name is
	// what gets embedded into the fully qualified metric name.
	subsystem := formatMetricName(name)

	q := &QueueMetrics{
		appMetricsLabel: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   Namespace,
				Name:        "queue_app",
				ConstLabels: prometheus.Labels{"queue": name},
				Help:        appHelp,
			}, []string{"state"}),
		appMetricsSubsystem: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: Namespace,
				Subsystem: subsystem,
				Name:      "queue_app",
				Help:      appHelp,
			}, []string{"state"}),
		containerMetrics: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: Namespace,
				Subsystem: subsystem,
				Name:      "queue_container",
				Help:      "Queue container metrics. State of the attempt includes `allocated`, `released`.",
			}, []string{"state"}),
		resourceMetricsLabel: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   Namespace,
				Name:        "queue_resource",
				ConstLabels: prometheus.Labels{"queue": name},
				Help:        resourceHelp,
			}, []string{"state", "resource"}),
		resourceMetricsSubsystem: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: Namespace,
				Subsystem: subsystem,
				Name:      "queue_resource",
				Help:      resourceHelp,
			}, []string{"state", "resource"}),
		knownResourceTypes: make(map[string]struct{}),
	}

	collectors := []prometheus.Collector{
		q.appMetricsLabel,
		q.appMetricsSubsystem,
		q.containerMetrics,
		q.resourceMetricsLabel,
		q.resourceMetricsSubsystem,
	}
	for _, collector := range collectors {
		// Registration may fail if the queue name is not valid:
		// metric names must comply with the regex [a-zA-Z_:][a-zA-Z0-9_:]*,
		// while the queue name regex is ^[a-zA-Z0-9_-]{1,64}$.
		// Log the queue name so the warning can be traced back to its queue.
		if err := prometheus.Register(collector); err != nil {
			log.Log(log.Metrics).Warn("failed to register metrics collector",
				zap.String("queue", name),
				zap.Error(err))
		}
	}
	return q
}
// UnregisterMetrics passes every collector owned by this queue to
// prometheus.Unregister. The result of each call is not inspected.
func (m *QueueMetrics) UnregisterMetrics() {
	for _, collector := range []prometheus.Collector{
		m.appMetricsLabel,
		m.appMetricsSubsystem,
		m.containerMetrics,
		m.resourceMetricsLabel,
		m.resourceMetricsSubsystem,
	} {
		prometheus.Unregister(collector)
	}
}
// incQueueApplications adds one to the application gauge for state, first on
// the queue-labelled vector and then on the deprecated subsystem vector.
func (m *QueueMetrics) incQueueApplications(state string) {
	gauges := [...]*prometheus.GaugeVec{m.appMetricsLabel, m.appMetricsSubsystem}
	for _, vec := range gauges {
		vec.WithLabelValues(state).Inc()
	}
}
// decQueueApplications subtracts one from the application gauge for state,
// first on the queue-labelled vector and then on the deprecated subsystem vector.
func (m *QueueMetrics) decQueueApplications(state string) {
	gauges := [...]*prometheus.GaugeVec{m.appMetricsLabel, m.appMetricsSubsystem}
	for _, vec := range gauges {
		vec.WithLabelValues(state).Dec()
	}
}
// setQueueResource stores value in the resource gauge identified by the
// (state, resourceName) label pair, on both the queue-labelled vector and the
// deprecated subsystem vector. It does not take the lock and does not touch
// knownResourceTypes.
func (m *QueueMetrics) setQueueResource(state string, resourceName string, value float64) {
	labelled := m.resourceMetricsLabel.WithLabelValues(state, resourceName)
	labelled.Set(value)

	legacy := m.resourceMetricsSubsystem.WithLabelValues(state, resourceName)
	legacy.Set(value)
}
// Reset clears the application and resource gauge vectors and forgets all
// known resource types, under the lock. The container counter is left as is.
func (m *QueueMetrics) Reset() {
	m.lock.Lock()
	defer m.lock.Unlock()

	gauges := [...]*prometheus.GaugeVec{
		m.appMetricsLabel,
		m.appMetricsSubsystem,
		m.resourceMetricsLabel,
		m.resourceMetricsSubsystem,
	}
	for _, vec := range gauges {
		vec.Reset()
	}
	m.knownResourceTypes = map[string]struct{}{}
}
// IncQueueApplicationsRunning increments the queue's application gauges for
// the `running` state.
func (m *QueueMetrics) IncQueueApplicationsRunning() {
m.incQueueApplications(AppRunning)
}
// DecQueueApplicationsRunning decrements the queue's application gauges for
// the `running` state.
func (m *QueueMetrics) DecQueueApplicationsRunning() {
m.decQueueApplications(AppRunning)
}
// GetQueueApplicationsRunning reports the current value of the `running`
// application gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsRunning() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppRunning).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncQueueApplicationsNew increments the queue's application gauges for the
// `new` state.
func (m *QueueMetrics) IncQueueApplicationsNew() {
m.incQueueApplications(AppNew)
}
// DecQueueApplicationsNew decrements the queue's application gauges for the
// `new` state.
func (m *QueueMetrics) DecQueueApplicationsNew() {
m.decQueueApplications(AppNew)
}
// GetQueueApplicationsNew reports the current value of the `new` application
// gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsNew() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppNew).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncQueueApplicationsAccepted increments the queue's application gauges for
// the `accepted` state.
func (m *QueueMetrics) IncQueueApplicationsAccepted() {
m.incQueueApplications(AppAccepted)
}
// DecQueueApplicationsAccepted decrements the queue's application gauges for
// the `accepted` state.
func (m *QueueMetrics) DecQueueApplicationsAccepted() {
m.decQueueApplications(AppAccepted)
}
// GetQueueApplicationsAccepted reports the current value of the `accepted`
// application gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsAccepted() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppAccepted).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncQueueApplicationsRejected increments the queue's application gauges for
// the `rejected` state.
func (m *QueueMetrics) IncQueueApplicationsRejected() {
m.incQueueApplications(AppRejected)
}
// GetQueueApplicationsRejected reports the current value of the `rejected`
// application gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsRejected() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppRejected).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncQueueApplicationsResuming increments the queue's application gauges for
// the `resuming` state.
func (m *QueueMetrics) IncQueueApplicationsResuming() {
m.incQueueApplications(AppResuming)
}
// DecQueueApplicationsResuming decrements the queue's application gauges for
// the `resuming` state.
func (m *QueueMetrics) DecQueueApplicationsResuming() {
m.decQueueApplications(AppResuming)
}
// GetQueueApplicationsResuming reports the current value of the `resuming`
// application gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsResuming() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppResuming).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncQueueApplicationsFailing increments the queue's application gauges for
// the `failing` state.
func (m *QueueMetrics) IncQueueApplicationsFailing() {
m.incQueueApplications(AppFailing)
}
// DecQueueApplicationsFailing decrements the queue's application gauges for
// the `failing` state.
func (m *QueueMetrics) DecQueueApplicationsFailing() {
m.decQueueApplications(AppFailing)
}
// GetQueueApplicationsFailing reports the current value of the `failing`
// application gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsFailing() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppFailing).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncQueueApplicationsFailed increments the queue's application gauges for
// the `failed` state.
func (m *QueueMetrics) IncQueueApplicationsFailed() {
m.incQueueApplications(AppFailed)
}
// GetQueueApplicationsFailed reports the current value of the `failed`
// application gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsFailed() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppFailed).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncQueueApplicationsCompleting increments the queue's application gauges for
// the `completing` state.
func (m *QueueMetrics) IncQueueApplicationsCompleting() {
m.incQueueApplications(AppCompleting)
}
// DecQueueApplicationsCompleting decrements the queue's application gauges for
// the `completing` state.
func (m *QueueMetrics) DecQueueApplicationsCompleting() {
m.decQueueApplications(AppCompleting)
}
// GetQueueApplicationsCompleting reports the current value of the `completing`
// application gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsCompleting() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppCompleting).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncQueueApplicationsCompleted increments the queue's application gauges for
// the `completed` state.
func (m *QueueMetrics) IncQueueApplicationsCompleted() {
m.incQueueApplications(AppCompleted)
}
// GetQueueApplicationsCompleted reports the current value of the `completed`
// application gauge, read from the queue-labelled vector and converted to int.
// If the gauge cannot be written out, -1 and the error are returned.
func (m *QueueMetrics) GetQueueApplicationsCompleted() (int, error) {
	var snapshot dto.Metric
	if err := m.appMetricsLabel.WithLabelValues(AppCompleted).Write(&snapshot); err != nil {
		return -1, err
	}
	return int(*snapshot.Gauge.Value), nil
}
// IncAllocatedContainer adds one to the `allocated` container counter.
func (m *QueueMetrics) IncAllocatedContainer() {
	allocated := m.containerMetrics.WithLabelValues(ContainerAllocated)
	allocated.Inc()
}
// IncReleasedContainer adds one to the `released` container counter.
func (m *QueueMetrics) IncReleasedContainer() {
	released := m.containerMetrics.WithLabelValues(ContainerReleased)
	released.Inc()
}
// AddReleasedContainers adds value to the `released` container counter.
//
// Prometheus counters only go up and Counter.Add panics when given a negative
// number, so a negative value is logged as a warning and ignored rather than
// being allowed to crash the caller.
func (m *QueueMetrics) AddReleasedContainers(value int) {
	if value < 0 {
		log.Log(log.Metrics).Warn("ignoring negative released container count",
			zap.Int("value", value))
		return
	}
	m.containerMetrics.WithLabelValues(ContainerReleased).Add(float64(value))
}
// UpdateQueueResourceMetrics publishes newResources under the given state and
// keeps the set of known resource types up to date, all under the lock.
//
// Every entry of newResources is written to the resource gauges and recorded
// as known. Any previously known resource type that is absent from
// newResources is then written as zero for this state, so a stale value never
// lingers once a resource drops out of the collection.
func (m *QueueMetrics) UpdateQueueResourceMetrics(state string, newResources map[string]resources.Quantity) {
	m.lock.Lock()
	defer m.lock.Unlock()

	// Publish the current values and remember each resource type.
	for name, quantity := range newResources {
		m.setQueueResource(state, name, float64(quantity))
		m.knownResourceTypes[name] = struct{}{}
	}

	// Zero out the known resource types that this update no longer carries.
	for name := range m.knownResourceTypes {
		if _, present := newResources[name]; present {
			continue
		}
		m.setQueueResource(state, name, 0)
	}
}
// SetQueueAllocatedResourceMetrics sets the `allocated` resource gauges of
// resourceName to value. Unlike UpdateQueueResourceMetrics, it neither takes
// the lock nor records resourceName in knownResourceTypes.
func (m *QueueMetrics) SetQueueAllocatedResourceMetrics(resourceName string, value float64) {
m.setQueueResource(ContainerAllocated, resourceName, value)
}
// SetQueuePendingResourceMetrics sets the `pending` resource gauges of
// resourceName to value. Unlike UpdateQueueResourceMetrics, it neither takes
// the lock nor records resourceName in knownResourceTypes.
func (m *QueueMetrics) SetQueuePendingResourceMetrics(resourceName string, value float64) {
m.setQueueResource(QueuePending, resourceName, value)
}
// SetQueuePreemptingResourceMetrics sets the `preempting` resource gauges of
// resourceName to value. Unlike UpdateQueueResourceMetrics, it neither takes
// the lock nor records resourceName in knownResourceTypes.
func (m *QueueMetrics) SetQueuePreemptingResourceMetrics(resourceName string, value float64) {
m.setQueueResource(QueuePreempting, resourceName, value)
}
// SetQueueMaxRunningAppsMetrics publishes value through the resource gauges
// under the `maxRunningApps` state, using the fixed resource label "apps".
func (m *QueueMetrics) SetQueueMaxRunningAppsMetrics(value uint64) {
m.setQueueResource(QueueMaxRunningApps, "apps", float64(value))
}