pkg/metrics/init.go (165 lines of code) (raw):

/* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package metrics import ( "sync" "time" ) const ( // Namespace for all metrics inside the scheduler Namespace = "yunikorn" // SchedulerSubsystem - subsystem name used by scheduler SchedulerSubsystem = "scheduler" // EventSubsystem - subsystem name used by event cache EventSubsystem = "event" // MetricNameInvalidByteReplacement byte used to replace invalid bytes in prometheus metric names MetricNameInvalidByteReplacement = '_' ) var once sync.Once var m *Metrics type Metrics struct { scheduler CoreSchedulerMetrics queues map[string]CoreQueueMetrics event CoreEventMetrics runtime GoRuntimeMetrics lock sync.RWMutex } type CoreQueueMetrics interface { IncQueueApplicationsAccepted() IncQueueApplicationsRejected() IncQueueApplicationsRunning() DecQueueApplicationsRunning() IncQueueApplicationsFailed() IncQueueApplicationsCompleted() IncAllocatedContainer() IncReleasedContainer() SetQueueGuaranteedResourceMetrics(resourceName string, value float64) SetQueueMaxResourceMetrics(resourceName string, value float64) SetQueueAllocatedResourceMetrics(resourceName string, value float64) AddQueueAllocatedResourceMetrics(resourceName string, value float64) SetQueuePendingResourceMetrics(resourceName string, value float64) AddQueuePendingResourceMetrics(resourceName string, value float64) SetQueuePreemptingResourceMetrics(resourceName string, value float64) AddQueuePreemptingResourceMetrics(resourceName string, value float64) // Reset all metrics that implement the Reset functionality. // should only be used in tests Reset() } type GoRuntimeMetrics interface { Collect() // Reset all metrics that implement the Reset functionality. // should only be used in tests Reset() } // Declare all core metrics ops in this interface type CoreSchedulerMetrics interface { // Metrics Ops related to ScheduledAllocationSuccesses IncAllocatedContainer() AddAllocatedContainers(value int) getAllocatedContainers() (int, error) // Metrics Ops related to ScheduledAllocationFailures IncRejectedContainer() AddRejectedContainers(value int) // Metrics Ops related to ScheduledAllocationErrors IncSchedulingError() AddSchedulingErrors(value int) GetSchedulingErrors() (int, error) // Metrics Ops related to released allocations IncReleasedContainer() AddReleasedContainers(value int) getReleasedContainers() (int, error) // Metrics Ops related to totalApplicationsAccepted IncTotalApplicationsAccepted() AddTotalApplicationsAccepted(value int) // Metrics Ops related to TotalApplicationsRejected IncTotalApplicationsRejected() AddTotalApplicationsRejected(value int) // Metrics Ops related to TotalApplicationsRunning IncTotalApplicationsRunning() AddTotalApplicationsRunning(value int) DecTotalApplicationsRunning() SubTotalApplicationsRunning(value int) SetTotalApplicationsRunning(value int) getTotalApplicationsRunning() (int, error) // Metrics Ops related to TotalApplicationsFailed IncTotalApplicationsFailed() // Metrics Ops related to TotalApplicationsCompleted IncTotalApplicationsCompleted() AddTotalApplicationsCompleted(value int) DecTotalApplicationsCompleted() SubTotalApplicationsCompleted(value int) SetTotalApplicationsCompleted(value int) // Metrics Ops related to ActiveNodes IncActiveNodes() AddActiveNodes(value int) DecActiveNodes() SubActiveNodes(value int) SetActiveNodes(value int) IncDrainingNodes() DecDrainingNodes() GetDrainingNodes() (int, error) IncUnhealthyNodes() DecUnhealthyNodes() IncTotalDecommissionedNodes() // Metrics Ops related to failedNodes IncFailedNodes() AddFailedNodes(value int) DecFailedNodes() SubFailedNodes(value int) SetFailedNodes(value int) SetNodeResourceUsage(resourceName string, rangeIdx int, value float64) GetFailedNodes() (int, error) // Metrics Ops related to latency change ObserveSchedulingLatency(start time.Time) ObserveNodeSortingLatency(start time.Time) ObserveAppSortingLatency(start time.Time) ObserveQueueSortingLatency(start time.Time) ObserveTryNodeLatency(start time.Time) ObserveTryPreemptionLatency(start time.Time) // Reset all metrics that implement the Reset functionality. // should only be used in tests Reset() } type CoreEventMetrics interface { IncEventsCreated() IncEventsChanneled() IncEventsNotChanneled() IncEventsProcessed() IncEventsStored() IncEventsNotStored() AddEventsCollected(collectedEvents int) // Reset all metrics that implement the Set functionality. // Should only be used in tests Reset() } func init() { once.Do(func() { m = &Metrics{ scheduler: InitSchedulerMetrics(), queues: make(map[string]CoreQueueMetrics), event: initEventMetrics(), lock: sync.RWMutex{}, runtime: initRuntimeMetrics(), } }) } func Reset() { m.lock.Lock() defer m.lock.Unlock() m.scheduler.Reset() m.event.Reset() for _, qm := range m.queues { qm.Reset() } m.runtime.Reset() } func GetSchedulerMetrics() CoreSchedulerMetrics { return m.scheduler } func GetQueueMetrics(name string) CoreQueueMetrics { m.lock.Lock() defer m.lock.Unlock() if qm, ok := m.queues[name]; ok { return qm } queueMetrics := InitQueueMetrics(name) m.queues[name] = queueMetrics return queueMetrics } func GetEventMetrics() CoreEventMetrics { return m.event } func GetRuntimeMetrics() GoRuntimeMetrics { return m.runtime } // Format metric name based on the definition of metric name in prometheus, as per // https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels func formatMetricName(metricName string) string { if len(metricName) == 0 { return metricName } newBytes := make([]byte, len(metricName)) for i := 0; i < len(metricName); i++ { b := metricName[i] if !((b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || b == ':' || (b >= '0' && b <= '9')) { newBytes[i] = MetricNameInvalidByteReplacement } else { newBytes[i] = b } } if '0' <= metricName[0] && metricName[0] <= '9' { return string(MetricNameInvalidByteReplacement) + string(newBytes) } return string(newBytes) }