registry/datastore/metrics/metrics.go (244 lines of code) (raw):

package metrics import ( "strconv" "time" "github.com/docker/distribution/metrics" "github.com/prometheus/client_golang/prometheus" ) var ( queryDurationHist *prometheus.HistogramVec queryTotal *prometheus.CounterVec timeSince = time.Since // for test purposes only lbPoolSize prometheus.Gauge lbLSNCacheOpDuration *prometheus.HistogramVec lbLSNCacheHits *prometheus.CounterVec lbDNSLookupDurationHist *prometheus.HistogramVec lbPoolEvents *prometheus.CounterVec lbTargets *prometheus.CounterVec lbLagBytes *prometheus.GaugeVec lbLagSeconds *prometheus.HistogramVec ) const ( subsystem = "database" queryNameLabel = "name" errorLabel = "error" replicaLabel = "replica" queryDurationName = "query_duration_seconds" queryDurationDesc = "A histogram of latencies for database queries." queryTotalName = "queries_total" queryTotalDesc = "A counter for database queries." lbPoolSizeName = "lb_pool_size" lbPoolSizeDesc = "A gauge for the current number of replicas in the load balancer pool." lbLSNCacheOpDurationName = "lb_lsn_cache_operation_duration_seconds" lbLSNCacheOpDurationDesc = "A histogram of latencies for database load balancing LSN cache operations." lbLSNCacheOpLabel = "operation" lbLSNCacheOpSet = "set" lbLSNCacheOpGet = "get" lbLSNCacheHitsName = "lb_lsn_cache_hits_total" lbLSNCacheHitsDesc = "A counter for database load balancing LSN cache hits and misses." lbLSNCacheResultLabel = "result" lbLSNCacheResultHit = "hit" lbLSNCacheResultMiss = "miss" lbDNSLookupDurationName = "lb_lookup_seconds" lbDNSLookupDurationDesc = "A histogram of latencies for database load balancing DNS lookups." lookupTypeLabel = "lookup_type" srvLookupType = "srv" hostLookupType = "host" lbPoolEventsName = "lb_pool_events_total" lbPoolEventsDesc = "A counter of replicas added or removed from the database load balancer pool." 
lbPoolEventsEventLabel = "event" lbPoolEventsReplicaAdded = "replica_added" lbPoolEventsReplicaRemoved = "replica_removed" lbTargetsName = "lb_targets_total" lbTargetsDesc = "A counter for primary and replica target elections during database load balancing." lbTargetTypeLabel = "target_type" lbFallbackLabel = "fallback" lbPrimaryType = "primary" lbReplicaType = "replica" lbReasonLabel = "reason" lbFallbackNoCache = "no_cache" lbFallbackNoReplica = "no_replica" lbFallbackError = "error" lbFallbackNotUpToDate = "not_up_to_date" lbReasonSelected = "selected" lbLagBytesName = "lb_lag_bytes" lbLagBytesDesc = "A gauge for the replication lag in bytes for each replica." lbLagSecondsName = "lb_lag_seconds" lbLagSecondsDesc = "A histogram of replication lag in seconds for each replica." ) func init() { queryDurationHist = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: queryDurationName, Help: queryDurationDesc, Buckets: prometheus.DefBuckets, }, []string{queryNameLabel}, ) queryTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: queryTotalName, Help: queryTotalDesc, }, []string{queryNameLabel}, ) lbPoolSize = prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: lbPoolSizeName, Help: lbPoolSizeDesc, }) lbLSNCacheOpDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: lbLSNCacheOpDurationName, Help: lbLSNCacheOpDurationDesc, Buckets: prometheus.DefBuckets, }, []string{lbLSNCacheOpLabel, errorLabel}, ) lbLSNCacheHits = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: lbLSNCacheHitsName, Help: lbLSNCacheHitsDesc, }, []string{lbLSNCacheResultLabel}, ) lbDNSLookupDurationHist = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: 
metrics.NamespacePrefix, Subsystem: subsystem, Name: lbDNSLookupDurationName, Help: lbDNSLookupDurationDesc, Buckets: prometheus.DefBuckets, }, []string{lookupTypeLabel, errorLabel}, ) lbPoolEvents = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: lbPoolEventsName, Help: lbPoolEventsDesc, }, []string{lbPoolEventsEventLabel}, ) lbTargets = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: lbTargetsName, Help: lbTargetsDesc, }, []string{lbTargetTypeLabel, lbFallbackLabel, lbReasonLabel}, ) lbLagBytes = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: lbLagBytesName, Help: lbLagBytesDesc, }, []string{replicaLabel}, ) lbLagSeconds = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: metrics.NamespacePrefix, Subsystem: subsystem, Name: lbLagSecondsName, Help: lbLagSecondsDesc, Buckets: []float64{0.001, 0.01, 0.1, 0.5, 1, 5, 10, 20, 30, 60}, // 1ms to 60s }, []string{replicaLabel}, ) prometheus.MustRegister(queryDurationHist) prometheus.MustRegister(queryTotal) prometheus.MustRegister(lbPoolSize) prometheus.MustRegister(lbLSNCacheOpDuration) prometheus.MustRegister(lbLSNCacheHits) prometheus.MustRegister(lbDNSLookupDurationHist) prometheus.MustRegister(lbPoolEvents) prometheus.MustRegister(lbTargets) prometheus.MustRegister(lbLagBytes) prometheus.MustRegister(lbLagSeconds) } func InstrumentQuery(name string) func() { start := time.Now() return func() { queryTotal.WithLabelValues(name).Inc() queryDurationHist.WithLabelValues(name).Observe(timeSince(start).Seconds()) } } // ReplicaPoolSize captures the current number of replicas in the load balancer pool. 
func ReplicaPoolSize(size int) { lbPoolSize.Set(float64(size)) } func lsnCacheOperation(operation string) func(error) { start := time.Now() return func(err error) { failed := strconv.FormatBool(err != nil) lbLSNCacheOpDuration.WithLabelValues(operation, failed).Observe(timeSince(start).Seconds()) } } // LSNCacheGet captures the duration and result of load balancing LSN get operations. func LSNCacheGet() func(error) { return lsnCacheOperation(lbLSNCacheOpGet) } // LSNCacheSet captures the duration and result of load balancing LSN set operations. func LSNCacheSet() func(error) { return lsnCacheOperation(lbLSNCacheOpSet) } // LSNCacheHit increments the load balancing LSN cache hit counter. func LSNCacheHit() { lbLSNCacheHits.WithLabelValues(lbLSNCacheResultHit).Inc() } // LSNCacheMiss increments the load balancing LSN cache miss counter. func LSNCacheMiss() { lbLSNCacheHits.WithLabelValues(lbLSNCacheResultMiss).Inc() } func dnsLookup(lookupType string) func(error) { start := time.Now() return func(err error) { failed := strconv.FormatBool(err != nil) lbDNSLookupDurationHist.WithLabelValues(lookupType, failed).Observe(timeSince(start).Seconds()) } } // SRVLookup returns a function that can be used to instrument the count and duration of DNS SRV record lookups during // database load balancing. func SRVLookup() func(error) { return dnsLookup(srvLookupType) } // HostLookup returns a function that can be used to instrument the count and duration of DNS host lookups during // database load balancing. func HostLookup() func(error) { return dnsLookup(hostLookupType) } // ReplicaAdded increments the counter for load balancing replicas added to the pool. func ReplicaAdded() { lbPoolEvents.WithLabelValues(lbPoolEventsReplicaAdded).Inc() } // ReplicaRemoved increments the counter for load balancing replicas removed from the pool. 
func ReplicaRemoved() {
	removed := lbPoolEvents.WithLabelValues(lbPoolEventsReplicaRemoved)
	removed.Inc()
}

// PrimaryTarget counts an election of the primary as the intended target
// during load balancing, i.e. not as a fallback (fallback label "false",
// reason "selected").
func PrimaryTarget() {
	elected := lbTargets.WithLabelValues(lbPrimaryType, "false", lbReasonSelected)
	elected.Inc()
}

// PrimaryFallbackNoCache counts an election of the primary as a fallback
// because no LSN cache was available (fallback label "true", reason
// "no_cache").
func PrimaryFallbackNoCache() {
	fellBack := lbTargets.WithLabelValues(lbPrimaryType, "true", lbFallbackNoCache)
	fellBack.Inc()
}

// PrimaryFallbackNoReplica counts an election of the primary as a fallback
// because no replicas were available (fallback label "true", reason
// "no_replica").
func PrimaryFallbackNoReplica() {
	fellBack := lbTargets.WithLabelValues(lbPrimaryType, "true", lbFallbackNoReplica)
	fellBack.Inc()
}

// PrimaryFallbackError counts an election of the primary as a fallback caused
// by an error (fallback label "true", reason "error").
func PrimaryFallbackError() {
	fellBack := lbTargets.WithLabelValues(lbPrimaryType, "true", lbFallbackError)
	fellBack.Inc()
}

// PrimaryFallbackNotUpToDate counts an election of the primary as a fallback
// because the selected replica was not up-to-date with the primary (fallback
// label "true", reason "not_up_to_date").
func PrimaryFallbackNotUpToDate() {
	fellBack := lbTargets.WithLabelValues(lbPrimaryType, "true", lbFallbackNotUpToDate)
	fellBack.Inc()
}

// ReplicaTarget counts a successful election of a replica as the target during
// load balancing (fallback label "false", reason "selected").
func ReplicaTarget() {
	elected := lbTargets.WithLabelValues(lbReplicaType, "false", lbReasonSelected)
	elected.Inc()
}

// ReplicaLagBytes sets the replication lag gauge, in bytes, for the replica
// labelled replicaAddr.
func ReplicaLagBytes(replicaAddr string, bytes float64) {
	lagGauge := lbLagBytes.WithLabelValues(replicaAddr)
	lagGauge.Set(bytes)
}

// ReplicaLagSeconds records the time lag for a replica in seconds.
func ReplicaLagSeconds(replicaAddr string, seconds float64) {
	// One histogram series per replica, keyed by the replica label.
	lagHist := lbLagSeconds.WithLabelValues(replicaAddr)
	lagHist.Observe(seconds)
}