npm/metrics/prometheus-metrics.go (364 lines of code) (raw):

package metrics import ( "net/http" "github.com/Azure/azure-container-networking/log" "github.com/Azure/azure-container-networking/npm/util" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "k8s.io/klog" ) // Constants for metric names and descriptions as well as exported labels for Vector metrics const ( namespace = "npm" controllerPrefix = "controller" numPoliciesName = "num_policies" numPoliciesHelp = "The number of current network policies for this node" addPolicyExecTimeName = "add_policy_exec_time" addPolicyExecTimeHelp = "Execution time in milliseconds for adding a network policy" numACLRulesName = "num_iptables_rules" numACLRulesHelp = "The number of current IPTable rules for this node" addACLRuleExecTimeName = "add_iptables_rule_exec_time" addACLRuleExecTimeHelp = "Execution time in milliseconds for adding an IPTable rule to a chain" numIPSetsName = "num_ipsets" numIPSetsHelp = "The number of current IP sets for this node" addIPSetExecTimeName = "add_ipset_exec_time" addIPSetExecTimeHelp = "Execution time in milliseconds for creating an IP set" numIPSetEntriesName = "num_ipset_entries" numIPSetEntriesHelp = "The total number of entries in every IPSet" ipsetInventoryName = "ipset_counts" ipsetInventoryHelp = "The number of entries in each individual IPSet" setNameLabel = "set_name" setHashLabel = "set_hash" // perf metrics added after v1.4.16 // all these metrics have "npm_controller_" prepended to their name operationLabel = "operation" hadErrorLabel = "had_error" policyExecTimeName = "policy_exec_time" controllerPolicyExecTimeHelp = "Execution time in milliseconds for updating/deleting a network policy. NOTE: for adding, see npm_add_policy_exec_time" podExecTimeName = "pod_exec_time" controllerPodExecTimeHelp = "Execution time in milliseconds for adding/updating/deleting a pod" namespaceExecTimeName = "namespace_exec_time" controllerNamespaceExecTimeHelp = "Execution time in milliseconds for adding/updating/deleting a namespace" quantileMedian float64 = 0.5 deltaMedian float64 = 0.05 quantile90th float64 = 0.9 delta90th float64 = 0.01 quantil99th float64 = 0.99 delta99th float64 = 0.001 ) // Gauge metrics have the methods Inc(), Dec(), and Set(float64) // Summary metrics have the method Observe(float64) // For any Vector metric, you can call With(prometheus.Labels) before the above methods // e.g. SomeGaugeVec.With(prometheus.Labels{label1: val1, label2: val2, ...).Dec() var ( nodeRegistry = prometheus.NewRegistry() clusterRegistry = prometheus.NewRegistry() haveInitialized = false // quantiles e.g. the "0.5 quantile" with delta 0.05 will actually be the phi quantile for some phi in [0.5 - 0.05, 0.5 + 0.05] execTimeQuantiles = map[float64]float64{quantileMedian: deltaMedian, quantile90th: delta90th, quantil99th: delta99th} numPolicies prometheus.Gauge numACLRules prometheus.Gauge addACLRuleExecTime prometheus.Summary numIPSets prometheus.Gauge addIPSetExecTime prometheus.Summary numIPSetEntries prometheus.Gauge ipsetInventory *prometheus.GaugeVec ipsetInventoryLabels = []string{setNameLabel, setHashLabel} // controller perf metrics // used to be a regular Summary in v1.4.16 and below addPolicyExecTime *prometheus.SummaryVec addPolicyExecTimeLabels = []string{hadErrorLabel} // metrics added after v1.4.16 controllerPolicyExecTime *prometheus.SummaryVec controllerPodExecTime *prometheus.SummaryVec controllerNamespaceExecTime *prometheus.SummaryVec controllerExecTimeLabels = []string{operationLabel, hadErrorLabel} // added in v1.5.4 podsWatched prometheus.Gauge ) // windows metrics added in v1.5.4 const ( windowsPrefix = "windows" isNestedLabel = "is_nested" ) // windows metrics added in v1.5.4 var ( listEndpointsLatency prometheus.Histogram getEndpointLatency prometheus.Histogram getNetworkLatency prometheus.Histogram aclLatency *prometheus.HistogramVec setPolicyLatency *prometheus.HistogramVec listEndpointsFailures prometheus.Counter getEndpointFailures prometheus.Counter getNetworkFailures prometheus.Counter aclFailures *prometheus.CounterVec setPolicyFailures *prometheus.CounterVec ) const linuxPrefix = "linux" // linux metrics added in v1.5.5 var ( itpablesRestoreLatency *prometheus.HistogramVec iptablesDeleteLatency prometheus.Histogram iptablesRestoreFailures *prometheus.CounterVec ) type RegistryType string const ( NodeMetrics RegistryType = "node-metrics" ClusterMetrics RegistryType = "cluster-metrics" ) type OperationKind string const ( CreateOp OperationKind = "create" UpdateOp OperationKind = "update" DeleteOp OperationKind = "delete" NoOp OperationKind = "noop" ) func (op OperationKind) isValid() bool { switch op { case CreateOp, UpdateOp, DeleteOp, NoOp: return true default: return false } } // InitializeAll creates the Controller and Daemon Prometheus Metrics. // The metrics will be nil before this method is called. // TODO consider refactoring the functionality of the metrics package into a "Metrics" struct with methods (this would require code changes throughout npm). // Would need to consider how it seems like you can't register a metric twice, even in a separate registry, so you couldn't throw away the Metrics struct and create a new one. func InitializeAll() { if haveInitialized { klog.Infof("metrics have already been initialized") return } initializeDaemonMetrics() initializeControllerMetrics() podsWatched = prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: namespace, Name: "pods_watched", Subsystem: "", Help: "Number of Pods NPM tracks across the cluster including Linux and Windows nodes", }, ) register(podsWatched, "pods_watched", ClusterMetrics) if util.IsWindowsDP() { InitializeWindowsMetrics() klog.Infof("registering windows metrics") register(listEndpointsLatency, "list_endpoints_latency_seconds", NodeMetrics) register(getEndpointLatency, "get_endpoint_latency_seconds", NodeMetrics) register(getNetworkLatency, "get_network_latency_seconds", NodeMetrics) register(aclLatency, "acl_latency_seconds", NodeMetrics) register(setPolicyLatency, "setpolicy_latency_seconds", NodeMetrics) register(listEndpointsFailures, "list_endpoints_failure_total", NodeMetrics) register(getEndpointFailures, "get_endpoint_failure_total", NodeMetrics) register(getNetworkFailures, "get_network_failure_total", NodeMetrics) register(aclFailures, "acl_failure_total", NodeMetrics) register(setPolicyFailures, "setpolicy_failure_total", NodeMetrics) } else { InitializeLinuxMetrics() klog.Infof("registering linux metrics") register(itpablesRestoreLatency, "iptables_restore_latency_seconds", NodeMetrics) register(iptablesDeleteLatency, "iptables_delete_latency_seconds", NodeMetrics) register(iptablesRestoreFailures, "iptables_restore_failure_total", NodeMetrics) } log.Logf("Finished initializing all Prometheus metrics") haveInitialized = true } // ReinitializeAll creates/replaces Prometheus metrics. // This function is intended for UTs. func ReinitializeAll() { klog.Infof("reinitializing Prometheus metrics. This may cause error messages of the form: 'error creating metric' from trying to re-register each metric") haveInitialized = false InitializeAll() } // InitializeWindowsMetrics should NOT be called externally except for resetting metrics for UTs. func InitializeWindowsMetrics() { klog.Infof("initializing Windows metrics. will not register the newly created metrics in this function") listEndpointsLatency = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: namespace, Name: "list_endpoints_latency_seconds", Subsystem: windowsPrefix, Help: "Latency in seconds to list HNS endpoints latency", //nolint:gomnd // default bucket consts Buckets: prometheus.ExponentialBuckets(0.008, 2, 14), // upper bounds of 8 ms to 65 seconds }, ) getEndpointLatency = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: namespace, Name: "get_endpoint_latency_seconds", Subsystem: windowsPrefix, Help: "Latency in seconds to get a single HNS endpoint", //nolint:gomnd // default bucket consts Buckets: prometheus.ExponentialBuckets(0.008, 2, 14), // upper bounds of 8 ms to 65 seconds }, ) getNetworkLatency = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: namespace, Name: "get_network_latency_seconds", Subsystem: windowsPrefix, Help: "Latency in seconds to get the HNS network", //nolint:gomnd // default bucket consts Buckets: prometheus.ExponentialBuckets(0.008, 2, 14), // upper bounds of 8 ms to 65 seconds }, ) aclLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: namespace, Name: "acl_latency_seconds", Subsystem: windowsPrefix, Help: "Latency in seconds to add/update ACLs by operation label", //nolint:gomnd // default bucket consts Buckets: prometheus.ExponentialBuckets(0.008, 2, 14), // upper bounds of 8 ms to 65 seconds }, []string{operationLabel}, ) setPolicyLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: namespace, Name: "setpolicy_latency_seconds", Subsystem: windowsPrefix, Help: "Latency in seconds to add/update/delete SetPolicies by operation & is_nested label", //nolint:gomnd // default bucket consts Buckets: prometheus.ExponentialBuckets(0.008, 2, 14), // upper bounds of 8 ms to 65 seconds }, []string{operationLabel, isNestedLabel}, ) listEndpointsFailures = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: namespace, Name: "list_endpoints_failure_total", Subsystem: windowsPrefix, Help: "Number of failures while listing HNS endpoints", }, ) getEndpointFailures = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: namespace, Name: "get_endpoint_failure_total", Subsystem: windowsPrefix, Help: "Number of failures while getting a single HNS endpoint", }, ) getNetworkFailures = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: namespace, Name: "get_network_failure_total", Subsystem: windowsPrefix, Help: "Number of failures while getting the HNS network", }, ) aclFailures = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Name: "acl_failure_total", Subsystem: windowsPrefix, Help: "Number of failures while adding/updating ACLs by operation label", }, []string{operationLabel}, ) setPolicyFailures = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Name: "setpolicy_failure_total", Subsystem: windowsPrefix, Help: "Number of failures while adding/updating/deleting SetPolicies by operation & is_nested label", }, []string{operationLabel, isNestedLabel}, ) } func InitializeLinuxMetrics() { klog.Infof("initializing Linux metrics. will not register the newly created metrics in this function") itpablesRestoreLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: namespace, Name: "iptables_restore_latency_seconds", Subsystem: linuxPrefix, Help: "Latency in seconds to restore iptables rules by operation label (add/delete NetPol)", //nolint:gomnd // default bucket consts Buckets: prometheus.ExponentialBuckets(0.016, 2, 14), // upper bounds of 16 ms to ~2 minutes }, []string{operationLabel}, ) iptablesDeleteLatency = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: namespace, Name: "iptables_delete_latency_seconds", Subsystem: linuxPrefix, Help: "Latency in seconds to delete an iptables rule", //nolint:gomnd // default bucket consts Buckets: prometheus.ExponentialBuckets(0.016, 2, 14), // upper bounds of 16 ms to ~2 minutes }, ) iptablesRestoreFailures = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Name: "iptables_restore_failure_total", Subsystem: linuxPrefix, Help: "Number of failures while restoring iptable rules by operation label (add/delete NetPol)", }, []string{operationLabel}, ) } // GetHandler returns the HTTP handler for the metrics endpoint func GetHandler(registryType RegistryType) http.Handler { if !haveInitialized { // not sure if this will ever happen, but just in case klog.Infof("in GetHandler, metrics weren't initialized. Initializing now") InitializeAll() } return promhttp.HandlerFor(getRegistry(registryType), promhttp.HandlerOpts{}) } // initializeDaemonMetrics creates non-controller metrics func initializeDaemonMetrics() { // CLUSTER METRICS numACLRules = createClusterGauge(numACLRulesName, numACLRulesHelp) numIPSets = createClusterGauge(numIPSetsName, numIPSetsHelp) numIPSetEntries = createClusterGauge(numIPSetEntriesName, numIPSetEntriesHelp) ipsetInventory = createClusterGaugeVec(ipsetInventoryName, ipsetInventoryHelp, ipsetInventoryLabels) ipsetInventoryMap = make(map[string]int) // NODE METRICS addACLRuleExecTime = createNodeSummary(addACLRuleExecTimeName, addACLRuleExecTimeHelp) addIPSetExecTime = createNodeSummary(addIPSetExecTimeName, addIPSetExecTimeHelp) } // initializeControllerMetrics creates metrics modified by the controller func initializeControllerMetrics() { // CLUSTER METRICS numPolicies = createClusterGauge(numPoliciesName, numPoliciesHelp) // NODE METRICS addPolicyExecTime = createNodeSummaryVec(addPolicyExecTimeName, "", addPolicyExecTimeHelp, addPolicyExecTimeLabels) // perf metrics added after v1.4.16 // all these metrics have "npm_controller_" prepended to their name controllerPolicyExecTime = createControllerExecTimeSummaryVec(policyExecTimeName, controllerPolicyExecTimeHelp) controllerPodExecTime = createControllerExecTimeSummaryVec(podExecTimeName, controllerPodExecTimeHelp) controllerNamespaceExecTime = createControllerExecTimeSummaryVec(namespaceExecTimeName, controllerNamespaceExecTimeHelp) } func register(collector prometheus.Collector, name string, registryType RegistryType) { err := getRegistry(registryType).Register(collector) if err != nil { log.Errorf("Error creating metric %s", name) } else { klog.Infof("registered metric %s to registry %s", name, registryType) } } func getRegistry(registryType RegistryType) *prometheus.Registry { if registryType == NodeMetrics { return nodeRegistry } return clusterRegistry } func createClusterGauge(name, helpMessage string) prometheus.Gauge { gauge := prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: namespace, Name: name, Help: helpMessage, }, ) register(gauge, name, ClusterMetrics) return gauge } func createClusterGaugeVec(name, helpMessage string, labels []string) *prometheus.GaugeVec { gaugeVec := prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, Name: name, Help: helpMessage, }, labels, ) register(gaugeVec, name, ClusterMetrics) return gaugeVec } func createNodeSummary(name, helpMessage string) prometheus.Summary { // uses default observation TTL of 10 minutes summary := prometheus.NewSummary( prometheus.SummaryOpts{ Namespace: namespace, Name: name, Help: helpMessage, Objectives: execTimeQuantiles, }, ) register(summary, name, NodeMetrics) return summary } func createNodeSummaryVec(name, subsystem, helpMessage string, labels []string) *prometheus.SummaryVec { // uses default observation TTL of 10 minutes summary := prometheus.NewSummaryVec( prometheus.SummaryOpts{ Namespace: namespace, Subsystem: subsystem, Name: name, Help: helpMessage, Objectives: execTimeQuantiles, }, labels, ) register(summary, name, NodeMetrics) return summary } func createControllerExecTimeSummaryVec(name, helpMessage string) *prometheus.SummaryVec { return createNodeSummaryVec(name, controllerPrefix, helpMessage, controllerExecTimeLabels) }