# mixins/kubernetes/tests.yaml
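# Sample values below use promtool's expanding series notation: 'a+bxN' emits
# N+1 samples starting at a and stepping by +b per interval ('a-bxN' steps
# down, 'axN' repeats a), '_' is a missing sample, and 'stale' injects a
# staleness marker.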

rule_files:
  - prometheus_alerts.yaml
  - prometheus_rules.yaml

evaluation_interval: 1m

tests:
  # PersistentVolume disk space
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 512 64 16'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 1024 1024 1024'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '16 64 512 1024'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadWriteOnce", service="kube-state-metrics"}'
        values: '1 1 1 1'
    alert_rule_test:
      - eval_time: 1m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 2m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 3m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 4m
        alertname: KubePersistentVolumeFillingUp
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              persistentvolumeclaim: somepvc
              severity: critical
            exp_annotations:
              summary: "PersistentVolume is filling up."
              description: 'The PersistentVolume claimed by somepvc in Namespace monitoring on Cluster kubernetes is only 1.562% free.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup

  # Don't alert when PVC access_mode is ReadOnlyMany
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 512 64 16'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 1024 1024 1024'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '16 64 512 1024'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadOnlyMany", service="kube-state-metrics"}'
        values: '1 1 1 1'
    alert_rule_test:
      - eval_time: 1m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 2m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 3m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 4m
        alertname: KubePersistentVolumeFillingUp

  # Block volume mounts can report 0 for the kubelet_volume_stats_used_bytes metric but it shouldn't trigger the KubePersistentVolumeFillingUp alert.
  # See https://github.com/kubernetes/kubernetes/commit/b997e0e4d6ccbead435a47d6ac75b0db3d17252f for details.
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 512 64 16'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 1024 1024 1024'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '0 0 0 0'
    alert_rule_test:
      - eval_time: 1m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 2m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 3m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 4m
        alertname: KubePersistentVolumeFillingUp

  # Don't alert when PVC has been labelled as fully utilised
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 512 64 16'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 1024 1024 1024'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '16 64 512 1024'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadWriteOnce", service="kube-state-metrics"}'
        values: '1 1 1 1'
      - series: 'kube_persistentvolumeclaim_labels{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc",label_excluded_from_alerts="true"}'
        values: '1 1 1 1'
    alert_rule_test:
      - eval_time: 1m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 2m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 3m
        alertname: KubePersistentVolumeFillingUp
      - eval_time: 4m
        alertname: KubePersistentVolumeFillingUp

  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024+10x61'
    alert_rule_test:
      - eval_time: 1h
        alertname: KubePersistentVolumeFillingUp
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              persistentvolumeclaim: somepvc
              severity: critical
            exp_annotations:
              summary: "PersistentVolume is filling up."
              description: 'The PersistentVolume claimed by somepvc in Namespace monitoring on Cluster kubernetes is only 1.294% free.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup

  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024+10x61'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadWriteOnce", service="kube-state-metrics"}'
        values: '1x61'
    alert_rule_test:
      - eval_time: 61m
        alertname: KubePersistentVolumeFillingUp
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              persistentvolumeclaim: somepvc
              severity: warning
            exp_annotations:
              summary: "PersistentVolume is filling up."
              description: 'Based on recent sampling, the PersistentVolume claimed by somepvc in Namespace monitoring on Cluster kubernetes is expected to fill up within four days. Currently 1.263% is available.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              persistentvolumeclaim: somepvc
              severity: critical
            exp_annotations:
              summary: "PersistentVolume is filling up."
              description: 'The PersistentVolume claimed by somepvc in Namespace monitoring on Cluster kubernetes is only 1.263% free.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup

  # Block volume mounts can report 0 for the kubelet_volume_stats_used_bytes metric but it shouldn't trigger the KubePersistentVolumeFillingUp alert.
  # See https://github.com/kubernetes/kubernetes/commit/b997e0e4d6ccbead435a47d6ac75b0db3d17252f for details.
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '0x61'
    alert_rule_test:
      - eval_time: 61m
        alertname: KubePersistentVolumeFillingUp

  # Don't alert when PVC access_mode is ReadOnlyMany
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1x61'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadOnlyMany", service="kube-state-metrics"}'
        values: '1x61'
    alert_rule_test:
      - eval_time: 61m
        alertname: KubePersistentVolumeFillingUp

  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_available_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_capacity_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_used_bytes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024+10x61'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadWriteOnce", service="kube-state-metrics"}'
        values: '1x61'
      - series: 'kube_persistentvolumeclaim_labels{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc",label_excluded_from_alerts="true"}'
        values: '1x61'
    alert_rule_test:
      - eval_time: 61m
        alertname: KubePersistentVolumeFillingUp

  # PersistentVolume inodes
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 512 64 16'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 1024 1024 1024'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '16 64 512 1024'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadWriteOnce", service="kube-state-metrics"}'
        values: '1 1 1 1'
    alert_rule_test:
      - eval_time: 1m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 2m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 3m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 4m
        alertname: KubePersistentVolumeInodesFillingUp
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              persistentvolumeclaim: somepvc
              severity: critical
            exp_annotations:
              summary: "PersistentVolumeInodes are filling up."
              description: 'The PersistentVolume claimed by somepvc in Namespace monitoring on Cluster kubernetes only has 1.562% free inodes.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeinodesfillingup

  # Don't alert when PVC access_mode is ReadOnlyMany
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 512 64 16'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 1024 1024 1024'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '16 64 512 1024'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadOnlyMany", service="kube-state-metrics"}'
        values: '1 1 1 1'
    alert_rule_test:
      - eval_time: 1m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 2m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 3m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 4m
        alertname: KubePersistentVolumeInodesFillingUp

  # Block volume mounts can report 0 for the kubelet_volume_stats_inodes_used metric but it shouldn't trigger the KubePersistentVolumeInodesFillingUp alert.
  # See https://github.com/kubernetes/kubernetes/commit/b997e0e4d6ccbead435a47d6ac75b0db3d17252f for details.
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 512 64 16'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 1024 1024 1024'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '0 0 0 0'
    alert_rule_test:
      - eval_time: 1m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 2m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 3m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 4m
        alertname: KubePersistentVolumeInodesFillingUp

  # Don't alert when PVC has been labelled as fully utilised
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 512 64 16'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024 1024 1024 1024'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '16 64 512 1024'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadWriteOnce", service="kube-state-metrics"}'
        values: '1 1 1 1'
      - series: 'kube_persistentvolumeclaim_labels{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc",label_excluded_from_alerts="true"}'
        values: '1 1 1 1'
    alert_rule_test:
      - eval_time: 1m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 2m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 3m
        alertname: KubePersistentVolumeInodesFillingUp
      - eval_time: 4m
        alertname: KubePersistentVolumeInodesFillingUp

  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024+10x61'
    alert_rule_test:
      - eval_time: 1h
        alertname: KubePersistentVolumeInodesFillingUp
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              persistentvolumeclaim: somepvc
              severity: critical
            exp_annotations:
              summary: "PersistentVolumeInodes are filling up."
              description: 'The PersistentVolume claimed by somepvc in Namespace monitoring on Cluster kubernetes only has 1.294% free inodes.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeinodesfillingup

  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024+10x61'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadWriteOnce", service="kube-state-metrics"}'
        values: '1x61'
    alert_rule_test:
      - eval_time: 61m
        alertname: KubePersistentVolumeInodesFillingUp
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              persistentvolumeclaim: somepvc
              severity: warning
            exp_annotations:
              summary: "PersistentVolumeInodes are filling up."
              description: 'Based on recent sampling, the PersistentVolume claimed by somepvc in Namespace monitoring on Cluster kubernetes is expected to run out of inodes within four days. Currently 1.263% of its inodes are free.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeinodesfillingup
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              persistentvolumeclaim: somepvc
              severity: critical
            exp_annotations:
              summary: "PersistentVolumeInodes are filling up."
              description: 'The PersistentVolume claimed by somepvc in Namespace monitoring on Cluster kubernetes only has 1.263% free inodes.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeinodesfillingup

  # Block volume mounts can report 0 for the kubelet_volume_stats_inodes_used metric but it shouldn't trigger the KubePersistentVolumeInodesFillingUp alert.
  # See https://github.com/kubernetes/kubernetes/commit/b997e0e4d6ccbead435a47d6ac75b0db3d17252f for details.
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '0x61'
    alert_rule_test:
      - eval_time: 61m
        alertname: KubePersistentVolumeInodesFillingUp

  # Don't alert when PVC access_mode is ReadOnlyMany
  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1x61'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadOnlyMany", service="kube-state-metrics"}'
        values: '1x61'
    alert_rule_test:
      - eval_time: 61m
        alertname: KubePersistentVolumeInodesFillingUp

  - interval: 1m
    input_series:
      - series: 'kubelet_volume_stats_inodes_free{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024-10x61'
      - series: 'kubelet_volume_stats_inodes{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '32768+0x61'
      - series: 'kubelet_volume_stats_inodes_used{job="kubelet",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc"}'
        values: '1024+10x61'
      - series: 'kube_persistentvolumeclaim_access_mode{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc", access_mode="ReadWriteOnce", service="kube-state-metrics"}'
        values: '1x61'
      - series: 'kube_persistentvolumeclaim_labels{job="ksm",cluster="kubernetes",namespace="monitoring",persistentvolumeclaim="somepvc",label_excluded_from_alerts="true"}'
        values: '1x61'
    alert_rule_test:
      - eval_time: 61m
        alertname: KubePersistentVolumeInodesFillingUp

  - interval: 1m
    input_series:
      - series: 'kube_node_status_capacity{resource="pods",instance="172.17.0.5:8443",cluster="kubernetes",node="minikube",job="kube-state-metrics",namespace="kube-system"}'
        values: '3+0x15'
      - series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-1",service="kube-state-metrics"}'
        values: '1+0x15'
      - series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-1",service="kube-state-metrics"}'
        values: '1+0x15'
      - series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-2",service="kube-state-metrics"}'
        values: '1+0x15'
      - series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-2",service="kube-state-metrics"}'
        values: '1+0x15'
      - series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-3",service="kube-state-metrics"}'
        values: '1+0x15'
      - series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-3",service="kube-state-metrics"}'
        values: '1+0x15'
    alert_rule_test:
      - eval_time: 10m
        alertname: KubeletTooManyPods
      - eval_time: 15m
        alertname: KubeletTooManyPods
        exp_alerts:
          - exp_labels:
              cluster: kubernetes
              node: minikube
              severity: info
            exp_annotations:
              summary: "Kubelet is running at capacity."
              description: "Kubelet 'minikube' is running at 100% of its Pod capacity."
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods

  - interval: 1m
    input_series:
      - series: 'kube_pod_container_resource_requests{resource="cpu",container="kube-apiserver-67",endpoint="https-main",instance="ksm-1",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",node="node-1",pod="pod-1",service="ksm"}'
        values: '0.15+0x10'
      - series: 'kube_pod_container_resource_requests{resource="cpu",container="kube-apiserver-67",endpoint="https-main",instance="ksm-1",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",node="node-1",pod="pod-2",service="ksm"}'
        values: '0.15+0x10'
      - series: 'kube_pod_container_resource_requests{resource="cpu",container="kube-apiserver-67",endpoint="https-main",instance="ksm-2",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",node="node-1",pod="pod-1",service="ksm"}'
        values: '0.1+0x10'
      - series: 'kube_pod_container_resource_requests{resource="memory",container="kube-apiserver-67",endpoint="https-main",instance="ksm-1",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",node="node-1",pod="pod-1",service="ksm"}'
        values: '1E9+0x10'
      - series: 'kube_pod_container_resource_requests{resource="memory",container="kube-apiserver-67",endpoint="https-main",instance="ksm-1",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",node="node-1",pod="pod-2",service="ksm"}'
        values: '1E9+0x10'
      - series: 'kube_pod_container_resource_requests{resource="memory",container="kube-apiserver-67",endpoint="https-main",instance="ksm-2",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",node="node-1",pod="pod-1",service="ksm"}'
        values: '0.5E9+0x10'
      # Duplicate kube_pod_status_phase timeseries for the same pod.
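      # ksm-1 exposes pod-1 as both Running (goes stale) and Pending, while
      # ksm-2 still reports it Running; the recording rules should count
      # pod-1's requests exactly once (taking the max across instances) and
      # skip pod-2, whose phase is Completed.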
      - series: 'kube_pod_status_phase{endpoint="https-main",instance="ksm-1",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",phase="Running",pod="pod-1",service="ksm"}'
        values: '1 stale'
      - series: 'kube_pod_status_phase{endpoint="https-main",instance="ksm-1",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",phase="Pending",pod="pod-1",service="ksm"}'
        values: '1+0x10'
      - series: 'kube_pod_status_phase{endpoint="https-main",instance="ksm-1",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",phase="Completed",pod="pod-2",service="ksm"}'
        values: '1+0x10'
      - series: 'kube_pod_status_phase{endpoint="https-main",instance="ksm-2",job="kube-state-metrics",cluster="kubernetes",namespace="kube-apiserver",phase="Running",pod="pod-1",service="ksm"}'
        values: '1+0x10'
    promql_expr_test:
      - eval_time: 0m
        expr: namespace_cpu:kube_pod_container_resource_requests:sum
        exp_samples:
          - value: 0.15
            labels: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
      - eval_time: 0m
        expr: namespace_memory:kube_pod_container_resource_requests:sum
        exp_samples:
          - value: 1.0e+9
            labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
      - eval_time: 1m
        expr: namespace_cpu:kube_pod_container_resource_requests:sum
        exp_samples:
          - value: 0.15
            labels: 'namespace_cpu:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'
      - eval_time: 1m
        expr: namespace_memory:kube_pod_container_resource_requests:sum
        exp_samples:
          - value: 1.0e+9
            labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}'

  - interval: 1m
    input_series:
      # Create a histogram where all of the last 10 samples are in the +Inf (> 10 seconds) bucket.
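      # With the 0.99 quantile falling into the +Inf bucket, histogram_quantile
      # returns the upper bound of the highest finite bucket (10 seconds), which
      # is what the KubeletPlegDurationHigh expectation below relies on.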
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.005", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.01", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.025", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.05", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.1", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.25", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.5", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="1", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="2.5", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="5", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="10", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '1+0x10'
      - series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="+Inf", cluster="kubernetes",instance="10.0.2.15:10250"}'
        values: '30+1x10'
      - series: 'kubelet_node_name{endpoint="https-metrics",cluster="kubernetes",instance="10.0.2.15:10250",job="kubelet",namespace="kube-system",node="minikube",service="kubelet"}'
        values: '1 1 1 1 1 1 1 1 1 1'
    alert_rule_test:
      - eval_time: 10m
        alertname: KubeletPlegDurationHigh
        exp_alerts:
          - exp_labels:
              cluster: "kubernetes"
              instance: 10.0.2.15:10250
              node: minikube
              quantile: 0.99
              severity: warning
            exp_annotations:
              summary: "Kubelet Pod Lifecycle Event Generator is taking too long to relist."
              description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of 10 seconds on node minikube.'
              runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh'

  - interval: 1m
    input_series:
      - series: 'kube_node_status_condition{condition="Ready",endpoint="https-main",cluster="kubernetes",instance="10.0.2.15:10250",job="kube-state-metrics",namespace="monitoring",node="minikube",pod="kube-state-metrics-b894d84cc-d6htw",service="kube-state-metrics",status="true"}'
        values: '1 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1'
    alert_rule_test:
      - eval_time: 18m
        alertname: KubeNodeReadinessFlapping
        exp_alerts:
          - exp_labels:
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Node readiness status is flapping."
              description: 'The readiness status of node minikube has changed 10 times in the last 15 minutes.'
              runbook_url: 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping'

  # Verify that node:node_num_cpu:sum triggers no many-to-many errors.
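  # kube_pod_info carries a duplicate entry for alertmanager-0 (stale on
  # node-1, still live on node-2); the info-style join in the recording rule
  # has to tolerate this rather than fail with a many-to-many matching error.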
  - interval: 1m
    input_series:
      - series: 'node_cpu_seconds_total{cpu="0",endpoint="https",instance="instance1",job="node",mode="idle",cluster="kubernetes",namespace="openshift-monitoring",pod="node-exporter-1",service="node-exporter"}'
        values: '1 1'
      - series: 'node_cpu_seconds_total{cpu="1",endpoint="https",instance="instance1",job="node",mode="idle",cluster="kubernetes",namespace="openshift-monitoring",pod="node-exporter-1",service="node-exporter"}'
        values: '1 1'
      - series: 'kube_pod_info{cluster="kubernetes",namespace="openshift-monitoring",node="node-1",pod="node-exporter-1",job="kube-state-metrics",instance="10.129.2.7:8443"}'
        values: '1 1'
      - series: 'kube_pod_info{cluster="kubernetes",namespace="openshift-monitoring",node="node-1",pod="alertmanager-0",job="kube-state-metrics",instance="10.129.2.7:8443"}'
        values: '1 stale'
      - series: 'kube_pod_info{cluster="kubernetes",namespace="openshift-monitoring",node="node-2",pod="alertmanager-0",job="kube-state-metrics",instance="10.129.2.7:8443"}'
        values: '1 1'
    promql_expr_test:
      - eval_time: 0m
        expr: node:node_num_cpu:sum
        exp_samples:
          - value: 2
            labels: 'node:node_num_cpu:sum{cluster="kubernetes",node="node-1"}'
      - eval_time: 1m
        expr: node:node_num_cpu:sum
        exp_samples:
          - value: 2
            labels: 'node:node_num_cpu:sum{cluster="kubernetes",node="node-1"}'

  # Verify that node:node_num_cpu:sum doesn't trigger many-to-many errors when
  # node_namespace_pod:kube_pod_info: has duplicate entries for the same
  # (namespace,pod) tuple. This can happen when Prometheus is restarted because
  # it didn't add stale markers to the "old" series on shutdown.
  - interval: 1m
    input_series:
      - series: 'node_cpu_seconds_total{cpu="0",endpoint="https",instance="instance1",job="node",mode="idle",cluster="kubernetes",namespace="openshift-monitoring",pod="node-exporter-1",service="node-exporter"}'
        values: '1 1'
      - series: 'node_cpu_seconds_total{cpu="0",endpoint="https",instance="instance2",job="node",mode="idle",cluster="kubernetes",namespace="openshift-monitoring",pod="node-exporter-2",service="node-exporter"}'
        values: '1 1'
      - series: 'node_namespace_pod:kube_pod_info:{cluster="kubernetes",node="node-1",namespace="openshift-monitoring",pod="node-exporter-1"}'
        values: '1 1'
      - series: 'node_namespace_pod:kube_pod_info:{cluster="kubernetes",node="node-2",namespace="openshift-monitoring",pod="node-exporter-2"}'
        values: '1 1'
      # series for the "old" prometheus instance.
      - series: 'node_namespace_pod:kube_pod_info:{cluster="kubernetes",node="node-1",namespace="openshift-monitoring",pod="prometheus-0"}'
        values: '1'
      # series for the "new" prometheus instance.
      - series: 'node_namespace_pod:kube_pod_info:{cluster="kubernetes",node="node-2",namespace="openshift-monitoring",pod="prometheus-0"}'
        values: 'stale 1'
    promql_expr_test:
      - eval_time: 0m
        expr: node:node_num_cpu:sum
        exp_samples:
          - value: 1
            labels: 'node:node_num_cpu:sum{cluster="kubernetes",node="node-1"}'
          - value: 1
            labels: 'node:node_num_cpu:sum{cluster="kubernetes",node="node-2"}'
      - eval_time: 1m
        expr: node:node_num_cpu:sum
        exp_samples:
          - value: 1
            labels: 'node:node_num_cpu:sum{cluster="kubernetes",node="node-1"}'
          - value: 1
            labels: 'node:node_num_cpu:sum{cluster="kubernetes",node="node-2"}'

  - interval: 1m
    input_series:
      - series: 'kube_pod_owner{endpoint="https",instance="instance1",job="kube-state-metrics",cluster="kubernetes",namespace="ns1",owner_is_controller="true",owner_kind="ReplicaSet",owner_name="ds-7cc77d965f",pod="ds-7cc77d965f-cgsdv",service="ksm"}'
        values: '1 1'
      - series: 'kube_pod_owner{endpoint="https",instance="instance2",job="kube-state-metrics",cluster="kubernetes",namespace="ns1",owner_is_controller="true",owner_kind="ReplicaSet",owner_name="ds-7cc77d965f",pod="ds-7cc77d965f-cgsdv",service="ksm"}'
        values: '1 stale'
      - series: 'kube_replicaset_owner{endpoint="https",instance="instance1",job="kube-state-metrics",cluster="kubernetes",namespace="ns1",owner_is_controller="true",owner_kind="Deployment",owner_name="ds",pod="ds-777f6bf798-kq7tj",replicaset="ds-7cc77d965f",service="ksm"}'
        values: '1 1'
      - series: 'kube_replicaset_owner{endpoint="https",instance="instance2",job="kube-state-metrics",cluster="kubernetes",namespace="ns1",owner_is_controller="true",owner_kind="Deployment",owner_name="ds",pod="ds-777f6bf798-kq7tj",replicaset="ds-7cc77d965f",service="ksm"}'
        values: '1 stale'
    promql_expr_test:
      - eval_time: 0m
        expr: namespace_workload_pod:kube_pod_owner:relabel
        exp_samples:
          - value: 1
            labels: 'namespace_workload_pod:kube_pod_owner:relabel{cluster="kubernetes",namespace="ns1", pod="ds-7cc77d965f-cgsdv", workload="ds", workload_type="deployment"}'
      - eval_time: 1m
        expr: namespace_workload_pod:kube_pod_owner:relabel
        exp_samples:
          - value: 1
            labels: 'namespace_workload_pod:kube_pod_owner:relabel{cluster="kubernetes",namespace="ns1", pod="ds-7cc77d965f-cgsdv", workload="ds", workload_type="deployment"}'

  - interval: 1m
    input_series:
      - series: 'kube_pod_status_phase{endpoint="https",instance="instance1",job="kube-state-metrics",cluster="kubernetes",namespace="ns1",phase="Pending",pod="pod-ds-7cc77d965f-cgsdv",service="ksm"}'
        values: '1+0x20'
      - series: 'kube_pod_owner{endpoint="https",instance="instance1",job="kube-state-metrics",cluster="kubernetes",namespace="ns1",owner_is_controller="false",owner_kind="<None>",owner_name="ds-7cc77d965f",pod="pod-ds-7cc77d965f-cgsdv",service="ksm"}'
        values: '1+0x20'
      - series: 'kube_pod_owner{endpoint="https",instance="instance1",job="kube-state-metrics",cluster="kubernetes",namespace="ns1",owner_is_controller="true",owner_kind="ReplicaSet",owner_name="ds-7cc77d965f",pod="pod-ds-7cc77d965f-cgsdv",service="ksm"}'
        values: '1+0x20'
    alert_rule_test:
      - eval_time: 15m
        alertname: KubePodNotReady
        exp_alerts:
          - exp_labels:
              cluster: kubernetes
              namespace: ns1
              pod: pod-ds-7cc77d965f-cgsdv
              severity: warning
            exp_annotations:
              summary: "Pod has been in a non-ready state for more than 15 minutes."
              description: "Pod ns1/pod-ds-7cc77d965f-cgsdv has been in a non-ready state for longer than 15 minutes."
              runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready"

  - interval: 1m
    input_series:
      - series: 'container_cpu_usage_seconds_total{container="alertmanager",cpu="total",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-0",service="kubelet"}'
        values: '0+3x5'
      - series: 'container_cpu_usage_seconds_total{container="alertmanager",cpu="total",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",service="kubelet"}'
        values: '0+3x5'
      # Duplicate timeseries from different instances.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance2"}'
        values: '1+0x5'
      # Missing node label.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
    promql_expr_test:
      - eval_time: 5m
        expr: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
        exp_samples:
          - value: 5.0e-2
            labels: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster="kubernetes",namespace="monitoring", pod="alertmanager-main-0", container="alertmanager",node="node1"}'

  - interval: 1m
    input_series:
      - series: 'container_memory_working_set_bytes{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-0",service="kubelet"}'
        values: '1000+0x5'
      - series: 'container_memory_working_set_bytes{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",service="kubelet"}'
        values: '1000+0x5'
      # Duplicate timeseries from different instances.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance2"}'
        values: '1+0x5'
      # Missing node label.
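      # alertmanager-main-1 lacks a node label, so only alertmanager-main-0
      # should surface in the recording rule's output below (the same pattern
      # applies to the cpu, rss, cache, and swap tests around this one).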
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
    promql_expr_test:
      - eval_time: 5m
        expr: node_namespace_pod_container:container_memory_working_set_bytes
        exp_samples:
          - value: 1.0e+3
            labels: 'node_namespace_pod_container:container_memory_working_set_bytes{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",service="kubelet"}'

  - interval: 1m
    input_series:
      - series: 'container_memory_rss{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-0",service="kubelet"}'
        values: '1000+0x5'
      - series: 'container_memory_rss{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",service="kubelet"}'
        values: '1000+0x5'
      # Duplicate timeseries from different instances.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance2"}'
        values: '1+0x5'
      # Missing node label.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
    promql_expr_test:
      - eval_time: 5m
        expr: node_namespace_pod_container:container_memory_rss
        exp_samples:
          - value: 1.0e+3
            labels: 'node_namespace_pod_container:container_memory_rss{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",service="kubelet"}'

  - interval: 1m
    input_series:
      - series: 'container_memory_cache{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-0",service="kubelet"}'
        values: '1000+0x5'
      - series: 'container_memory_cache{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",service="kubelet"}'
        values: '1000+0x5'
      # Duplicate timeseries from different instances.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance2"}'
        values: '1+0x5'
      # Missing node label.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
    promql_expr_test:
      - eval_time: 5m
        expr: node_namespace_pod_container:container_memory_cache
        exp_samples:
          - value: 1.0e+3
            labels: 'node_namespace_pod_container:container_memory_cache{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",service="kubelet"}'

  - interval: 1m
    input_series:
      - series: 'container_memory_swap{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-0",service="kubelet"}'
        values: '1000+0x5'
      - series: 'container_memory_swap{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",service="kubelet"}'
        values: '1000+0x5'
      # Duplicate timeseries from different instances.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",job="kube-state-metrics",instance="instance2"}'
        values: '1+0x5'
      # Missing node label.
      - series: 'kube_pod_info{cluster="kubernetes",namespace="monitoring",pod="alertmanager-main-1",job="kube-state-metrics",instance="instance1"}'
        values: '1+0x5'
    promql_expr_test:
      - eval_time: 5m
        expr: node_namespace_pod_container:container_memory_swap
        exp_samples:
          - value: 1.0e+3
            labels: 'node_namespace_pod_container:container_memory_swap{container="alertmanager",endpoint="https",id="/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod3426a9c5_53d6_4736_9ca8_f575828e3e4b.slice/crio-f0d7fb2c909605aad16946ff065a42b25cdcdb812459e712ecdd6bce8a3ed6cb.scope",image="quay.io/prometheus/alertmanager:latest",instance="instance1",job="cadvisor",name="name1",cluster="kubernetes",namespace="monitoring",node="node1",pod="alertmanager-main-0",service="kubelet"}'

  - interval: 1m
    # Current unequal desired and not progressing.
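    # The rollout reaches 3 of 4 updated pods and then stalls; after 15 minutes
    # without progress the alert should fire (33m) and clear again once the
    # remaining pod is updated.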
    input_series:
      - series: 'kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 4 3 4 4 4 3 4 4 4 3 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
      - series: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
      - series: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
      - series: 'kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
      - series: 'kube_daemonset_status_number_available{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 3 3 3 4 3 3 3 4 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4'
    alert_rule_test:
      - eval_time: 32m
        alertname: KubeDaemonSetRolloutStuck
      - eval_time: 33m
        alertname: KubeDaemonSetRolloutStuck
        exp_alerts:
          - exp_labels:
              job: kube-state-metrics
              namespace: monitoring
              cluster: kubernetes
              daemonset: node-exporter
              severity: warning
            exp_annotations:
              summary: "DaemonSet rollout is stuck."
              description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      - eval_time: 34m
        alertname: KubeDaemonSetRolloutStuck

  # KubeDeploymentRolloutStuck
  - interval: 1m
    input_series:
      - series: 'kube_deployment_status_condition{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",deployment="stuck", condition="Progressing", status="false"}'
        values: '1+0x17 0+0x5'
    alert_rule_test:
      - eval_time: 14m
        alertname: KubeDeploymentRolloutStuck
      - eval_time: 16m
        alertname: KubeDeploymentRolloutStuck
        exp_alerts:
          - exp_labels:
              job: kube-state-metrics
              namespace: monitoring
              cluster: kubernetes
              deployment: stuck
              severity: warning
              condition: Progressing
              status: "false"
            exp_annotations:
              summary: 'Deployment rollout is not progressing.'
              description: 'Rollout of deployment monitoring/stuck is not progressing for longer than 15 minutes.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentrolloutstuck
      - eval_time: 18m
        alertname: KubeDeploymentRolloutStuck

  - interval: 1m
    # Misscheduled is non-zero.
    input_series:
      - series: 'kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 4 3 4 4 4 3 4 4 4 3 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
      - series: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
      - series: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0'
      - series: 'kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
      - series: 'kube_daemonset_status_number_available{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 3 3 3 4 3 3 3 4 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4'
    alert_rule_test:
      - eval_time: 32m
        alertname: KubeDaemonSetRolloutStuck
      - eval_time: 33m
        alertname: KubeDaemonSetRolloutStuck
        exp_alerts:
          - exp_labels:
              job: kube-state-metrics
              namespace: monitoring
              cluster: kubernetes
              daemonset: node-exporter
              severity: warning
            exp_annotations:
              summary: "DaemonSet rollout is stuck."
              description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      - eval_time: 34m
        alertname: KubeDaemonSetRolloutStuck

  - interval: 1m
    # Updated number unequal desired.
    input_series:
      - series: 'kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 4 3 4 4 4 3 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
      - series: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
      - series: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
      - series: 'kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4'
      - series: 'kube_daemonset_status_number_available{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 3 3 3 4 3 3 3 4 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4'
    alert_rule_test:
      - eval_time: 32m
        alertname: KubeDaemonSetRolloutStuck
      - eval_time: 33m
        alertname: KubeDaemonSetRolloutStuck
        exp_alerts:
          - exp_labels:
              job: kube-state-metrics
              namespace: monitoring
              cluster: kubernetes
              daemonset: node-exporter
              severity: warning
            exp_annotations:
              summary: "DaemonSet rollout is stuck."
              description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      - eval_time: 34m
        alertname: KubeDaemonSetRolloutStuck

  - interval: 1m
    # Number available unequal desired.
    input_series:
      - series: 'kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 4 3 4 4 4 3 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
      - series: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
      - series: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
      - series: 'kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 0 0 0 1 1 1 1 2 2 2 2 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4'
      - series: 'kube_daemonset_status_number_available{job="kube-state-metrics",cluster="kubernetes",namespace="monitoring",daemonset="node-exporter"}'
        values: '4 4 4 3 3 3 4 3 3 3 4 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4'
    alert_rule_test:
      - eval_time: 34m
        alertname: KubeDaemonSetRolloutStuck
      - eval_time: 35m
        alertname: KubeDaemonSetRolloutStuck
        exp_alerts:
          - exp_labels:
              job: kube-state-metrics
              namespace: monitoring
              cluster: kubernetes
              daemonset: node-exporter
              severity: warning
            exp_annotations:
              summary: "DaemonSet rollout is stuck."
              description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      - eval_time: 36m
        alertname: KubeDaemonSetRolloutStuck

  - interval: 1m
    input_series:
      - series: 'kubelet_certificate_manager_client_ttl_seconds{job="kubelet",cluster="kubernetes",namespace="monitoring",node="minikube"}'
        values: '86400-60x1'
    alert_rule_test:
      - eval_time: 0m
        alertname: KubeletClientCertificateExpiration
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Kubelet client certificate is about to expire."
              description: 'Client certificate for Kubelet on node minikube expires in 1d 0h 0m 0s.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
      - eval_time: 1m
        alertname: KubeletClientCertificateExpiration
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Kubelet client certificate is about to expire."
              description: 'Client certificate for Kubelet on node minikube expires in 23h 59m 0s.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: critical
            exp_annotations:
              summary: "Kubelet client certificate is about to expire."
              description: 'Client certificate for Kubelet on node minikube expires in 23h 59m 0s.'
  - interval: 1m
    input_series:
      - series: 'kubelet_certificate_manager_client_ttl_seconds{job="kubelet",cluster="kubernetes",namespace="monitoring",node="minikube"}'
        values: '86400-60x1'
    alert_rule_test:
      - eval_time: 0m
        alertname: KubeletClientCertificateExpiration
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Kubelet client certificate is about to expire."
              description: 'Client certificate for Kubelet on node minikube expires in 1d 0h 0m 0s.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
      - eval_time: 1m
        alertname: KubeletClientCertificateExpiration
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Kubelet client certificate is about to expire."
              description: 'Client certificate for Kubelet on node minikube expires in 23h 59m 0s.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: critical
            exp_annotations:
              summary: "Kubelet client certificate is about to expire."
              description: 'Client certificate for Kubelet on node minikube expires in 23h 59m 0s.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
  - interval: 1m
    input_series:
      - series: 'kubelet_certificate_manager_server_ttl_seconds{job="kubelet",cluster="kubernetes",namespace="monitoring",node="minikube"}'
        values: '86400-60x1'
    alert_rule_test:
      - eval_time: 0m
        alertname: KubeletServerCertificateExpiration
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Kubelet server certificate is about to expire."
              description: 'Server certificate for Kubelet on node minikube expires in 1d 0h 0m 0s.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
      - eval_time: 1m
        alertname: KubeletServerCertificateExpiration
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Kubelet server certificate is about to expire."
              description: 'Server certificate for Kubelet on node minikube expires in 23h 59m 0s.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: critical
            exp_annotations:
              summary: "Kubelet server certificate is about to expire."
              description: 'Server certificate for Kubelet on node minikube expires in 23h 59m 0s.'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
  - interval: 1m
    input_series:
      - series: 'kubelet_certificate_manager_client_expiration_renew_errors{job="kubelet",cluster="kubernetes",namespace="monitoring",node="minikube"}'
        values: '0+1x20'
    alert_rule_test:
      - eval_time: 16m
        alertname: KubeletClientCertificateRenewalErrors
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Kubelet has failed to renew its client certificate."
              description: 'Kubelet on node minikube has failed to renew its client certificate (5 errors in the last 5 minutes).'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors
  - interval: 1m
    input_series:
      - series: 'kubelet_server_expiration_renew_errors{job="kubelet",cluster="kubernetes",namespace="monitoring",node="minikube"}'
        values: '0+1x20'
    alert_rule_test:
      - eval_time: 16m
        alertname: KubeletServerCertificateRenewalErrors
        exp_alerts:
          - exp_labels:
              job: kubelet
              namespace: monitoring
              cluster: kubernetes
              node: minikube
              severity: warning
            exp_annotations:
              summary: "Kubelet has failed to renew its server certificate."
              description: 'Kubelet on node minikube has failed to renew its server certificate (5 errors in the last 5 minutes).'
              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
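  # A note on the series notation used above: promtool expands 'a+bxN' / 'a-bxN'
  # arithmetically, so '86400-60x1' is the two samples 86400 86340 (a TTL of 1d,
  # then 23h 59m) and '0+1x20' is a counter climbing by one per 1m interval.
  # The renewal-error alerts presumably evaluate something along the lines of
  # this sketch (not the exact rule; see prometheus_alerts.yaml):
  #
  #   increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
  #
  # At eval_time 16m the counter has risen by 5 over the window, hence the
  # "(5 errors in the last 5 minutes)" wording in the descriptions.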
  - interval: 1m
    input_series:
      - series: 'kube_job_failed{instance="instance1",condition="true",job="kube-state-metrics",job_name="job-1597623120",cluster="kubernetes",namespace="ns1"}'
        values: '1+0x20'
    alert_rule_test:
      - eval_time: 15m
        alertname: KubeJobFailed
        exp_alerts:
          - exp_labels:
              cluster: "kubernetes"
              namespace: ns1
              job_name: job-1597623120
              severity: warning
              condition: true
              instance: instance1
              job: kube-state-metrics
            exp_annotations:
              summary: "Job failed to complete."
              description: "Job ns1/job-1597623120 failed to complete. Removing failed job after investigation should clear this alert."
              runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed"
  - interval: 1m
    input_series:
      - series: 'kube_job_status_start_time{cluster="kubernetes",namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
        values: '0+0x200 _x500 0+0x40'
      - series: 'kube_job_status_active{cluster="kubernetes",namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
        values: '1x200 _x500 1x40'
    alert_rule_test:
      - eval_time: 6h
        alertname: KubeJobNotCompleted
      - eval_time: 12h1m
        alertname: KubeJobNotCompleted
        exp_alerts:
          - exp_labels:
              cluster: "kubernetes"
              namespace: ns1
              job_name: job1
              severity: warning
            exp_annotations:
              summary: "Job did not complete in time"
              description: "Job ns1/job1 is taking more than 12h 0m 0s to complete."
              runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobnotcompleted"
  - interval: 1m
    input_series:
      - series: 'kube_job_status_start_time{cluster="kubernetes",namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
        values: '0+0x740'
      - series: 'kube_job_status_active{cluster="kubernetes",namespace="ns1", job="kube-state-metrics", instance="instance1", job_name="job1"}'
        values: '1+0x710 0x30'
    alert_rule_test:
      - eval_time: 6h
        alertname: KubeJobNotCompleted
      - eval_time: 12h
        alertname: KubeJobNotCompleted
  - interval: 1m
    input_series:
      - series: 'apiserver_request_terminations_total{job="kube-apiserver",apiserver="kube-apiserver"}'
        values: '1+1x10'
      - series: 'apiserver_request_total{job="kube-apiserver",apiserver="kube-apiserver"}'
        values: '1+2x10'
    alert_rule_test:
      - eval_time: 5m # alert hasn't fired
        alertname: KubeAPITerminatedRequests
      - eval_time: 10m # alert fired
        alertname: KubeAPITerminatedRequests
        exp_alerts:
          - exp_labels:
              severity: warning
            exp_annotations:
              summary: "The kubernetes apiserver has terminated 33.33% of its incoming requests."
              description: "The kubernetes apiserver has terminated 33.33% of its incoming requests."
              runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapiterminatedrequests"
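  # Working the KubeAPITerminatedRequests numbers: '1+1x10' grows by 1/min and
  # '1+2x10' by 2/min, so terminations make up 1 / (1 + 2) = 33.33% of all
  # requests. A sketch of the expected shape of the expression (the threshold
  # and window are assumptions; the real rule is in prometheus_alerts.yaml):
  #
  #   sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m]))
  #     /
  #   ( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m]))
  #     + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) )
  #   > 0.20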
  - interval: 1m
    input_series:
      - series: 'kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",cluster="kubernetes",namespace="test",pod="static-web",container="script",job="kube-state-metrics"}'
        values: '1 1 stale _x3 1 1 stale _x2 1+0x4 stale'
    alert_rule_test:
      - eval_time: 10m # alert hasn't fired
        alertname: KubePodCrashLooping
      - eval_time: 16m # alert fired
        alertname: KubePodCrashLooping
        exp_alerts:
          - exp_labels:
              severity: "warning"
              container: "script"
              job: "kube-state-metrics"
              cluster: "kubernetes"
              namespace: "test"
              pod: "static-web"
              reason: "CrashLoopBackOff"
            exp_annotations:
              description: 'Pod test/static-web (script) is in waiting state (reason: "CrashLoopBackOff").'
              runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping"
              summary: "Pod is crash looping."
      - eval_time: 20m
        alertname: KubePodCrashLooping
        # The alert keeps firing for five minutes after resolution because the
        # expression looks back over the last 5 minutes of data, and range
        # vectors do not take stale samples into account.
        exp_alerts:
          - exp_labels:
              severity: "warning"
              container: "script"
              job: "kube-state-metrics"
              cluster: "kubernetes"
              namespace: "test"
              pod: "static-web"
              reason: "CrashLoopBackOff"
            exp_annotations:
              description: 'Pod test/static-web (script) is in waiting state (reason: "CrashLoopBackOff").'
              runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping"
              summary: "Pod is crash looping."
      - eval_time: 21m # alert recovers
        alertname: KubePodCrashLooping
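  # The stale-sample behaviour above matches an expression along these lines
  # (a sketch; the real rule is in prometheus_alerts.yaml):
  #
  #   max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1
  #
  # max_over_time keeps returning the last sample seen before the staleness
  # marker until it ages out of the 5m window, which is why the alert lingers
  # until the 21m evaluation.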
  # When a ResourceQuota defines both cpu and requests.cpu, the minimum of the
  # two is used in the quota calculation.
  - interval: 1m
    input_series:
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test", resource="cpu", type="hard", job="kube-state-metrics"}'
        values: '1000x10'
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test", resource="requests.cpu", type="hard", job="kube-state-metrics"}'
        values: '100x10'
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test1", resource="requests.cpu", type="hard", job="kube-state-metrics"}'
        values: '50x10'
      - series: 'kube_node_status_allocatable{cluster="kubernetes",namespace="monitoring",node="n1", resource="cpu", job="kube-state-metrics"}'
        values: '100x10'
      - series: 'kube_node_status_allocatable{cluster="kubernetes",namespace="monitoring",node="n2", resource="cpu", job="kube-state-metrics"}'
        values: '100x10'
    alert_rule_test:
      - eval_time: 4m
        alertname: KubeCPUQuotaOvercommit
      - eval_time: 5m # alert shouldn't fire
        alertname: KubeCPUQuotaOvercommit
  - interval: 1m
    input_series:
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test", resource="cpu", type="hard", job="kube-state-metrics"}'
        values: '1000x10'
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test", resource="requests.cpu", type="hard", job="kube-state-metrics"}'
        values: '200x10'
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test1", resource="requests.cpu", type="hard", job="kube-state-metrics"}'
        values: '200x10'
      - series: 'kube_node_status_allocatable{cluster="kubernetes",namespace="monitoring",node="n1", resource="cpu", job="kube-state-metrics"}'
        values: '100x10'
      - series: 'kube_node_status_allocatable{cluster="kubernetes",namespace="monitoring",node="n2", resource="cpu", job="kube-state-metrics"}'
        values: '100x10'
    alert_rule_test:
      - eval_time: 4m
        alertname: KubeCPUQuotaOvercommit
      - eval_time: 5m # alert fires
        alertname: KubeCPUQuotaOvercommit
        exp_alerts:
          - exp_labels:
              severity: "warning"
              cluster: "kubernetes"
            exp_annotations:
              description: 'Cluster kubernetes has overcommitted CPU resource requests for Namespaces.'
              runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit"
              summary: "Cluster has overcommitted CPU resource requests."
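  # Working the numbers: taking min(cpu, requests.cpu) per namespace, the first
  # case sums to min(1000, 100) + 50 = 150 quota against 200 allocatable CPU
  # (0.75x), so no alert; the second sums to min(1000, 200) + 200 = 400 against
  # 200 (2x), which fires. A sketch of the shape of the expression (the
  # threshold is an assumption; the real rule is in prometheus_alerts.yaml):
  #
  #   sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"}))
  #     /
  #   sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
  #   > 1.5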
  # When a ResourceQuota defines both memory and requests.memory, the minimum
  # of the two is used in the quota calculation.
  - interval: 1m
    input_series:
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test", resource="memory", type="hard", job="kube-state-metrics"}'
        values: '1000x10'
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test", resource="requests.memory", type="hard", job="kube-state-metrics"}'
        values: '100x10'
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test1", resource="requests.memory", type="hard", job="kube-state-metrics"}'
        values: '50x10'
      - series: 'kube_node_status_allocatable{cluster="kubernetes",namespace="monitoring",node="n1", resource="memory", job="kube-state-metrics"}'
        values: '100x10'
      - series: 'kube_node_status_allocatable{cluster="kubernetes",namespace="monitoring",node="n2", resource="memory", job="kube-state-metrics"}'
        values: '100x10'
    alert_rule_test:
      - eval_time: 4m
        alertname: KubeMemoryQuotaOvercommit
      - eval_time: 5m # alert shouldn't fire
        alertname: KubeMemoryQuotaOvercommit
  - interval: 1m
    input_series:
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test", resource="memory", type="hard", job="kube-state-metrics"}'
        values: '1000x10'
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test", resource="requests.memory", type="hard", job="kube-state-metrics"}'
        values: '500x10'
      - series: 'kube_resourcequota{cluster="kubernetes",namespace="test1", resource="requests.memory", type="hard", job="kube-state-metrics"}'
        values: '500x10'
      - series: 'kube_node_status_allocatable{cluster="kubernetes",namespace="monitoring",node="n1", resource="memory", job="kube-state-metrics"}'
        values: '10x10'
      - series: 'kube_node_status_allocatable{cluster="kubernetes",namespace="monitoring",node="n2", resource="memory", job="kube-state-metrics"}'
        values: '10x10'
    alert_rule_test:
      - eval_time: 4m
        alertname: KubeMemoryQuotaOvercommit
      - eval_time: 5m # alert fires
        alertname: KubeMemoryQuotaOvercommit
        exp_alerts:
          - exp_labels:
              severity: "warning"
              cluster: "kubernetes"
            exp_annotations:
              description: 'Cluster kubernetes has overcommitted memory resource requests for Namespaces.'
              runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit"
              summary: "Cluster has overcommitted memory resource requests."
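  # Working the numbers for the memory variant: the first case sums to
  # min(1000, 100) + 50 = 150 quota against 200 allocatable (0.75x), so no
  # alert; the second sums to min(1000, 500) + 500 = 1000 against only 20
  # (50x), which fires. The expression is presumably the memory analogue of the
  # CPU sketch above, built on kube_resourcequota{resource=~"(memory|requests.memory)"}
  # and kube_node_status_allocatable{resource="memory"}.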