alerts/google-gke/metadata.yaml (149 lines of code) (raw):

# Catalogue of alert policy templates described by this metadata file.
# Each list item carries:
#   id                   - identifier of the template
#   description          - human-readable summary of the alert condition
#   version              - integer version of the template entry
#   related_integrations - optional list; where present in this file it always
#                          names the integration `gke` on platform `GCP`
# NOTE(review): how the consumer interprets these fields is not visible from
# this file; confirm against the schema of the tool that reads it.
alert_policy_templates:
  # --- Container alerts scoped to the whole project or to a single cluster ---
  - id: cpu-limit-utilization-all-containers
    description: "Monitors CPU limit utilization across all containers in the current project and alerts if a container's CPU limit utilization is above 90% on average for 5 minutes."
    version: 1
    related_integrations:
      - id: gke
        platform: GCP
  - id: cpu-limit-utilization-containers-within-cluster
    description: "Monitors CPU limit utilization across all containers within a cluster and alerts if a container's CPU limit utilization is above 90% on average for 5 minutes."
    version: 1
  - id: failedscheduling-log-event-within-cluster
    description: "Monitors log events with cluster and alerts if there is an event with reason 'FailedScheduling'."
    version: 1
  - id: memory-limit-utilization-all-containers
    description: "Monitors memory limit utilization across all containers in the current project and alerts if a container's memory limit utilization is above 90% on average for 1 minute."
    version: 1
    related_integrations:
      - id: gke
        platform: GCP
  - id: memory-limit-utilization-containers-within-cluster
    description: "Monitors memory limit utilization across all containers within a cluster and alerts if a container's memory limit utilization is above 90% on average for 1 minute."
    version: 1
  - id: restarts-all-containers
    description: "Monitors all containers in the current project clusters and alerts if a container restarts more than once. (Restart count for the container is more than 1 within 5 minute window)"
    version: 1
    related_integrations:
      - id: gke
        platform: GCP
  - id: restarts-containers-within-cluster
    description: "Monitors containers within a cluster and alerts if a container restarts more than once. (Restart count for the container is more than 1 within 5 minute window)"
    version: 1

  # --- Workload-level resource utilization (CPU / memory, request / limit) ---
  - id: cpu-request-utilization-entire-workload
    description: "Alerts if the workload's CPU request utilization exceeds 90% for 5 minutes."
    version: 1
  - id: cpu-limit-utilization-entire-workload
    description: "Alerts if the workload's CPU limit utilization exceeds 90% for 5 minutes."
    version: 1
  - id: memory-request-utilization-entire-workload
    description: "Alerts if the workload's Memory request utilization exceeds 90% for 5 minutes."
    version: 1
  - id: memory-limit-utilization-entire-workload
    description: "Alerts if the workload's Memory limit utilization exceeds 90% for 5 minutes."
    version: 1

  # --- Workload-level traffic alerts: error rate and p95 latency pairs ---
  - id: istio-server-errors-entire-workload
    description: "Alerts if the per-operation 5xx error rate exceeds 5% for 5 minutes."
    version: 1
  - id: istio-latency-entire-workload
    description: "Alerts if the per-operation p95 latency exceeds 10 seconds for 5 minutes."
    version: 1
  - id: istio-gmp-server-errors-entire-workload
    description: "Alerts if the 5xx error rate exceeds 5% for 5 minutes."
    version: 1
  - id: istio-gmp-latency-entire-workload
    description: "Alerts if the p95 latency exceeds 10 seconds for 5 minutes."
    version: 1
  - id: gke-ingress-server-errors-entire-workload
    description: "Alerts if the 5xx error rate for a URL path exceeds 5% for 5 minutes."
    version: 1
  - id: gke-ingress-latency-entire-workload
    description: "Alerts if the p95 latency for a URL path exceeds 10 seconds for 5 minutes."
    version: 1
  - id: nginx-ingress-server-errors-entire-workload
    description: "Alerts if the 5xx error rate for a method in a path exceeds 5% for 5 minutes."
    version: 1
  - id: nginx-ingress-latency-entire-workload
    description: "Alerts if the p95 latency for a method in a path exceeds 10 seconds for 5 minutes."
    version: 1
  - id: http-gmp-server-errors-entire-workload
    description: "Alerts if the 5xx error rate for a method exceeds 5% for 5 minutes."
    version: 1
  - id: http-gmp-latency-entire-workload
    description: "Alerts if the p95 latency for a method exceeds 10 seconds for 5 minutes."
    version: 1
  - id: grpc-server-errors-entire-workload
    description: "Alerts if the server error rate for a method exceeds 5% for 5 minutes."
    version: 1
  - id: grpc-latency-entire-workload
    description: "Alerts if the p95 latency for a method exceeds 10 seconds for 5 minutes."
    version: 1

  # --- Per-container alerts within a workload ---
  - id: utilization-container-cpu-within-workload
    description: "Alerts if a container's CPU limit utilization is above 90% on average for 5 minutes."
    version: 1
  - id: utilization-container-memory-within-workload
    description: "Alerts if a container's memory limit utilization is above 90% on average for 1 minute."
    version: 1
  - id: restarts-containers-within-workload
    description: "Alerts if a container restarts within a 5 minute window"
    version: 1

  # --- Jobset alerts (each one declares the gke / GCP related integration) ---
  - id: cpu-limit-utilization-for-jobset
    description: "Alerts if the jobset's CPU limit utilization exceeds 90% for 5 minutes."
    version: 1
    related_integrations:
      - id: gke
        platform: GCP
  - id: memory-limit-utilization-for-jobset
    description: "Alerts if the jobset's memory limit utilization exceeds 90% for 5 minutes."
    version: 1
    related_integrations:
      - id: gke
        platform: GCP
  - id: memory-request-utilization-for-jobset
    description: "Alerts if the jobset's memory request utilization exceeds 90% for 5 minutes."
    version: 1
    related_integrations:
      - id: gke
        platform: GCP
  - id: tpu-tensorcore-utilization-for-jobset
    description: "Alerts if the jobset's TPU tensorcore utilization exceeds 90% for 5 minutes."
    version: 1
    related_integrations:
      - id: gke
        platform: GCP
  - id: jobs-failure-for-jobset
    description: "Alerts if one or more jobs fail within a jobset."
    version: 1
    related_integrations:
      - id: gke
        platform: GCP