prow/oss/terraform/modules/alerts/main.tf:

# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

locals {
  sinker_monitoring_resources = {
    pods : "sinker_pods_removed"
    prowjobs : "sinker_prow_jobs_cleaned"
  }

  // Flatten var.prow_instances into a map.
  // https://www.terraform.io/docs/language/functions/flatten.html#flattening-nested-structures-for-for_each
  project_indexed_components = {
    for elem in flatten([
      for project, components in var.prow_instances : [
        for component, details in components : {
          project   = project,
          component = component,
          namespace = details.namespace
        }
      ]
    ]) : "${elem.project}/${elem.component}" => elem
  }
}

resource "google_monitoring_alert_policy" "sinker-alerts" {
  project      = var.project
  for_each     = local.sinker_monitoring_resources
  display_name = "sinker-not-deleting-${each.key}"
  combiner     = "OR" # required

  conditions {
    display_name = "Sinker not deleting ${each.key}"
    condition_monitoring_query_language {
      duration = "300s"
      query    = <<-EOT
      fetch k8s_container
      | metric 'workload.googleapis.com/${each.value}'
      | group_by [resource.project_id], 1h, [value_sinker_removed_sum: sum(value.${each.value})]
      | every 1h
      | condition val() < 1
      EOT
      trigger {
        count = 1
      }
    }
  }

  documentation {
    content   = "Sinker not deleting any ${each.key} in an hour."
    mime_type = "text/markdown"
  }

  # gcloud beta monitoring channels list --project=oss-prow
  notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"]
}

resource "google_monitoring_alert_policy" "predicted-gh-rate-limit-exhaustion" {
  project      = var.project
  display_name = "predicted-gh-rate-limit-exhaustion"
  combiner     = "OR" # required

  conditions {
    display_name = "predicted-gh-rate-limit-exhaustion"
    condition_monitoring_query_language {
      # This query calculates the expected remaining rate limit tokens at the
      # end of the current rate limit reset window based on the rate of
      # consumption over the last 15 minutes. The alert fires if we predict
      # that we will come within 250 tokens of the limit (which is 5% of the
      # 5000 limit).
      duration = "0s"
      query    = <<-EOT
      {t_0:
        # The remaining tokens in the rate limit reset window
        fetch k8s_container::workload.googleapis.com/github_token_usage
      ; t_1:
        # The expected tokens that will be used over the remaining time in the rate limit reset window based on recent usage.
        fetch k8s_container
        | {
            metric 'workload.googleapis.com/github_token_usage'
            | value 5000 - val()
            | align rate(1m)
            # Align rate over 1m and filter before actual 15m aggregation to drop counter resets in gauge.
            | filter val() > 0
            | align mean_aligner(15m)
          ; metric 'workload.googleapis.com/github_token_reset'
            | value val() / (1000000000)
          }
        | outer_join 0
        | mul
      }
      | outer_join 0
      | sub # Result is the expected remaining tokens at the end of the rate limit reset window.
      | every 1m
      | filter metric.token_hash =~ "${join("|", var.bot_token_hashes)}"
      | condition val() < 250
      | window 1m
      EOT
      trigger {
        count = 1
      }
    }
  }

  documentation {
    content   = "One of the GitHub tokens used with `ghproxy` is predicted to exhaust its rate limit before the end of the rate limit reset window."
    mime_type = "text/markdown"
  }

  # gcloud beta monitoring channels list --project=oss-prow
  notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"]
}

resource "google_monitoring_alert_policy" "pod-crashlooping" {
  for_each     = local.project_indexed_components
  project      = var.project
  display_name = "pod-crashlooping-${each.key}"
  combiner     = "OR" # required

  conditions {
    display_name = "pod-crashlooping-${each.key}"
    condition_monitoring_query_language {
      # Alert if the service crashlooped, which results in restarts with exponential backoff.
      # This threshold is higher than the default crashloop backoff threshold, mainly because
      # Prow components crashloop while the Kubernetes master is upgrading, which normally
      # takes about 5 minutes. Setting this to 12 minutes excludes that case.
      duration = "720s"
      query    = <<-EOT
      fetch k8s_container
      | metric 'kubernetes.io/container/restart_count'
      | filter (resource.project_id == '${each.value.project}' && resource.container_name == '${each.value.component}' && resource.namespace_name == '${each.value.namespace}')
      | align delta(6m)
      | every 6m
      | group_by [], [value_restart_count_aggregate: aggregate(value.restart_count)]
      | condition val() > 0 '1'
      EOT
      trigger {
        count = 1
      }
    }
  }

  documentation {
    content   = "The service `${each.key}` has been restarting for more than 12 minutes, very likely crashlooping."
    mime_type = "text/markdown"
  }

  # gcloud beta monitoring channels list --project=oss-prow
  notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"]
}

resource "google_monitoring_alert_policy" "heartbeat-job-stale" {
  for_each     = { for job in var.heartbeat_jobs : job.job_name => job }
  project      = var.project
  display_name = "heartbeat-job-stale/${each.key}"
  combiner     = "OR" # required

  conditions {
    display_name = "heartbeat-job-stale/${each.key}"
    condition_monitoring_query_language {
      duration = each.value.alert_interval
      query    = <<-EOT
      fetch k8s_container
      | metric 'workload.googleapis.com/prowjob_state_transitions'
      | filter (metric.job_name == '${each.value.job_name}' && metric.state == 'success')
      | sum # Combining values reported by all prow-controller-manager pods
      | align delta_gauge(${each.value.interval})
      | every ${each.value.interval}
      | condition val() == 0
      EOT
      trigger {
        count = 1
      }
    }
  }

  documentation {
    content   = "The heartbeat job `${each.value.job_name}` has not had a successful run in the past ${each.value.alert_interval} (should run every ${each.value.interval})."
mime_type = "text/markdown" } # gcloud beta monitoring channels list --project=oss-prow notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"] } resource "google_monitoring_alert_policy" "probers" { project = var.project display_name = "HostDown" combiner = "OR" conditions { display_name = "Host is unreachable" condition_monitoring_query_language { duration = "120s" query = <<-EOT fetch uptime_url | metric 'monitoring.googleapis.com/uptime_check/check_passed' | align next_older(1m) | filter resource.project_id == '${var.project}' | every 1m | group_by [resource.host], [value_check_passed_not_count_true: count_true(not(value.check_passed))] | condition val() > 1 '1' EOT trigger { count = 1 } } } documentation { content = "Host Down" mime_type = "text/markdown" } # gcloud beta monitoring channels list --project=oss-prow notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"] } resource "google_monitoring_alert_policy" "webhook-missing" { for_each = var.no_webhook_alert_minutes project = var.project display_name = "webhook-missing/${each.key}" combiner = "OR" # required conditions { display_name = "webhook-missing/${each.key}" condition_monitoring_query_language { duration = "${each.value * 60}s" query = <<-EOT fetch k8s_container | metric 'workload.googleapis.com/prow_webhook_counter' | filter resource.project_id == '${each.key}' | sum | align delta_gauge(1m) | every 1m | value add [hour: end().timestamp_to_string("%H", "America/Los_Angeles").string_to_int64] | value add [day_of_week: end().timestamp_to_string("%u", "America/Los_Angeles").string_to_int64] | value add [is_weekend: if(day_of_week >= 6, 1, 0)] | value add [is_business_hour: if((hour >= 9) && (hour < 17), 1, 0)] | condition val(0) == 0 && is_business_hour * (1-is_weekend) == 1 EOT trigger { count = 1 } } } documentation { content = "${each.key} has received no webhooks for ${each.value} minutes during work hours." mime_type = "text/markdown" } # gcloud beta monitoring channels list --project=oss-prow notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"] } resource "google_monitoring_alert_policy" "KES-Secret-Sync-Error" { project = var.project display_name = "Kubernetes External Secret: Secret-Sync-Error" combiner = "OR" # required conditions { display_name = "Secret-Sync-Error" condition_monitoring_query_language { duration = "0s" query = <<-EOT fetch k8s_container | metric 'workload.googleapis.com/kubernetes_external_secrets_sync_calls_count' | align delta(60s) | filter metric.status != "success" | condition val() > 1.5 EOT trigger { count = 1 } } } documentation { content = "Kubernetes External Secrets has encountered errors while syncing." mime_type = "text/markdown" } # gcloud beta monitoring channels list --project=oss-prow notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"] }