terraform/modules/alerts/main.tf (63 lines of code) (raw):

# Copyright 2022 The TestGrid Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

locals {}

# Alert policy "HostDown": fires when uptime checks in var.project report
# failures for a host.
#
# The MQL query counts, per resource.host and per 1-minute step, the uptime
# check samples whose check_passed value is false; the condition is met when
# that count is greater than 1 and stays met for `duration` (120s).
resource "google_monitoring_alert_policy" "probers" {
  project      = var.project
  provider     = google-beta // To include `condition_monitoring_query_language`
  display_name = "HostDown"
  combiner     = "OR"

  conditions {
    display_name = "Host is unreachable"

    condition_monitoring_query_language {
      duration = "120s"
      query    = <<-EOT
      fetch uptime_url
      | metric 'monitoring.googleapis.com/uptime_check/check_passed'
      | align next_older(1m)
      | filter resource.project_id == '${var.project}'
      | every 1m
      | group_by [resource.host],
          [value_check_passed_not_count_true: count_true(not(value.check_passed))]
      | condition val() > 1 '1'
      EOT

      trigger {
        count = 1
      }
    }
  }

  documentation {
    content   = "Host Down"
    mime_type = "text/markdown"
  }

  # To look up channel ids:
  #   gcloud beta monitoring channels list --project=<project>
  notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"]
}

# Alert policy "pubsub-unack-too-old": one policy per element of
# var.pubsub_topics (each.key is matched against the subscription's topic_id
# system label).
#
# The MQL query takes the 30-minute mean of
# subscription/oldest_unacked_message_age and alerts when it exceeds the
# threshold for `duration` (60s).
resource "google_monitoring_alert_policy" "pubsub-unack-too-old" {
  project  = var.project
  provider = google-beta // To include `condition_monitoring_query_language`

  for_each = var.pubsub_topics

  display_name = "pubsub-unack-too-old/${var.project}/${each.key}"
  combiner     = "OR" # required

  conditions {
    display_name = "pubsub-unack-too-old/${var.project}/${each.key}"

    condition_monitoring_query_language {
      duration = "60s"

      # Threshold is 3 hours, expressed in seconds (1.08e+04 's').
      # The previous literal, 1.08e+07 's', is ~125 days: it equals 3 hours
      # only if read as milliseconds, and is well beyond how long Pub/Sub
      # retains unacknowledged messages, so the condition could never be met.
      # NOTE(review): 3h is inferred from the ms/s mismatch - confirm the
      # intended paging threshold with the on-call owners.
      query = <<-EOT
      fetch pubsub_subscription
      | metric 'pubsub.googleapis.com/subscription/oldest_unacked_message_age'
      | filter
          (metadata.system_labels.topic_id == '${each.key}')
      | group_by 30m,
          [value_oldest_unacked_message_age_mean: mean(value.oldest_unacked_message_age)]
      | every 30m
      | condition val() > 1.08e+04 's'
      EOT

      trigger {
        count = 1
      }
    }
  }

  documentation {
    content   = "${var.project}: TestGrid is not acknowledging PubSub messages in time.\n\nOncall Playbook: http://go/test-infra-playbook"
    mime_type = "text/markdown"
  }

  # To look up channel ids:
  #   gcloud beta monitoring channels list --project=<project>
  notification_channels = ["projects/${var.project}/notificationChannels/${var.notification_channel_id}"]
}