terraform/modules/avs_service_health/main.tf (140 lines of code) (raw):

#############################################################################################
# AVS service health module
# Creates an action group with e-mail receivers, a Service Health activity log alert, and a
# set of metric alerts on the AVS private cloud. An optional, empty telemetry deployment is
# declared at the end of the file.
#############################################################################################

locals {
  # Catalogue of metric alerts. Each entry becomes one azurerm_monitor_metric_alert,
  # keyed by Name. Fields:
  #   Name           - alert rule name and for_each key (must be unique in this list)
  #   Description    - alert rule description
  #   Metric         - metric name in the "microsoft.avs/privateclouds" namespace
  #   SplitDimension - metric dimension the alert is evaluated per value of
  #   Threshold      - alert fires when the average is greater than this value
  #   Severity       - alert severity; the "*Critical" entries use 0, the others use 2
  Alerts = [
    {
      Name           = "CPU"
      Description    = "CPU Usage per Cluster"
      Metric         = "EffectiveCpuAverage"
      SplitDimension = "clustername"
      Threshold      = 80
      Severity       = 2
    },
    {
      Name           = "CPUCritical"
      Description    = "CPU Usage per Cluster - Critical"
      Metric         = "EffectiveCpuAverage"
      SplitDimension = "clustername"
      Threshold      = 95
      Severity       = 0
    },
    {
      Name           = "Memory"
      Description    = "Memory Usage per Cluster"
      Metric         = "UsageAverage"
      SplitDimension = "clustername"
      Threshold      = 80
      Severity       = 2
    },
    {
      Name           = "MemoryCritical"
      Description    = "Memory Usage per Cluster - Critical"
      Metric         = "UsageAverage"
      SplitDimension = "clustername"
      Threshold      = 95
      Severity       = 0
    },
    {
      Name           = "Storage"
      Description    = "Storage Usage per Datastore"
      Metric         = "DiskUsedPercentage"
      SplitDimension = "dsname"
      Threshold      = 70
      Severity       = 2
    },
    {
      Name = "StorageCritical"
      # Suffix added so the critical alert can be told apart from "Storage",
      # matching the CPU and Memory critical descriptions.
      Description    = "Storage Usage per Datastore - Critical"
      Metric         = "DiskUsedPercentage"
      SplitDimension = "dsname"
      Threshold      = 75
      Severity       = 0
    }
  ]

  # Notification addresses, trimmed first and then de-duplicated, so that entries which
  # differ only by surrounding whitespace do not produce two identical receivers.
  action_group_emails = distinct([for address in var.email_addresses : trimspace(address)])

  # Local part (text before "@") of every address; used to detect receiver-name clashes.
  action_group_email_local_parts = [
    for address in local.action_group_emails : trimspace(split("@", address)[0])
  ]
}

# Action group that receives every alert defined in this module.
resource "azurerm_monitor_action_group" "avs_service_health" {
  name                = var.action_group_name
  resource_group_name = var.rg_name
  short_name          = var.action_group_shortname

  # One e-mail receiver per distinct address.
  dynamic "email_receiver" {
    for_each = toset(local.action_group_emails)

    content {
      # The receiver name is the local part of the address. When several addresses share
      # a local part (e.g. ops@a.com and ops@b.com) that would give duplicate receiver
      # names, so those receivers fall back to a name derived from the full address.
      name = (
        length([
          for part in local.action_group_email_local_parts : part
          if part == trimspace(split("@", email_receiver.key)[0])
        ]) > 1
        ? replace(email_receiver.key, "@", "_at_")
        : trimspace(split("@", email_receiver.key)[0])
      )
      email_address = email_receiver.key
    }
  }
}

# Activity log alert for Service Health events on the configured scope.
resource "azurerm_monitor_activity_log_alert" "avs_rg_service_health" {
  name                = var.service_health_alert_name
  resource_group_name = var.rg_name
  scopes              = [var.service_health_alert_scope_id]
  description         = "This alert monitors service health for the AVS SDDC resource group."

  criteria {
    category = "ServiceHealth"

    service_health {
      # NOTE(review): only the "Global" location is listed; confirm whether events for the
      # private cloud's own region should be included as well.
      locations = ["Global"]
    }
  }

  action {
    action_group_id = azurerm_monitor_action_group.avs_service_health.id
  }
}

# One metric alert per entry in local.Alerts, scoped to the AVS private cloud.
resource "azurerm_monitor_metric_alert" "avs_metric_alerts" {
  for_each = { for alert in local.Alerts : alert.Name => alert }

  name                = each.value.Name
  resource_group_name = var.rg_name
  scopes              = [var.private_cloud_id]
  description         = each.value.Description
  severity            = each.value.Severity
  frequency           = "PT5M"
  window_size         = "PT30M"
  enabled             = true
  auto_mitigate       = true

  criteria {
    metric_namespace = "microsoft.avs/privateclouds"
    metric_name      = each.value.Metric
    aggregation      = "Average"
    operator         = "GreaterThan"
    threshold        = each.value.Threshold

    # Wildcard include on the split dimension (cluster name or datastore name).
    dimension {
      name     = each.value.SplitDimension
      operator = "Include"
      values   = ["*"]
    }
  }

  action {
    action_group_id = azurerm_monitor_action_group.avs_service_health.id
  }
}

#############################################################################################
# Telemetry Section - Toggled on and off with the telemetry variable
# This allows us to get deployment frequency statistics for deployments
# Re-using parts of the Core Enterprise Landing Zone methodology
#############################################################################################
locals {
  #create an empty ARM template to use for generating the deployment value
  telem_arm_subscription_template_content = <<TEMPLATE
{
  "$schema": "https://schema.management.azure.com/schemas/2018-05-01/subscriptionDeploymentTemplate.json#",
  "contentVersion": "1.0.0.0",
  "parameters": {},
  "variables": {},
  "resources": [],
  "outputs": {
    "telemetry": {
      "type": "String",
      "value": "For more information, see https://aka.ms/alz/tf/telemetry"
    }
  }
}
TEMPLATE

  module_identifier = lower("avs_service_health")

  # Deployment name: <telemetry guid>.<module identifier, at most 20 chars>.<random suffix>
  telem_arm_deployment_name = "${lower(var.guid_telemetry)}.${substr(local.module_identifier, 0, 20)}.${random_string.telemetry.result}"
}

# Looked up only to obtain the location used for the telemetry deployment.
data "azurerm_resource_group" "deployment" {
  name = var.rg_name
}

#create a random string for uniqueness
resource "random_string" "telemetry" {
  length  = 4
  special = false
  upper   = false
  lower   = true
}

# Empty subscription-level deployment; created only when telemetry is enabled.
resource "azurerm_subscription_template_deployment" "telemetry_core" {
  count = var.module_telemetry_enabled ? 1 : 0

  name             = local.telem_arm_deployment_name
  location         = data.azurerm_resource_group.deployment.location
  template_content = local.telem_arm_subscription_template_content
}