hasher-matcher-actioner/terraform/dashboard/main.tf (274 lines of code) (raw):
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Looks up the AWS region; `data.aws_region.current.name` is used as the
# `region` of the metric widgets defined in this file.
data "aws_region" "current" {}
locals {
# Text widget holding the heading of the "at a glance" section
# (24 units wide, 1 unit tall).
title_glance = jsonencode({
  type   = "text"
  width  = 24
  height = 1
  properties = {
    markdown = "## Hashing, Matching & Actioning at a Glance"
  }
})
# Text widget holding the heading of the DynamoDB section
# (24 units wide, 1 unit tall).
title_dynamodb = jsonencode({
  type   = "text"
  width  = 24
  height = 1
  properties = {
    markdown = "# DynamoDB"
  }
})
# Text widget holding the heading of the API section
# (24 units wide, 1 unit tall).
title_api = jsonencode({
  type   = "text"
  width  = 24
  height = 1
  properties = {
    markdown = "# API"
  }
})
# Text widget holding the heading of the system capacity section
# (24 units wide, 1 unit tall).
title_system_capacity = jsonencode({
  type   = "text"
  width  = 24
  height = 1
  properties = {
    markdown = "# System Capacity"
  }
})
# Text widget holding the heading of the submit-events section
# (24 units wide, 1 unit tall).
title_submit_event = jsonencode({
  type   = "text"
  width  = 24
  height = 1
  properties = {
    markdown = "# Submit Events (Submissions over SNS)"
  }
})
# One widget per entry of var.pipeline_lambdas, rendered from lambda_widget.tpl.
# Each entry is indexed as [0] = widget title, [1] = lambda function name.
pipeline_lambdas_widgets = [
  for entry in var.pipeline_lambdas : templatefile("${path.module}/lambda_widget.tpl", {
    lambda_title = entry[0]
    lambda_name  = entry[1]
    region       = data.aws_region.current.name
  })
]
# One SQS metric widget per entry of var.queues_to_monitor. Each entry is
# indexed as [0] = display title, [1] = queue name, [2] = queue name used for
# the "dlq-*" labelled series.
queues_to_monitor_items = [
  for q in var.queues_to_monitor : jsonencode({
    type  = "metric"
    width = 6
    properties = {
      title  = "${q[0]}: Age of Oldest Item"
      region = data.aws_region.current.name
      stat   = "Maximum"
      period = 60
      metrics = [
        # Queue at index 1: visible-message count (averaged), then the age of
        # the oldest message plotted on the right axis.
        ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", q[1], { stat = "Average" }],
        [".", "ApproximateAgeOfOldestMessage", ".", ".", { yAxis = "right" }],
        # Same pair of metrics for the queue at index 2, with "dlq-*" labels.
        ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", q[2], { label = "dlq-approx-count", stat = "Average" }],
        [".", "ApproximateAgeOfOldestMessage", ".", ".", { label = "dlq-approx-age", yAxis = "right" }],
      ]
    }
  })
]
# Function names of every pipeline lambda (index 1 of each var.pipeline_lambdas
# entry) followed by var.other_lambdas, flattened into a single list.
all_lambda_names = flatten([
  [for entry in var.pipeline_lambdas : entry[1]],
  var.other_lambdas,
])
# Stacked chart of ConcurrentExecutions with one series per function name in
# local.all_lambda_names.
total_concurrent_lambda = jsonencode({
  type  = "metric"
  width = 12
  properties = {
    title   = "Total Concurrent λ Executions"
    region  = data.aws_region.current.name
    period  = 60
    stat    = "Maximum"
    stacked = true
    metrics = [
      for fn in local.all_lambda_names :
      ["AWS/Lambda", "ConcurrentExecutions", "FunctionName", fn]
    ]
  }
})
# Invocations, errors, concurrency and throttles of the lambda named by
# var.api_lambda_name.
api_lambda_widget = jsonencode({
  type   = "metric"
  width  = 6
  height = 6
  properties = {
    title   = "API Requests & λ Concurrency "
    view    = "timeSeries"
    region  = data.aws_region.current.name
    period  = 60
    stat    = "Sum"
    stacked = false
    metrics = [
      ["AWS/Lambda", "Invocations", "FunctionName", var.api_lambda_name, { label = "Requests" }],
      [".", "Errors", ".", ".", { yAxis = "left", color = "#d62728" }],
      # Concurrency overrides the widget-wide "Sum" stat and uses the right axis.
      [".", "ConcurrentExecutions", ".", ".", { yAxis = "right", stat = "Maximum" }],
      [".", "Throttles", ".", ".", { label = "Throttles", color = "#ff9896" }],
    ]
  }
})
# Same layout as api_lambda_widget, but for the lambda named by
# var.auth_lambda_name. Kept as its own definition (rather than a shared .tpl
# file) because the metrics of interest for the two may soon diverge.
api_auth_lambda_widget = jsonencode({
  type   = "metric"
  width  = 6
  height = 6
  properties = {
    title   = "Auth Requests & λ Concurrency "
    view    = "timeSeries"
    region  = data.aws_region.current.name
    period  = 60
    stat    = "Sum"
    stacked = false
    metrics = [
      ["AWS/Lambda", "Invocations", "FunctionName", var.auth_lambda_name, { label = "Requests" }],
      [".", "Errors", ".", ".", { yAxis = "left", color = "#d62728" }],
      # Concurrency overrides the widget-wide "Sum" stat and uses the right axis.
      [".", "ConcurrentExecutions", ".", ".", { yAxis = "right", stat = "Maximum" }],
      [".", "Throttles", ".", ".", { label = "Throttles", color = "#ff9896" }],
    ]
  }
})
# Duration of the lambda named by var.api_lambda_name, plotted twice on the
# right axis: once as p90 and once as the average.
api_response_times = jsonencode({
  type   = "metric"
  width  = 6
  height = 6
  properties = {
    title   = "API Response Times (p90 and avg)"
    view    = "timeSeries"
    region  = data.aws_region.current.name
    period  = 60
    stat    = "Sum"
    stacked = false
    metrics = [
      # Both series set their own `stat`, overriding the widget-wide "Sum".
      ["AWS/Lambda", "Duration", "FunctionName", var.api_lambda_name, { yAxis = "right", stat = "p90", label = "p90" }],
      ["...", { yAxis = "right", stat = "Average", label = "Av", color = "#ff7f0e" }],
    ]
  }
})
# API Gateway request count, plus the 4xx and 5xx shares of it as percentages
# on a right axis fixed to 0-100.
#
# NOTE(review): the ApiId below is a hard-coded literal, so this widget only
# ever charts that one API Gateway. Consider sourcing it from a module input.
api_gateway_widget = jsonencode({
  type  = "metric"
  width = 6
  properties = {
    title   = "API Request Volume and %age 4xx, 5xx"
    view    = "timeSeries"
    region  = data.aws_region.current.name
    period  = 60
    stat    = "Sum"
    stacked = false
    yAxis = {
      right = {
        max = 100
        min = 0
      }
    }
    metrics = [
      # Derived series: error counts (m1, m2) as a percentage of requests (m3).
      [{ id = "e1", expression = "100*(m1/m3)", label = "4xx %age", yAxis = "right", color = "#ff7f0e" }],
      [{ id = "e2", expression = "100*(m2/m3)", label = "5xx %age", yAxis = "right", color = "#d62728" }],
      # Raw error counts feed the expressions above and are hidden themselves.
      ["AWS/ApiGateway", "4xx", "Stage", "$default", "ApiId", "mc620fy2hf", { id = "m1", visible = false }],
      [".", "5xx", ".", ".", ".", ".", { id = "m2", visible = false, color = "#d62728" }],
      [".", "Count", ".", ".", ".", ".", { id = "m3", label = "Requests" }],
    ]
  }
})
# Consumed read/write capacity units of the table named by var.datastore.name,
# for the table itself and for its "GSI-1" and "GSI-2" global secondary indexes.
#
# `period` is set only inside `properties`, next to the other metric options;
# the widget object itself carries just its type, size and properties.
dynamodb_datastore_rwcu_widget = jsonencode({
  type  = "metric"
  width = 8
  properties = {
    title  = "Read/Write Capacity Units Utilized"
    region = data.aws_region.current.name
    period = 60
    stat   = "Sum"
    metrics = [
      # Primary table.
      ["AWS/DynamoDB", "ConsumedReadCapacityUnits", "TableName", var.datastore.name, { yAxis = "left", label = "Primary RCUs" }],
      [".", "ConsumedWriteCapacityUnits", ".", ".", { label = "Primary WCUs", color = "#aec7e8" }],
      # GSI-2: adds the GlobalSecondaryIndexName dimension.
      [".", "ConsumedReadCapacityUnits", ".", ".", "GlobalSecondaryIndexName", "GSI-2", { yAxis = "left", label = "GSI-2 RCUs" }],
      [".", "ConsumedWriteCapacityUnits", ".", ".", ".", ".", { label = "GSI-2 WCUs", color = "#98df8a" }],
      # GSI-1: same dimensions, different index name.
      [".", "ConsumedReadCapacityUnits", ".", ".", ".", "GSI-1", { yAxis = "left", label = "GSI-1 RCUs" }],
      [".", "ConsumedWriteCapacityUnits", ".", ".", ".", ".", { label = "GSI-1 WCUs", color = "#c5b0d5" }],
    ]
  }
})
# Throttled requests per write operation, plus transaction conflicts and write
# throttle events (right axis), for the table named by var.datastore.name.
dynamodb_datastore_errors_widget = jsonencode({
  type  = "metric"
  width = 8
  properties = {
    title  = "Errors, Throttles & Conflicts"
    region = data.aws_region.current.name
    period = 60
    stat   = "Sum"
    metrics = [
      # ThrottledRequests, one series per Operation value.
      ["AWS/DynamoDB", "ThrottledRequests", "TableName", var.datastore.name, "Operation", "PutItem", { label = "Throttled Puts" }],
      ["...", "UpdateItem", { label = "Throttled Updates", color = "#17becf" }],
      ["...", "TransactWriteItems", { label = "Throttled Batch Writes", color = "#bcbd22" }],
      # Table-level metrics (no Operation dimension), on the right axis.
      [".", "TransactionConflict", ".", ".", { yAxis = "right" }],
      [".", "WriteThrottleEvents", ".", ".", { yAxis = "right" }],
    ]
  }
})
# Whether the submit-event widgets are added to the dashboard: true only when
# var.submit_event_lambda_name is set (non-null).
add_submit_event_widgets = var.submit_event_lambda_name != null
# Invocations, errors, concurrency and throttles of the lambda named by
# var.submit_event_lambda_name.
submit_event_lambda_widget = jsonencode({
  type   = "metric"
  width  = 6
  height = 6
  properties = {
    title   = "Submit Event & λ Concurrency "
    view    = "timeSeries"
    region  = data.aws_region.current.name
    period  = 60
    stat    = "Sum"
    stacked = false
    metrics = [
      ["AWS/Lambda", "Invocations", "FunctionName", var.submit_event_lambda_name, { label = "SubmitEvents" }],
      [".", "Errors", ".", ".", { yAxis = "left", color = "#d62728" }],
      # Concurrency overrides the widget-wide "Sum" stat and uses the right axis.
      [".", "ConcurrentExecutions", ".", ".", { yAxis = "right", stat = "Maximum" }],
      [".", "Throttles", ".", ".", { label = "Throttles", color = "#ff9896" }],
    ]
  }
})
# SQS widget for var.submit_event_queue, indexed as [0] = display title,
# [1] = queue name, [2] = queue name used for the "dlq-*" labelled series.
# Evaluates to an empty JSON object when the submit-event widgets are disabled.
submit_queue_widge = local.add_submit_event_widgets ? jsonencode({
  type  = "metric"
  width = 6
  properties = {
    title  = "${var.submit_event_queue[0]}: Age of Oldest Item"
    region = data.aws_region.current.name
    stat   = "Maximum"
    period = 60
    metrics = [
      # Queue at index 1: visible-message count (averaged), then the age of
      # the oldest message plotted on the right axis.
      ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", var.submit_event_queue[1], { stat = "Average" }],
      [".", "ApproximateAgeOfOldestMessage", ".", ".", { yAxis = "right" }],
      # Same pair of metrics for the queue at index 2, with "dlq-*" labels.
      ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", var.submit_event_queue[2], { label = "dlq-approx-count", stat = "Average" }],
      [".", "ApproximateAgeOfOldestMessage", ".", ".", { label = "dlq-approx-age", yAxis = "right" }],
    ]
  }
}) : jsonencode({})
# Full dashboard JSON document: {"widgets": [ ... ]}.
#
# Every widget local is already a JSON-encoded string, so the widget strings
# are collected into one list and joined with commas exactly once. Building the
# list with concat() means an empty var-driven group (no pipeline lambdas, no
# monitored queues, submit-event widgets disabled) simply contributes no
# elements, and the output never contains a dangling or doubled comma.
dashboard_body = format(
  "{\"widgets\": [\n%s\n]}",
  join(",\n", concat(
    # "At a glance" section.
    [local.title_glance],
    local.pipeline_lambdas_widgets,
    local.queues_to_monitor_items,
    # API, DynamoDB and system capacity sections.
    [
      local.title_api,
      local.api_auth_lambda_widget,
      local.api_lambda_widget,
      local.api_response_times,
      local.api_gateway_widget,
      local.title_dynamodb,
      local.dynamodb_datastore_rwcu_widget,
      local.dynamodb_datastore_errors_widget,
      local.title_system_capacity,
      local.total_concurrent_lambda,
    ],
    # Submit-event section, only when enabled.
    local.add_submit_event_widgets ? [
      local.title_submit_event,
      local.submit_event_lambda_widget,
      local.submit_queue_widge,
    ] : [],
  ))
)
}
resource "aws_cloudwatch_dashboard" "basic_dashboard" {
  # Terraform converts numbers to strings when they are put in a list, so strip
  # the quotes around purely numeric strings before publishing:
  # {"value": "123"} becomes {"value": 123}.
  dashboard_body = replace(
    local.dashboard_body,
    "/\"([0-9]+)\"/",
    "$1",
  )

  dashboard_name = var.name
}