cdk/lib/monitoring.ts (100 lines of code) (raw):
import type { GuStackProps } from '@guardian/cdk/lib/constructs/core';
import { GuStack } from '@guardian/cdk/lib/constructs/core';
import { GuLambdaFunction } from '@guardian/cdk/lib/constructs/lambda';
import type { App } from 'aws-cdk-lib';
import { Duration } from 'aws-cdk-lib';
import type {
IAlarmAction} from 'aws-cdk-lib/aws-cloudwatch';
import {
Alarm,
ComparisonOperator,
Metric,
TreatMissingData,
Unit
} from 'aws-cdk-lib/aws-cloudwatch';
import { SnsAction } from 'aws-cdk-lib/aws-cloudwatch-actions';
import { Rule, RuleTargetInput, Schedule } from 'aws-cdk-lib/aws-events';
import { LambdaFunction } from 'aws-cdk-lib/aws-events-targets';
import { Effect, PolicyStatement } from 'aws-cdk-lib/aws-iam';
import { Runtime, RuntimeManagementMode } from 'aws-cdk-lib/aws-lambda';
import { Topic } from 'aws-cdk-lib/aws-sns';
import { EmailSubscription } from 'aws-cdk-lib/aws-sns-subscriptions';
/**
 * Stack that deploys the `cmp-monitoring` lambda together with its schedule
 * and alerting.
 *
 * Resources defined here:
 *  - the `cmp-monitoring-<stage>` lambda, pinned to a fixed runtime version
 *    and allowed to call `cloudwatch:PutMetricData`;
 *  - a schedule rule that invokes the lambda (every 2 minutes in PROD, once a
 *    day for any other stage);
 *  - a CloudWatch alarm on the lambda's error metric;
 *  - in PROD only, an SNS topic with an email subscription that is notified
 *    when the alarm enters and leaves the ALARM state.
 */
export class Monitoring extends GuStack {
  /**
   * @param scope - the CDK app that owns this stack
   * @param id - construct id of the stack
   * @param props - stack props; `props.env.region` selects the region used in
   *   resource names and the runtime ARN (falls back to `eu-west-1`)
   */
  constructor(scope: App, id: string, props: GuStackProps) {
    super(scope, id, props);

    const stage = this.stage;
    const isProd = stage === 'PROD';
    const region = props.env?.region ?? 'eu-west-1';

    const lambdaBaseName = 'cmp-monitoring';

    // Interval, in minutes, between PROD invocations. It is also used as the
    // period of the error metric so that one datapoint matches one PROD run.
    const prodDurationInMinutes = 2;

    // The lambda runtime is pinned to this exact runtime version rather than
    // following automatic runtime updates.
    const runTimeId =
      '0cdcfbdefbc5e7d3343f73c2e2dd3cba17d61dea0686b404502a0c9ce83931b9';
    const runTimeManagementArn = `arn:aws:lambda:${region}::runtime:${runTimeId}`;

    // Lets the lambda publish metric data to CloudWatch.
    const policyStatement = new PolicyStatement({
      effect: Effect.ALLOW,
      actions: ['cloudwatch:PutMetricData'],
      resources: ['*'],
    });

    const monitoringLambdaFunction = new GuLambdaFunction(
      this,
      lambdaBaseName,
      {
        app: `${lambdaBaseName}-lambda-${region}`,
        functionName: `${lambdaBaseName}-${stage}`,
        fileName: `${lambdaBaseName}-lambda.zip`,
        handler: 'index.handler',
        runtime: Runtime.NODEJS_18_X,
        runtimeManagementMode:
          RuntimeManagementMode.manual(runTimeManagementArn),
        timeout: Duration.seconds(300),
        memorySize: 2048,
        initialPolicy: [policyStatement],
      },
    );

    // Descriptor for the custom `Application/CmpLoadingTime` metric.
    // NOTE(review): this object is not attached to any alarm or dashboard, so
    // it does not contribute to the synthesised template. Presumably it is the
    // metric the lambda publishes via PutMetricData - confirm before wiring it
    // into an alarm or removing it.
    new Metric({
      namespace: 'Application',
      metricName: 'CmpLoadingTime',
      period: Duration.minutes(1),
      region: region,
      unit: Unit.SECONDS,
    });

    // Lambda error metric, aggregated over the PROD invocation interval
    // (2 minutes).
    const errorMetric = monitoringLambdaFunction.metricErrors({
      period: Duration.minutes(prodDurationInMinutes),
    });

    const lambdaEventTarget = new LambdaFunction(monitoringLambdaFunction, {
      event: RuleTargetInput.fromObject({
        stage: 'PROD', // Both scheduled cmp-monitoring-CODE and cmp-monitoring-PROD are monitoring prod versions
        region: region,
      }),
    });

    // Every 2 minutes for PROD; once a day for every other stage.
    const monitoringDuration: Duration = isProd
      ? Duration.minutes(prodDurationInMinutes)
      : Duration.days(1);

    new Rule(this, 'cmp monitoring schedule', {
      schedule: Schedule.rate(monitoringDuration),
      targets: [lambdaEventTarget],
    });

    // Error alarm
    const alarm = new Alarm(this, 'cmp-monitoring-alarms', {
      comparisonOperator:
        ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
      threshold: 1,
      // Number of metric periods evaluated. With a 2-minute period this is a
      // 10-minute window, i.e. 5 PROD executions of the lambda.
      evaluationPeriods: 5,
      actionsEnabled: true,
      // Number of breaching datapoints within the window that triggers the
      // alarm: 4 out of 5.
      datapointsToAlarm: 4,
      treatMissingData: TreatMissingData.NOT_BREACHING,
      metric: errorMetric,
      alarmName: `CMP Monitoring - ${stage} - ${region}`,
      alarmDescription: `This alarm is triggered if 4 out of 5 lambda executions fail in ${region}`,
    });

    // Only PROD notifies people: the alarm exists in every stage, but the
    // email topic and the alarm/OK actions are created for PROD alone.
    if (isProd) {
      const emailSubscription = new EmailSubscription(
        'transparency.and.consent@guardian.co.uk',
      );

      const internalEmailMessaging = new Topic(this, 'internalEmailRecipient');
      internalEmailMessaging.addSubscription(emailSubscription);

      const alarmAction: IAlarmAction = new SnsAction(internalEmailMessaging);
      alarm.addAlarmAction(alarmAction);
      alarm.addOkAction(alarmAction);
    }
  }
}