in cdk/lib/elastic-search-monitor.ts [19:182]
/**
 * Builds the Elasticsearch monitoring stack:
 *  - a scheduled lambda (run every minute) with the extra IAM permissions it
 *    is granted below, attached to the VPC and cluster security group;
 *  - an SNS topic with an email subscription for Elasticsearch alerts;
 *  - CloudWatch alarms on the cluster metrics (node counts, cluster status,
 *    JVM heap usage and available disk space), all notifying that topic.
 *
 * @param scope - the CDK app this stack belongs to
 * @param id - the construct id of the stack
 * @param props - standard Guardian stack properties
 */
constructor(scope: App, id: string, props: GuStackProps) {
  super(scope, id, props);

  // Single source of truth for the cluster name. It is handed to the lambda
  // (CLUSTER_NAME) and used for the namespace/dimension the alarms read from;
  // presumably the lambda publishes its metrics under this name, so the two
  // must stay in step.
  const clusterName = "elk";

  // Runbook shared by the alarm descriptions below.
  const runbookUrl =
    "https://docs.google.com/document/d/1PuEvL7L-CTV72Jx4OmiB3y5hlMmJR7Xz-YxYWAMiGdY";

  const clusterSecurityGroupId = new CfnParameter(
    this,
    "ClusterSecurityGroup",
    {
      type: "AWS::EC2::SecurityGroup::Id",
      description:
        "A security group allowing the lambda to connect to Elasticsearch on port 9200 over TCP",
    }
  );

  // Extra permissions granted to the lambda's execution role.
  const networkInterfacePolicy = new PolicyStatement({
    actions: [
      "ec2:CreateNetworkInterface",
      "ec2:DescribeNetworkInterfaces",
      "ec2:DeleteNetworkInterface",
    ],
    resources: ["*"],
  });
  const ec2DetectionPolicy = new PolicyStatement({
    actions: ["ec2:DescribeInstances"],
    resources: ["*"],
  });
  const pushMetricsPolicy = new PolicyStatement({
    actions: ["cloudwatch:PutMetricData"],
    resources: ["*"],
  });
  const additionalPolicies: PolicyStatement[] = [
    networkInterfacePolicy,
    ec2DetectionPolicy,
    pushMetricsPolicy,
  ];

  const scheduledLambda = new GuScheduledLambda(this, "ScheduledLambda", {
    app,
    fileName: "elastic-search-monitor.jar",
    environment: {
      TAG_QUERY_APP: "elk-es-master",
      TAG_QUERY_STACK: "deploy",
      CLUSTER_NAME: clusterName,
    },
    description:
      "Monitors your Elasticsearch cluster and reports metrics to CloudWatch",
    handler: "com.gu.elasticsearchmonitor.Lambda::handler",
    monitoringConfiguration: {
      toleratedErrorPercentage: 99,
      numberOfEvaluationPeriodsAboveThresholdBeforeAlarm: 30,
      snsTopicName: "devx-alerts",
    },
    rules: [{ schedule: Schedule.rate(Duration.minutes(1)) }],
    runtime: Runtime.JAVA_11,
    // This lambda needs access to the Deploy Tools VPC so that it can talk to Prism
    vpc: GuVpc.fromIdParameter(this, "vpc"),
    vpcSubnets: {
      subnets: GuVpc.subnetsFromParameter(this),
    },
    securityGroups: [
      SecurityGroup.fromSecurityGroupId(
        this,
        "ElasticsearchClusterSecurityGroup",
        clusterSecurityGroupId.valueAsString
      ),
    ],
  });

  // Attach each statement for its side effect only; there is no result to collect.
  for (const policy of additionalPolicies) {
    scheduledLambda.addToRolePolicy(policy);
  }

  const topicForElasticsearchAlerts = new Topic(this, "ElkAlertChannel", {
    displayName: `ELK alert channel for ${this.stage}`,
    topicName: `elk-alerts-${this.stage}`,
  });
  topicForElasticsearchAlerts.addSubscription(
    new EmailSubscription("devx.sec.ops@guardian.co.uk")
  );

  // Builds a reference to one of the cluster's metrics by name.
  const metric = (metricName: string) =>
    new Metric({
      metricName,
      namespace: `${this.stack}/${clusterName}`,
      dimensionsMap: {
        Cluster: clusterName,
      },
    });

  // Settings shared by every alarm: one-minute periods, notifying the ELK topic.
  const commonAlarmProps = {
    app,
    snsTopicName: topicForElasticsearchAlerts.topicName,
    period: Duration.minutes(1),
  };
  // "Too low" alarms compare the period's minimum against the threshold.
  const lessThanAlarmProps = {
    ...commonAlarmProps,
    comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD,
    statistic: "Minimum",
  };
  // "Too high" alarms compare the period's maximum against the threshold.
  const greaterThanAlarmProps = {
    ...commonAlarmProps,
    comparisonOperator: ComparisonOperator.GREATER_THAN_THRESHOLD,
    statistic: "Maximum",
  };

  // Fewer than 5 data nodes for 2 consecutive minutes.
  new GuAlarm(this, "DataNodeCountAlarm", {
    ...lessThanAlarmProps,
    alarmDescription: `Unexpected count of data nodes in ${this.stage}. Runbook: ${runbookUrl}`,
    evaluationPeriods: 2,
    metric: metric("NumberOfDataNodes"),
    threshold: 5,
  });

  // Fewer than 3 responding master nodes for 2 consecutive minutes.
  new GuAlarm(this, "MasterNodeCountAlarm", {
    ...lessThanAlarmProps,
    alarmDescription: `Unexpected count of master nodes in ${this.stage}. Runbook: ${runbookUrl}`,
    evaluationPeriods: 2,
    metric: metric("NumberOfRespondingMastersNodes"),
    threshold: 3,
  });

  // Any non-green status (value above 0) sustained for 30 minutes.
  new GuAlarm(this, "ClusterStatusAlarm", {
    ...greaterThanAlarmProps,
    alarmDescription: `Unexpected cluster status in ${this.stage}. See cloudwatch metric value for current cluster status (Green = 0, Yellow = 1, Red = 2). Runbook: ${runbookUrl}`,
    evaluationPeriods: 30, // Tolerate yellow status for 30 mins before sending an alert
    metric: metric("Status"),
    // Green = 0, Yellow = 1, Red = 2
    threshold: 0,
  });

  // Red status (value above 1) alerts after a single one-minute period.
  new GuAlarm(this, "RedClusterStatusAlarm", {
    ...greaterThanAlarmProps,
    alarmDescription: `Red cluster status in ${this.stage}. See cloudwatch metric value for current cluster status (Green = 0, Yellow = 1, Red = 2). Runbook: ${runbookUrl}`,
    evaluationPeriods: 1,
    metric: metric("Status"),
    // Green = 0, Yellow = 1, Red = 2
    threshold: 1,
  });

  // Heap usage metric above 85 for 2 consecutive minutes.
  new GuAlarm(this, "DataNodeJvmHeapUsageAlarm", {
    ...greaterThanAlarmProps,
    alarmDescription: `A data node is using too much of its heap in ${this.stage}`,
    evaluationPeriods: 2,
    metric: metric("MaxJvmHeapUsage"),
    threshold: 85,
  });

  // The alarm below fires when *available* space drops under 20% of the total,
  // so the description states that condition. The lines are joined explicitly
  // so that source formatting can never leak whitespace into the message.
  const lowStorageDescription = [
    "A data node has less than 20% of its disk space available.",
    "For more context and troubleshooting instructions, see the runbook:",
    `${runbookUrl}/edit#heading=h.8h00c65wqmv0`,
  ].join("\n");

  // See: https://aws.amazon.com/ec2/instance-types/i3en/ and https://www.google.com/search?q=5000gb+in+gib&oq=5000gb+in+gib
  const totalStorage = Size.gibibytes(4657);
  const twentyPercentDiskSpaceInBytes = Math.round(
    totalStorage.toBytes() * 0.2
  );

  // Minimum available disk space under 20% of total for 2 consecutive minutes.
  new GuAlarm(this, "DataNodeLowStorageAlarm", {
    ...lessThanAlarmProps,
    alarmDescription: lowStorageDescription,
    evaluationPeriods: 2,
    metric: metric("MinAvailableDiskSpace"),
    threshold: twentyPercentDiskSpaceInBytes,
  });
}