in source/patterns/@aws-solutions-constructs/core/lib/elasticsearch-helper.ts [90:221]
/**
 * Creates the set of CloudWatch alarms for an Amazon Elasticsearch Service
 * domain and returns them in a fixed order (cluster status, storage, write
 * blocks, snapshots, data-node CPU/JVM, master-node CPU/JVM).
 *
 * Every alarm watches a metric in the `AWS/ES` namespace and is created as a
 * child of `scope` under a stable construct ID.
 *
 * NOTE(review): the metrics are created without any dimensions. AWS/ES metrics
 * are normally published per domain (DomainName / ClientId), so confirm with
 * the callers whether these alarms are expected to be scoped to a domain.
 *
 * @param scope - Construct under which all alarms are created.
 * @returns The nine alarms, in the order they are declared below.
 */
export function buildElasticSearchCWAlarms(scope: Construct): cloudwatch.Alarm[] {
  // Shorthand for the two comparison operators used by the table below.
  const atOrAbove = cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD;
  const atOrBelow = cloudwatch.ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD;

  // One entry per alarm; the order of this table is the order of the returned array.
  const alarmSpecs: Array<{
    id: string;
    metricName: string;
    statistic: string;
    periodSeconds: number;
    threshold: number;
    evaluationPeriods: number;
    comparisonOperator: cloudwatch.ComparisonOperator;
    alarmDescription: string;
  }> = [
    // ClusterStatus.red maximum is >= 1 for 1 minute, 1 consecutive time
    {
      id: 'StatusRedAlarm',
      metricName: 'ClusterStatus.red',
      statistic: 'Maximum',
      periodSeconds: 60,
      threshold: 1,
      evaluationPeriods: 1,
      comparisonOperator: atOrAbove,
      alarmDescription: 'At least one primary shard and its replicas are not allocated to a node. '
    },
    // ClusterStatus.yellow maximum is >= 1 for 1 minute, 1 consecutive time
    {
      id: 'StatusYellowAlarm',
      metricName: 'ClusterStatus.yellow',
      statistic: 'Maximum',
      periodSeconds: 60,
      threshold: 1,
      evaluationPeriods: 1,
      comparisonOperator: atOrAbove,
      alarmDescription: 'At least one replica shard is not allocated to a node.'
    },
    // FreeStorageSpace minimum is <= 20480 for 1 minute, 1 consecutive time.
    // The metric is reported in MiB, so 20 GiB is 20480 (not 20000).
    {
      id: 'FreeStorageSpaceTooLowAlarm',
      metricName: 'FreeStorageSpace',
      statistic: 'Minimum',
      periodSeconds: 60,
      threshold: 20480,
      evaluationPeriods: 1,
      comparisonOperator: atOrBelow,
      alarmDescription: 'A node in your cluster is down to 20 GiB of free storage space.'
    },
    // ClusterIndexWritesBlocked maximum is >= 1 for 5 minutes, 1 consecutive time
    {
      id: 'IndexWritesBlockedTooHighAlarm',
      metricName: 'ClusterIndexWritesBlocked',
      statistic: 'Maximum',
      periodSeconds: 300,
      threshold: 1,
      evaluationPeriods: 1,
      comparisonOperator: atOrAbove,
      alarmDescription: 'Your cluster is blocking write requests.'
    },
    // AutomatedSnapshotFailure maximum is >= 1 for 1 minute, 1 consecutive time
    {
      id: 'AutomatedSnapshotFailureTooHighAlarm',
      metricName: 'AutomatedSnapshotFailure',
      statistic: 'Maximum',
      periodSeconds: 60,
      threshold: 1,
      evaluationPeriods: 1,
      comparisonOperator: atOrAbove,
      alarmDescription: 'An automated snapshot failed. This failure is often the result of a red cluster health status.'
    },
    // CPUUtilization average is >= 80% for 15 minutes, 3 consecutive times
    {
      id: 'CPUUtilizationTooHighAlarm',
      metricName: 'CPUUtilization',
      statistic: 'Average',
      periodSeconds: 900,
      threshold: 80,
      evaluationPeriods: 3,
      comparisonOperator: atOrAbove,
      alarmDescription: '100% CPU utilization is not uncommon, but sustained high usage is problematic. Consider using larger instance types or adding instances.'
    },
    // JVMMemoryPressure average is >= 80% for 15 minutes, 1 consecutive time
    {
      id: 'JVMMemoryPressureTooHighAlarm',
      metricName: 'JVMMemoryPressure',
      statistic: 'Average',
      periodSeconds: 900,
      threshold: 80,
      evaluationPeriods: 1,
      comparisonOperator: atOrAbove,
      alarmDescription: 'Average JVM memory pressure over last 15 minutes too high. Consider scaling vertically.'
    },
    // MasterCPUUtilization average is >= 50% for 15 minutes, 3 consecutive times
    {
      id: 'MasterCPUUtilizationTooHighAlarm',
      metricName: 'MasterCPUUtilization',
      statistic: 'Average',
      periodSeconds: 900,
      threshold: 50,
      evaluationPeriods: 3,
      comparisonOperator: atOrAbove,
      alarmDescription: 'Average CPU utilization over last 45 minutes too high. Consider using larger instance types for your dedicated master nodes.'
    },
    // MasterJVMMemoryPressure average is >= 80% for 15 minutes, 1 consecutive time.
    // 80 matches the documented recommendation; 50 is the master *CPU* threshold.
    {
      id: 'MasterJVMMemoryPressureTooHighAlarm',
      metricName: 'MasterJVMMemoryPressure',
      statistic: 'Average',
      periodSeconds: 900,
      threshold: 80,
      evaluationPeriods: 1,
      comparisonOperator: atOrAbove,
      alarmDescription: 'Average JVM memory pressure over last 15 minutes too high. Consider scaling vertically.'
    },
  ];

  // Setup CW Alarms for ES: build each metric, then its alarm, in table order.
  return alarmSpecs.map((spec) => new cloudwatch.Alarm(scope, spec.id, {
    metric: new cloudwatch.Metric({
      namespace: 'AWS/ES',
      metricName: spec.metricName,
      statistic: spec.statistic,
      period: cdk.Duration.seconds(spec.periodSeconds),
    }),
    threshold: spec.threshold,
    evaluationPeriods: spec.evaluationPeriods,
    comparisonOperator: spec.comparisonOperator,
    alarmDescription: spec.alarmDescription
  }));
}