cloudformation.yaml (521 lines of code) (raw):
# CloudFormation template format version (the only valid value).
AWSTemplateFormatVersion: '2010-09-09'
# Stack description; folded so the long sentence stays readable in source.
Description: >-
  Step Function for rotating ElasticSearch nodes (regardless of their Stage,
  i.e. the INFRA/PROD instance of this stack will rotate oldest ES instances
  belonging to any Stage, assuming the instances have the
  `RotateWithElasticsearchNodeRotation` tag set to true)
# Deploy-time inputs. Stack/App/Stage and the two DeployS3* values identify
# the deployment and its Lambda artifact; the rest tune the rotation itself.
Parameters:
  Stack:
    Description: >-
      Stack name defined in riff-raff.yaml to support per-account continuous
      deployment of this project. Not required if the template will be updated
      manually.
    Type: String
  App:
    Description: >-
      App name defined in riff-raff.yaml to support per-account continuous
      deployment of this project
    Type: String
    Default: elasticsearch-node-rotation
  Stage:
    Description: >-
      Stage name for Riff-Raff deploys to support per-account continuous
      deployment of this project
    Type: String
    Default: INFRA
  # Location of the .zip artifact shared by every Lambda in this template.
  DeployS3Bucket:
    Description: 'Bucket which contains .zip file used by lambda functions e.g. deploy-tools-dist.'
    Type: String
  DeployS3Key:
    Description: 'Key for .zip file used by lambda functions e.g. <stack>/<stage>/<app>/<app>.zip.'
    Type: String
  # Used as the ScheduleExpression of NodeRotationSchedule.
  RotationCronExpression:
    Description: 'Cron expression which determines how often node rotation occurs.'
    Type: String
  # Passed to the state machine as "ageThresholdInDays" in the schedule input.
  AgeThresholdInDays:
    Description: >-
      The number of days old an instance must be before it will be rotated.
      This avoids excessively rotating instances.
    Type: Number
    Default: 7
  # Leaving this empty makes the ShouldCreateSsmOutputBucket condition true.
  SsmOutputBucketName:
    Description: >-
      OPTIONAL! Bucket used to store SSM command output. The instances which
      receive SSM commands must have PutObject permissions for this bucket. If
      this is not provided a bucket will be created as part of this stack.
    Type: String
    Default: ''
  # Topic *name* (not ARN); the alarm builds the ARN from region and account.
  SNSTopicForAlerts:
    Description: >-
      The name of the SNS topic used for alerting in the case of a failed
      rotation attempt.
    Type: String
  # Substituted into the ClusterSizeCheck state's Retry.MaxAttempts.
  ClusterSizeCheckAttempts:
    Description: >-
      The number of attempts to make when running the cluster size check before
      failing. See ClusterSizeCheck.IntervalSeconds to calculate the duration
      this will represent.
    Type: Number
    Default: 20
Conditions:
  # True when SsmOutputBucketName was left empty, i.e. the stack must create
  # its own bucket for SSM command output.
  ShouldCreateSsmOutputBucket: !Equals [!Ref SsmOutputBucketName, '']
Resources:
# Bucket for SSM command output. Only created when SsmOutputBucketName is left
# empty (ShouldCreateSsmOutputBucket); otherwise the supplied bucket is used.
NodeRotationSsmOutputBucket:
  Type: AWS::S3::Bucket
  Condition: ShouldCreateSsmOutputBucket
  Properties:
    # Encrypt at rest with SSE-KMS (no key is named here, so the account's
    # default S3 KMS key applies).
    BucketEncryption:
      ServerSideEncryptionConfiguration:
        - ServerSideEncryptionByDefault:
            SSEAlgorithm: aws:kms
    # Block every form of public access.
    PublicAccessBlockConfiguration:
      BlockPublicAcls: true
      IgnorePublicAcls: true
      BlockPublicPolicy: true
      RestrictPublicBuckets: true
    LifecycleConfiguration:
      Rules:
        # Objects are deleted after 14 days. The former 1-day transition to
        # GLACIER has been removed: Glacier bills a 90-day minimum storage
        # duration plus per-object overhead, so archiving objects that expire
        # 13 days later only added cost, and archived objects cannot be read
        # with a plain s3:GetObject (which is all the Lambda role is granted).
        - Id: RetentionRule
          Status: Enabled
          ExpirationInDays: 14
    Tags:
      - Key: Stack
        Value: !Ref Stack
      - Key: Stage
        Value: !Ref Stage
# Publishes the name of the SSM output bucket under a fixed parameter path.
# The value is the bucket created by this stack when one is created, otherwise
# the bucket name passed in via SsmOutputBucketName.
NodeRotationBucketSsmParameter:
  Type: AWS::SSM::Parameter
  Properties:
    Type: String
    Name: /account/services/node-rotation-ssm-output-bucket-name
    Value: !If
      - ShouldCreateSsmOutputBucket
      - !Ref NodeRotationSsmOutputBucket
      - !Ref SsmOutputBucketName
# Execution role shared by every Lambda function in this template.
# Each policy document now declares Version 2012-10-17 explicitly; without it
# IAM falls back to the legacy 2008-10-17 policy language. This also matches
# the other two roles in this template.
NodeRotationLambdaRole:
  Type: AWS::IAM::Role
  Properties:
    RoleName: !Sub ${Stack}-NodeRotation-${Stage}
    # Trust policy: only the Lambda service may assume this role.
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service:
              - lambda.amazonaws.com
          Action:
            - sts:AssumeRole
    Path: /
    Policies:
      # CloudWatch Logs access plus Lambda invocation.
      - PolicyName: LambdaPolicy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
                - logs:CreateLogGroup
                - logs:CreateLogStream
                - logs:PutLogEvents
                - lambda:InvokeFunction
              Resource: "*"
      # Read-only discovery is unrestricted; the mutating Auto Scaling calls
      # are limited to groups tagged RotateWithElasticsearchNodeRotation=true.
      - PolicyName: ElasticsearchAdminAsgPolicy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
                - autoscaling:DescribeAutoScalingGroups
                - ec2:DescribeInstances
              Resource: "*"
            - Effect: Allow
              Action:
                - autoscaling:DetachInstances
                - autoscaling:AttachInstances
                - autoscaling:TerminateInstanceInAutoScalingGroup
              Resource:
                - !Sub arn:aws:autoscaling:${AWS::Region}:${AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/*
              Condition:
                StringLike:
                  autoscaling:ResourceTag/RotateWithElasticsearchNodeRotation: "true"
      # SSM: may send AWS-RunShellScript, and only to instances tagged
      # RotateWithElasticsearchNodeRotation=true.
      - PolicyName: ElasticsearchAdminSsmPolicy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action: ssm:GetCommandInvocation
              # This cannot be restricted further but that's OK, it's just reading the output
              Resource: "*"
            - Effect: Allow
              Action: ssm:SendCommand
              Resource: !Sub arn:aws:ssm:${AWS::Region}::document/AWS-RunShellScript
            - Effect: Allow
              Action: ssm:SendCommand
              Resource: !Sub arn:aws:ec2:${AWS::Region}:${AWS::AccountId}:instance/*
              Condition:
                StringLike:
                  ssm:resourceTag/RotateWithElasticsearchNodeRotation: "true"
      # Read access to the SSM output bucket whose name is held in
      # NodeRotationBucketSsmParameter.
      - PolicyName: SsmS3Policy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action: s3:ListBucket
              Resource:
                - !Sub arn:aws:s3:::${NodeRotationBucketSsmParameter.Value}
            - Effect: Allow
              Action: s3:GetObject
              Resource:
                - !Sub arn:aws:s3:::${NodeRotationBucketSsmParameter.Value}/*
      # Lets the functions list Step Functions executions.
      - PolicyName: QueryStepFunctionHistory
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action: states:ListExecutions
              Resource: "*"
# Role assumed by Step Functions when running NodeRotationStepFunction.
# lambda:InvokeFunction was previously granted on "*"; it is now scoped to the
# nine task functions the state machine definition actually references, so
# the state machine role cannot invoke unrelated Lambdas in the account.
StatesExecutionRole:
  Type: "AWS::IAM::Role"
  Properties:
    # Trust policy: the regional Step Functions service principal.
    AssumeRolePolicyDocument:
      Version: "2012-10-17"
      Statement:
        Effect: "Allow"
        Principal:
          Service: !Sub states.${AWS::Region}.amazonaws.com
        Action: "sts:AssumeRole"
    Path: "/"
    Policies:
      - PolicyName: StatesExecutionPolicy
        PolicyDocument:
          Version: "2012-10-17"
          Statement:
            - Effect: Allow
              Action:
                - lambda:InvokeFunction
              # One entry per Task state in the state machine definition.
              Resource:
                - !GetAtt GetTargetNodeLambda.Arn
                - !GetAtt AutoScalingGroupCheckLambda.Arn
                - !GetAtt ClusterStatusCheckLambda.Arn
                - !GetAtt AddNodeLambda.Arn
                - !GetAtt ClusterSizeCheckLambda.Arn
                - !GetAtt ReattachTargetInstanceLambda.Arn
                - !GetAtt MigrateShardsLambda.Arn
                - !GetAtt ShardMigrationCheckLambda.Arn
                - !GetAtt RemoveNodeLambda.Arn
# Role handed to the schedule rule's target (NodeRotationSchedule) so that the
# Events service can start executions of NodeRotationStepFunction.
TriggerExecutionRole:
  Type: AWS::IAM::Role
  Properties:
    Path: /
    # Trust policy: assumable by the Events service only.
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        Effect: Allow
        Principal:
          Service: [events.amazonaws.com]
        Action: sts:AssumeRole
    Policies:
      - PolicyName: StatesExecutionPolicy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            # !Ref on a state machine yields its ARN, so this grants
            # StartExecution on this one state machine only.
            - Effect: Allow
              Action: [states:StartExecution]
              Resource: !Ref NodeRotationStepFunction
# Lambda behind the "GetTargetNode" task of NodeRotationStepFunction.
GetTargetNodeLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-get-target-node-${Stage}'
    Description: 'Looks up by tag of the autoscaling group the most suitable instance/node to rotate'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: getTargetNode.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# Lambda behind the "CheckClusterStatus" task of NodeRotationStepFunction.
ClusterStatusCheckLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-cluster-status-check-${Stage}'
    Description: 'Checks the status of an Elasticsearch cluster'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: clusterStatusCheck.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# Lambda behind the "AutoScalingGroupCheck" task of NodeRotationStepFunction.
AutoScalingGroupCheckLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-auto-scaling-group-check-${Stage}'
    Description: 'Checks that a single Auto Scaling Group is returned with a maximum limit greater than the desired capacity'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: autoScalingGroupCheck.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# Lambda behind the "AddNode" task of NodeRotationStepFunction.
AddNodeLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-add-node-${Stage}'
    Description: 'Disables re-balancing before adding a new node into the Elasticsearch cluster'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: addNode.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# Lambda behind the "ClusterSizeCheck" task of NodeRotationStepFunction
# (the state machine retries this task on any error).
ClusterSizeCheckLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-cluster-size-check-${Stage}'
    Description: 'Confirms that the Elasticsearch cluster is the expected size'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: clusterSizeCheck.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# Lambda behind the "ReattachTargetInstance" task of NodeRotationStepFunction.
ReattachTargetInstanceLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-reattach-target-instance-${Stage}'
    Description: 'Reattaches target instance to the ASG'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: reattachTargetInstance.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# Lambda behind the "MigrateShards" task of NodeRotationStepFunction.
MigrateShardsLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-migrate-shards-${Stage}'
    Description: 'Migrates shards between two nodes'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: migrateShards.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# Lambda behind the "ShardMigrationCheck" task of NodeRotationStepFunction
# (the state machine retries this task on any error).
ShardMigrationCheckLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-shard-migration-check-${Stage}'
    Description: 'Confirms that all shards have been migrated (and that cluster is green) or exits with an error'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: shardMigrationCheck.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# Lambda behind the final "RemoveNode" task of NodeRotationStepFunction.
RemoveNodeLambda:
  Type: AWS::Lambda::Function
  DependsOn: NodeRotationLambdaRole
  Properties:
    FunctionName: !Sub '${Stack}-enr-remove-node-${Stage}'
    Description: 'Removes the target node (typically oldest) from the cluster (and terminates the instance) before re-enabling re-balancing'
    Runtime: nodejs20.x
    Architectures:
      - arm64
    Handler: removeNode.handler
    MemorySize: 512
    Timeout: 300
    Role: !GetAtt NodeRotationLambdaRole.Arn
    # Deployment artifact location comes straight from the stack parameters.
    Code:
      S3Bucket: !Ref DeployS3Bucket
      S3Key: !Ref DeployS3Key
    Environment:
      Variables:
        # Name of the bucket holding SSM command output.
        SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value
# State machine that performs one node rotation. Flow defined in DefinitionString:
#   GetTargetNode -> CheckSkipRotation
#     (continues only when $.skipRotation is false; otherwise the Default branch
#      ends the execution in the StopAsSkippingRotation Succeed state)
#   -> AutoScalingGroupCheck -> CheckClusterStatus -> StatusIsGreen
#     (goes to FailState, cause "Unhealthy Cluster!", unless $.clusterStatus is "green")
#   -> AddNode -> ClusterSizeCheck -> ReattachTargetInstance -> MigrateShards
#   -> ShardMigrationCheck -> RemoveNode (End).
# Retries: ClusterSizeCheck is retried on any error every 30 seconds, up to
# ClusterSizeCheckAttempts times; ShardMigrationCheck is retried on any error
# every 120 seconds, up to 195 times (about 6.5 hours). Neither uses backoff.
NodeRotationStepFunction:
Type: "AWS::StepFunctions::StateMachine"
# Explicit ordering on every task Lambda; the !GetAtt references in the
# substitution map below already imply the same dependencies.
DependsOn:
- GetTargetNodeLambda
- AutoScalingGroupCheckLambda
- ClusterStatusCheckLambda
- AddNodeLambda
- ClusterSizeCheckLambda
- ReattachTargetInstanceLambda
- MigrateShardsLambda
- ShardMigrationCheckLambda
- RemoveNodeLambda
Properties:
StateMachineName:
!Sub ${Stack}-Elasticsearch-Node-Rotation-${Stage}
# Amazon States Language JSON. The ${...Arn} placeholders are filled from the
# map that follows the literal block; ${ClusterSizeCheckAttempts} is not in
# that map, so !Sub resolves it from the template parameter of that name.
DefinitionString:
!Sub
- |
{
"Comment": "Elasticsearch Node Rotation",
"StartAt": "GetTargetNode",
"States": {
"GetTargetNode": {
"Type": "Task",
"Resource": "${GetTargetNodeArn}",
"Next": "CheckSkipRotation"
},
"CheckSkipRotation": {
"Type": "Choice",
"Choices": [
{
"Variable": "$.skipRotation",
"BooleanEquals": false,
"Next": "AutoScalingGroupCheck"
}
],
"Default": "StopAsSkippingRotation"
},
"StopAsSkippingRotation": {
"Type": "Succeed"
},
"AutoScalingGroupCheck": {
"Type": "Task",
"Resource": "${AutoScalingGroupCheckArn}",
"Next": "CheckClusterStatus"
},
"CheckClusterStatus": {
"Type": "Task",
"Resource": "${ClusterStatusCheckArn}",
"Next": "StatusIsGreen"
},
"StatusIsGreen": {
"Type": "Choice",
"Choices": [
{
"Not": {
"Variable": "$.clusterStatus",
"StringEquals": "green"
},
"Next": "FailState"
}
],
"Default": "AddNode"
},
"FailState": {
"Type": "Fail",
"Cause": "Unhealthy Cluster!"
},
"AddNode": {
"Type": "Task",
"Resource": "${AddNodeArn}",
"Next": "ClusterSizeCheck"
},
"ClusterSizeCheck": {
"Type": "Task",
"Resource": "${ClusterSizeCheckArn}",
"Next": "ReattachTargetInstance",
"Retry": [ {
"ErrorEquals": [ "States.ALL" ],
"IntervalSeconds": 30,
"MaxAttempts": ${ClusterSizeCheckAttempts},
"BackoffRate": 1.0
} ]
},
"ReattachTargetInstance": {
"Type": "Task",
"Resource": "${ReattachTargetInstanceArn}",
"Next": "MigrateShards"
},
"MigrateShards": {
"Type": "Task",
"Resource": "${MigrateShardsArn}",
"Next": "ShardMigrationCheck"
},
"ShardMigrationCheck": {
"Type": "Task",
"Resource": "${ShardMigrationCheckArn}",
"Next": "RemoveNode",
"Retry": [ {
"ErrorEquals": [ "States.ALL" ],
"IntervalSeconds": 120,
"MaxAttempts": 195,
"BackoffRate": 1.0
} ]
},
"RemoveNode": {
"Type": "Task",
"Resource": "${RemoveNodeArn}",
"End": true
}
}
}
-
AutoScalingGroupCheckArn: !GetAtt AutoScalingGroupCheckLambda.Arn
ClusterStatusCheckArn: !GetAtt ClusterStatusCheckLambda.Arn
AddNodeArn: !GetAtt AddNodeLambda.Arn
ClusterSizeCheckArn: !GetAtt ClusterSizeCheckLambda.Arn
ReattachTargetInstanceArn: !GetAtt ReattachTargetInstanceLambda.Arn
MigrateShardsArn: !GetAtt MigrateShardsLambda.Arn
ShardMigrationCheckArn: !GetAtt ShardMigrationCheckLambda.Arn
RemoveNodeArn: !GetAtt RemoveNodeLambda.Arn
GetTargetNodeArn: !GetAtt GetTargetNodeLambda.Arn
# Role Step Functions assumes to run the state machine (StatesExecutionRole
# grants lambda:InvokeFunction).
RoleArn: !GetAtt StatesExecutionRole.Arn
# Scheduled Events rule that starts NodeRotationStepFunction according to the
# RotationCronExpression parameter, using TriggerExecutionRole to do so.
NodeRotationSchedule:
Type: AWS::Events::Rule
Properties:
Name: !Sub ${Stack}-node-rotation-schedule-${Stage}
ScheduleExpression: !Ref RotationCronExpression
State: ENABLED
Targets:
- Arn: !Ref NodeRotationStepFunction
RoleArn: !GetAtt TriggerExecutionRole.Arn
Id: !GetAtt NodeRotationStepFunction.Name
# JSON passed as the execution input: the tag key used for Auto Scaling Group
# discovery, the AgeThresholdInDays parameter, the state machine's own ARN,
# and a null targetInstanceId.
Input: !Sub
- |
{
"autoScalingGroupDiscoveryTagKey": "RotateWithElasticsearchNodeRotation",
"ageThresholdInDays": ${AgeThresholdInDays},
"stepFunctionArn": "${StepFunctionArn}",
"targetInstanceId": null
}
-
StepFunctionArn: !Ref NodeRotationStepFunction
# Explicit ordering; the !Ref / !GetAtt references above already imply it.
DependsOn:
- NodeRotationStepFunction
- TriggerExecutionRole
# Alarm that notifies the SNSTopicForAlerts topic whenever an execution of
# NodeRotationStepFunction fails, and again when the alarm returns to OK.
ExecutionFailureAlarm:
  Type: AWS::CloudWatch::Alarm
  DependsOn: NodeRotationStepFunction
  Properties:
    # Plain string: the name has no substitutions, so !Sub was a no-op.
    AlarmName: Failed to complete node rotation for an elasticsearch cluster
    AlarmDescription: !Sub
      - >
        Elasticsearch Node Rotation failed - please see Step Function execution history for ${StepFunctionName}.
        You can find the affected auto scaling group by clicking on the AutoScalingGroupCheck node in the execution history and clicking "Step Input"
      - { StepFunctionName: !GetAtt NodeRotationStepFunction.Name }
    # The same topic receives both the ALARM and the OK notification.
    AlarmActions:
      - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:${SNSTopicForAlerts}
    OKActions:
      - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:${SNSTopicForAlerts}
    # Fires when at least one execution failed within a 60-second period.
    Namespace: AWS/States
    MetricName: ExecutionsFailed
    Dimensions:
      - Name: StateMachineArn
        Value: !Ref NodeRotationStepFunction
    Statistic: Sum
    Period: 60
    EvaluationPeriods: 1
    ComparisonOperator: GreaterThanOrEqualToThreshold
    Threshold: 1
    # Was "ignore", which keeps the alarm in its current state while no
    # datapoints arrive. Assuming ExecutionsFailed is only published when
    # executions fail (TODO confirm), the alarm would stay in ALARM after the
    # first failure, so OKActions would not fire and later failures would not
    # re-notify. "notBreaching" treats quiet periods as healthy, so the alarm
    # recovers and can trigger again.
    TreatMissingData: notBreaching