cloudformation.yaml (521 lines of code) (raw):

AWSTemplateFormatVersion: "2010-09-09"
Description: "Step Function for rotating ElasticSearch nodes (regardless of their Stage, i.e. the INFRA/PROD instance of this stack will rotate oldest ES instances belonging to any Stage, assuming the instances have the `RotateWithElasticsearchNodeRotation` tag set to true)"

# -----------------------------------------------------------------------------
# Template overview
#
#   * Nine Lambda functions (all built from the same deployment .zip, each with
#     its own handler) implement the individual rotation steps.
#   * A Step Functions state machine chains those Lambdas together.
#   * An EventBridge (CloudWatch Events) rule starts the state machine on the
#     schedule given by RotationCronExpression.
#   * A CloudWatch alarm notifies an SNS topic when an execution fails.
#   * An S3 bucket for SSM command output is created only when the caller does
#     not supply one; its name is published through an SSM parameter.
# -----------------------------------------------------------------------------

Parameters:
  Stack:
    Type: String
    Description: Stack name defined in riff-raff.yaml to support per-account continuous deployment of this project. Not required if the template will be updated manually.
  App:
    Type: String
    Description: App name defined in riff-raff.yaml to support per-account continuous deployment of this project
    Default: elasticsearch-node-rotation
  Stage:
    Type: String
    Description: Stage name for Riff-Raff deploys to support per-account continuous deployment of this project
    Default: INFRA
  DeployS3Bucket:
    Type: String
    Description: Bucket which contains .zip file used by lambda functions e.g. deploy-tools-dist.
  DeployS3Key:
    Type: String
    Description: Key for .zip file used by lambda functions e.g. <stack>/<stage>/<app>/<app>.zip.
  RotationCronExpression:
    Type: String
    Description: Cron expression which determines how often node rotation occurs.
  AgeThresholdInDays:
    Type: Number
    Description: The number of days old an instance must be before it will be rotated. This avoids excessively rotating instances.
    Default: 7
  SsmOutputBucketName:
    Type: String
    Description: OPTIONAL! Bucket used to store SSM command output. The instances which receive SSM commands must have PutObject permissions for this bucket. If this is not provided a bucket will be created as part of this stack.
    Default: ""
  SNSTopicForAlerts:
    Type: String
    Description: The name of the SNS topic used for alerting in the case of a failed rotation attempt.
  ClusterSizeCheckAttempts:
    Type: Number
    Description: The number of attempts to make when running the cluster size check before failing. See ClusterSizeCheck.IntervalSeconds to calculate the duration this will represent.
    Default: 20

Conditions:
  # True when no bucket name was supplied, i.e. this stack must create the bucket itself.
  ShouldCreateSsmOutputBucket: !Equals
    - !Ref SsmOutputBucketName
    - ""

Resources:

  # ---------------------------------------------------------------------------
  # SSM command output storage
  # ---------------------------------------------------------------------------

  # Only created when SsmOutputBucketName is left empty.
  NodeRotationSsmOutputBucket:
    Type: AWS::S3::Bucket
    Condition: ShouldCreateSsmOutputBucket
    Properties:
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: aws:kms
      PublicAccessBlockConfiguration:
        BlockPublicAcls: true
        IgnorePublicAcls: true
        BlockPublicPolicy: true
        RestrictPublicBuckets: true
      LifecycleConfiguration:
        Rules:
          # NOTE(review): objects move to GLACIER after 1 day and are deleted
          # after 14. S3 Glacier storage classes carry a minimum storage
          # duration (90 days for Flexible Retrieval), so this combination
          # likely incurs early-deletion charges - confirm it is intended.
          - Id: RetentionRule
            Status: Enabled
            Transitions:
              - TransitionInDays: 1
                StorageClass: GLACIER
            ExpirationInDays: 14
      Tags:
        - { Key: Stack, Value: !Ref Stack }
        - { Key: Stage, Value: !Ref Stage }

  # Publishes the name of whichever bucket is in use (created here or supplied).
  # Other resources in this template read the bucket name from this parameter's
  # Value attribute rather than repeating the !If.
  NodeRotationBucketSsmParameter:
    Type: AWS::SSM::Parameter
    Properties:
      Name: "/account/services/node-rotation-ssm-output-bucket-name"
      Type: String
      Value: !If
        - ShouldCreateSsmOutputBucket
        - !Ref NodeRotationSsmOutputBucket
        - !Ref SsmOutputBucketName

  # ---------------------------------------------------------------------------
  # IAM roles
  # ---------------------------------------------------------------------------

  # Execution role shared by all nine Lambda functions below.
  NodeRotationLambdaRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: !Sub ${Stack}-NodeRotation-${Stage}
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - lambda.amazonaws.com
            Action:
              - sts:AssumeRole
      Path: /
      Policies:
        - PolicyName: LambdaPolicy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # NOTE(review): lambda:InvokeFunction on "*" is broad; whether the
              # handlers invoke other functions is not visible from this
              # template - narrow it if they do not.
              - Effect: Allow
                Action:
                  - logs:CreateLogGroup
                  - logs:CreateLogStream
                  - logs:PutLogEvents
                  - lambda:InvokeFunction
                Resource: "*"
        - PolicyName: ElasticsearchAdminAsgPolicy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - autoscaling:DescribeAutoScalingGroups
                  - ec2:DescribeInstances
                Resource: "*"
              # Mutating ASG actions are limited to groups tagged
              # RotateWithElasticsearchNodeRotation=true.
              - Effect: Allow
                Action:
                  - autoscaling:DetachInstances
                  - autoscaling:AttachInstances
                  - autoscaling:TerminateInstanceInAutoScalingGroup
                Resource:
                  - !Sub arn:aws:autoscaling:${AWS::Region}:${AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/*
                Condition:
                  StringLike:
                    autoscaling:ResourceTag/RotateWithElasticsearchNodeRotation: "true"
        - PolicyName: ElasticsearchAdminSsmPolicy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action: ssm:GetCommandInvocation
                # This cannot be restricted further but that's OK, it's just reading the output
                Resource: "*"
              # SendCommand needs permission on both the document and the target
              # instances; the two statements below cover one each.
              - Effect: Allow
                Action: ssm:SendCommand
                Resource: !Sub arn:aws:ssm:${AWS::Region}::document/AWS-RunShellScript
              # Target instances are limited to those tagged
              # RotateWithElasticsearchNodeRotation=true.
              - Effect: Allow
                Action: ssm:SendCommand
                Resource: !Sub arn:aws:ec2:${AWS::Region}:${AWS::AccountId}:instance/*
                Condition:
                  StringLike:
                    ssm:resourceTag/RotateWithElasticsearchNodeRotation: "true"
        - PolicyName: SsmS3Policy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # Read-only access to the SSM output bucket.
              - Effect: Allow
                Action: s3:ListBucket
                Resource:
                  - !Sub arn:aws:s3:::${NodeRotationBucketSsmParameter.Value}
              - Effect: Allow
                Action: s3:GetObject
                Resource:
                  - !Sub arn:aws:s3:::${NodeRotationBucketSsmParameter.Value}/*
        - PolicyName: QueryStepFunctionHistory
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action: states:ListExecutions
                Resource: "*"

  # Role assumed by the state machine. It may invoke only the Lambda functions
  # that the state machine definition references.
  StatesExecutionRole:
    Type: "AWS::IAM::Role"
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: "Allow"
            Principal:
              Service: !Sub states.${AWS::Region}.amazonaws.com
            Action: "sts:AssumeRole"
      Path: "/"
      Policies:
        - PolicyName: StatesExecutionPolicy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - lambda:InvokeFunction
                Resource:
                  - !GetAtt GetTargetNodeLambda.Arn
                  - !GetAtt AutoScalingGroupCheckLambda.Arn
                  - !GetAtt ClusterStatusCheckLambda.Arn
                  - !GetAtt AddNodeLambda.Arn
                  - !GetAtt ClusterSizeCheckLambda.Arn
                  - !GetAtt ReattachTargetInstanceLambda.Arn
                  - !GetAtt MigrateShardsLambda.Arn
                  - !GetAtt ShardMigrationCheckLambda.Arn
                  - !GetAtt RemoveNodeLambda.Arn

  # Role assumed by the schedule rule so that it can start the state machine.
  TriggerExecutionRole:
    Type: "AWS::IAM::Role"
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - events.amazonaws.com
            Action: sts:AssumeRole
      Path: "/"
      Policies:
        - PolicyName: StatesExecutionPolicy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - states:StartExecution
                Resource: !Ref NodeRotationStepFunction

  # ---------------------------------------------------------------------------
  # Lambda functions
  #
  # Every function uses the same code artifact, role, runtime, memory, timeout
  # and SSM_BUCKET_NAME environment variable; only FunctionName, Description
  # and Handler differ. The dependency on NodeRotationLambdaRole is implied by
  # the !GetAtt on its Arn.
  # ---------------------------------------------------------------------------

  GetTargetNodeLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-get-target-node-${Stage}
      Description: "Looks up by tag of the autoscaling group the most suitable instance/node to rotate"
      Handler: "getTargetNode.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  ClusterStatusCheckLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-cluster-status-check-${Stage}
      Description: "Checks the status of an Elasticsearch cluster"
      Handler: "clusterStatusCheck.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  AutoScalingGroupCheckLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-auto-scaling-group-check-${Stage}
      Description: "Checks that a single Auto Scaling Group is returned with a maximum limit greater than the desired capacity"
      Handler: "autoScalingGroupCheck.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  AddNodeLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-add-node-${Stage}
      Description: "Disables re-balancing before adding a new node into the Elasticsearch cluster"
      Handler: "addNode.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  ClusterSizeCheckLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-cluster-size-check-${Stage}
      Description: "Confirms that the Elasticsearch cluster is the expected size"
      Handler: "clusterSizeCheck.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  ReattachTargetInstanceLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-reattach-target-instance-${Stage}
      Description: "Reattaches target instance to the ASG"
      Handler: "reattachTargetInstance.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  MigrateShardsLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-migrate-shards-${Stage}
      Description: "Migrates shards between two nodes"
      Handler: "migrateShards.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  ShardMigrationCheckLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-shard-migration-check-${Stage}
      Description: "Confirms that all shards have been migrated (and that cluster is green) or exits with an error"
      Handler: "shardMigrationCheck.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  RemoveNodeLambda:
    Type: "AWS::Lambda::Function"
    Properties:
      Architectures: [ "arm64" ]
      FunctionName: !Sub ${Stack}-enr-remove-node-${Stage}
      Description: "Removes the target node (typically oldest) from the cluster (and terminates the instance) before re-enabling re-balancing"
      Handler: "removeNode.handler"
      Role: !GetAtt NodeRotationLambdaRole.Arn
      Code:
        S3Bucket: !Ref DeployS3Bucket
        S3Key: !Ref DeployS3Key
      MemorySize: 512
      Runtime: nodejs20.x
      Timeout: 300
      Environment:
        Variables:
          SSM_BUCKET_NAME: !GetAtt NodeRotationBucketSsmParameter.Value

  # ---------------------------------------------------------------------------
  # State machine
  #
  # Flow:
  #   GetTargetNode
  #     -> CheckSkipRotation   ($.skipRotation == false continues, anything
  #                             else ends the execution successfully)
  #     -> AutoScalingGroupCheck
  #     -> CheckClusterStatus
  #     -> StatusIsGreen       ($.clusterStatus != "green" fails the execution)
  #     -> AddNode
  #     -> ClusterSizeCheck    (retried every 30s, up to ClusterSizeCheckAttempts)
  #     -> ReattachTargetInstance
  #     -> MigrateShards
  #     -> ShardMigrationCheck (retried every 120s, up to 195 times = 6.5 hours)
  #     -> RemoveNode
  #
  # The Lambda ARNs and ClusterSizeCheckAttempts are substituted into the JSON
  # by !Sub; "$.xxx" JSONPath expressions are not touched by !Sub because they
  # do not use the "${...}" form.
  # ---------------------------------------------------------------------------
  NodeRotationStepFunction:
    Type: "AWS::StepFunctions::StateMachine"
    Properties:
      StateMachineName: !Sub ${Stack}-Elasticsearch-Node-Rotation-${Stage}
      DefinitionString: !Sub
        - |
          {
            "Comment": "Elasticsearch Node Rotation",
            "StartAt": "GetTargetNode",
            "States": {
              "GetTargetNode": {
                "Type": "Task",
                "Resource": "${GetTargetNodeArn}",
                "Next": "CheckSkipRotation"
              },
              "CheckSkipRotation": {
                "Type": "Choice",
                "Choices": [
                  {
                    "Variable": "$.skipRotation",
                    "BooleanEquals": false,
                    "Next": "AutoScalingGroupCheck"
                  }
                ],
                "Default": "StopAsSkippingRotation"
              },
              "StopAsSkippingRotation": {
                "Type": "Succeed"
              },
              "AutoScalingGroupCheck": {
                "Type": "Task",
                "Resource": "${AutoScalingGroupCheckArn}",
                "Next": "CheckClusterStatus"
              },
              "CheckClusterStatus": {
                "Type": "Task",
                "Resource": "${ClusterStatusCheckArn}",
                "Next": "StatusIsGreen"
              },
              "StatusIsGreen": {
                "Type": "Choice",
                "Choices": [
                  {
                    "Not": {
                      "Variable": "$.clusterStatus",
                      "StringEquals": "green"
                    },
                    "Next": "FailState"
                  }
                ],
                "Default": "AddNode"
              },
              "FailState": {
                "Type": "Fail",
                "Cause": "Unhealthy Cluster!"
              },
              "AddNode": {
                "Type": "Task",
                "Resource": "${AddNodeArn}",
                "Next": "ClusterSizeCheck"
              },
              "ClusterSizeCheck": {
                "Type": "Task",
                "Resource": "${ClusterSizeCheckArn}",
                "Next": "ReattachTargetInstance",
                "Retry": [
                  {
                    "ErrorEquals": [ "States.ALL" ],
                    "IntervalSeconds": 30,
                    "MaxAttempts": ${ClusterSizeCheckAttempts},
                    "BackoffRate": 1.0
                  }
                ]
              },
              "ReattachTargetInstance": {
                "Type": "Task",
                "Resource": "${ReattachTargetInstanceArn}",
                "Next": "MigrateShards"
              },
              "MigrateShards": {
                "Type": "Task",
                "Resource": "${MigrateShardsArn}",
                "Next": "ShardMigrationCheck"
              },
              "ShardMigrationCheck": {
                "Type": "Task",
                "Resource": "${ShardMigrationCheckArn}",
                "Next": "RemoveNode",
                "Retry": [
                  {
                    "ErrorEquals": [ "States.ALL" ],
                    "IntervalSeconds": 120,
                    "MaxAttempts": 195,
                    "BackoffRate": 1.0
                  }
                ]
              },
              "RemoveNode": {
                "Type": "Task",
                "Resource": "${RemoveNodeArn}",
                "End": true
              }
            }
          }
        - AutoScalingGroupCheckArn: !GetAtt AutoScalingGroupCheckLambda.Arn
          ClusterStatusCheckArn: !GetAtt ClusterStatusCheckLambda.Arn
          AddNodeArn: !GetAtt AddNodeLambda.Arn
          ClusterSizeCheckArn: !GetAtt ClusterSizeCheckLambda.Arn
          ReattachTargetInstanceArn: !GetAtt ReattachTargetInstanceLambda.Arn
          MigrateShardsArn: !GetAtt MigrateShardsLambda.Arn
          ShardMigrationCheckArn: !GetAtt ShardMigrationCheckLambda.Arn
          RemoveNodeArn: !GetAtt RemoveNodeLambda.Arn
          GetTargetNodeArn: !GetAtt GetTargetNodeLambda.Arn
      RoleArn: !GetAtt StatesExecutionRole.Arn

  # ---------------------------------------------------------------------------
  # Schedule
  # ---------------------------------------------------------------------------

  # Starts the state machine on the configured schedule, passing the discovery
  # tag key, the age threshold and the state machine's own ARN as input.
  NodeRotationSchedule:
    Type: AWS::Events::Rule
    Properties:
      Name: !Sub ${Stack}-node-rotation-schedule-${Stage}
      ScheduleExpression: !Ref RotationCronExpression
      State: ENABLED
      Targets:
        - Arn: !Ref NodeRotationStepFunction
          RoleArn: !GetAtt TriggerExecutionRole.Arn
          Id: !GetAtt NodeRotationStepFunction.Name
          Input: !Sub
            - |
              {
                "autoScalingGroupDiscoveryTagKey": "RotateWithElasticsearchNodeRotation",
                "ageThresholdInDays": ${AgeThresholdInDays},
                "stepFunctionArn": "${StepFunctionArn}",
                "targetInstanceId": null
              }
            - StepFunctionArn: !Ref NodeRotationStepFunction

  # ---------------------------------------------------------------------------
  # Alerting
  # ---------------------------------------------------------------------------

  # Fires when at least one execution of the state machine fails within a
  # 60-second period; both ALARM and OK transitions notify the same SNS topic.
  ExecutionFailureAlarm:
    Type: AWS::CloudWatch::Alarm
    Properties:
      AlarmActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:${SNSTopicForAlerts}
      OKActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:${SNSTopicForAlerts}
      # NOTE(review): the alarm name is fixed (no Stack/Stage component), so a
      # second copy of this stack in the same account and region would collide
      # on it - confirm only one copy is deployed per account/region.
      AlarmName: Failed to complete node rotation for an elasticsearch cluster
      AlarmDescription: !Sub
        - >
          Elasticsearch Node Rotation failed - please see Step Function execution history for ${StepFunctionName}.
          You can find the affected auto scaling group by clicking on the AutoScalingGroupCheck node
          in the execution history and clicking "Step Input"
        - { StepFunctionName: !GetAtt NodeRotationStepFunction.Name }
      MetricName: ExecutionsFailed
      Namespace: AWS/States
      Dimensions:
        - Name: StateMachineArn
          Value: !Ref NodeRotationStepFunction
      ComparisonOperator: GreaterThanOrEqualToThreshold
      Threshold: 1
      Period: 60
      EvaluationPeriods: 1
      Statistic: Sum
      TreatMissingData: ignore