# cloudformation/membership-attribute-service.yaml (552 lines of code) (raw)
# CloudFormation template header: format version and human-readable summary.
AWSTemplateFormatVersion: "2010-09-09"
Description: "Membership Attributes service"
# Stack inputs supplied at deploy time.
Parameters:
  # Selects the per-stage entry in the StageVariables mapping. Only stages
  # that have an entry there are accepted, so an unsupported value is rejected
  # at parameter validation instead of failing later inside Fn::FindInMap.
  Stage:
    Description: Environment name
    Type: String
    Default: PROD
    AllowedValues:
      - CODE
      - PROD
    ConstraintDescription: must be CODE or PROD (the stages defined in the StageVariables mapping).
  # EC2 instance type for the launch configuration; a single type is allowed.
  InstanceType:
    Description: EC2 instance type
    Type: String
    Default: t4g.small
    AllowedValues:
      - t4g.small
    ConstraintDescription: must be a valid EC2 instance type.
  # VPC that both security groups are created in.
  VpcId:
    Description: ID of the VPC onto which to launch the application
    Type: AWS::EC2::VPC::Id
    Default: vpc-e6e00183
  # Subnets handed to the Auto Scaling group.
  PrivateVpcSubnets:
    Description: Private subnets to use for EC2 instances
    Type: List<AWS::EC2::Subnet::Id>
  # Subnets handed to the load balancer.
  PublicVpcSubnets:
    Description: Public subnets to use for the ELB
    Type: List<AWS::EC2::Subnet::Id>
  # Image used by the launch configuration.
  AmiId:
    Description: Custom AMI to use for instances
    Type: String
  # Certificate attached to the HTTPS listener of the load balancer.
  ELBSSLCertificate:
    Description: ELB SSL Certificate ARN
    Type: String
  # Existing security group attached to the instances in addition to
  # InstanceSecurityGroup.
  SecurityGroupForPostgres:
    Description: Security group for querying the postgres database
    Type: String
# Static lookup tables read with Fn::FindInMap by the resources in this template.
Mappings:
  # Values shared by every stage.
  Constants:
    Alarm:
      # Appended to alarm descriptions so the responder knows what to do next.
      Process: 'Follow the process in https://docs.google.com/document/d/1_3El3cly9d7u_jPgTcRjLxmdG2e919zCLvmcFCLOYAk/edit'
    MetricFilters:
      # CloudWatch namespace used by the log metric filters and their alarms.
      MetricNamespace: 'members-data-api'

  # Values that differ per stage; the second-level key is the Stage parameter.
  StageVariables:
    PROD:
      # This should be (at least) double the desired capacity.
      MaxInstances: 12
      MinInstances: 3
      # NOTE(review): no FindInMap in the visible template reads this key —
      # confirm whether it is still needed.
      NotificationAlarmPeriod: 1200
      InstanceName: 'PROD:membership-attribute-service'
      # NOTE(review): PROD is also granted the CODE table — confirm this is intentional.
      DynamoDBSupporterProductDataTables:
        - 'arn:aws:dynamodb:*:*:table/SupporterProductData-PROD'
        - 'arn:aws:dynamodb:*:*:table/SupporterProductData-CODE'
      ReadableS3Resources:
        - 'arn:aws:s3:::gu-membership-attribute-service-dist/membership/PROD/*'
        - 'arn:aws:s3:::gu-reader-revenue-private/membership/members-data-api/PROD/*'
      # Mapping values must be literals, so the stage prefix is spelled out
      # here rather than built with !Sub.
      MembersDataDefaultPaymentMethodLeftEmptyMetricFilterName: 'PROD-default-payment-method-set-to-nothing'
      MembersDataUnsupportedProductRatePlanIdMetricFilterName: 'PROD-unsupported-product-rate-plan-id-encountered'
      MembersDataHttpQueuesFullMetricFilterName: 'PROD-http-client-queue-full'
      # Prefix placed at the start of alarm names.
      Urgent: 'URGENT 9-5 -'
    CODE:
      MaxInstances: 2
      MinInstances: 1
      # NOTE(review): no FindInMap in the visible template reads this key —
      # confirm whether it is still needed.
      NotificationAlarmPeriod: 1200
      InstanceName: 'CODE:membership-attribute-service'
      DynamoDBSupporterProductDataTables:
        - 'arn:aws:dynamodb:*:*:table/SupporterProductData-CODE'
      ReadableS3Resources:
        - 'arn:aws:s3:::gu-membership-attribute-service-dist/membership/CODE/*'
        - 'arn:aws:s3:::gu-reader-revenue-private/membership/members-data-api/CODE/*'
      MembersDataDefaultPaymentMethodLeftEmptyMetricFilterName: 'CODE-default-payment-method-set-to-nothing'
      MembersDataUnsupportedProductRatePlanIdMetricFilterName: 'CODE-unsupported-product-rate-plan-id-encountered'
      MembersDataHttpQueuesFullMetricFilterName: 'CODE-http-client-queue-full'
      # Prefix placed at the start of alarm names.
      Urgent: 'Warning'
# Switches that decide which optional alarms are created.
Conditions:
  # True only when the stack is deployed with Stage=PROD.
  CreateProdMonitoring:
    Fn::Equals:
      - !Ref Stage
      - PROD
  # Always false (1 never equals 2): the alarm stays disabled until we can
  # ensure promotions are no longer put in Dynamo (which would trigger it).
  CreateUnsupportedProductRatePlanAlarm:
    Fn::Equals: [1, 2]
  # Always false: disabled while we have a high rate of 401s for Android app
  # requests.
  Create4XXRatioAlarm:
    Fn::Equals: [1, 2]
Resources:
# IAM role assumed by the service's EC2 instances (attached via InstanceProfile).
# Both policy documents pin Version 2012-10-17, the current IAM policy language,
# instead of silently falling back to the legacy default.
MembershipRole:
  Type: AWS::IAM::Role
  Properties:
    RoleName: !Sub members-data-api-${Stage}
    Path: "/"
    # Trust policy: only the EC2 service may assume this role.
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service:
              - ec2.amazonaws.com
          Action:
            - sts:AssumeRole
    ManagedPolicyArns:
      - !Sub arn:aws:iam::${AWS::AccountId}:policy/guardian-ec2-role-for-ssm
    Policies:
      - PolicyName: root
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            # Explicitly deny access to all S3 resources except for those defined in ReadableS3Resources
            # https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_evaluation-logic.html#policy-eval-denyallow
            - Effect: Deny
              Action: s3:*
              NotResource: !FindInMap [StageVariables, !Ref Stage, ReadableS3Resources]
            # Read-only access to the stage's artifact and private-config objects.
            - Effect: Allow
              Action: s3:GetObject
              Resource: !FindInMap [StageVariables, !Ref Stage, ReadableS3Resources]
            # Query access to the SupporterProductData tables listed for the stage.
            - Effect: Allow
              Action:
                - dynamodb:Query
                - dynamodb:DescribeTable
              Resource: !FindInMap [StageVariables, !Ref Stage, DynamoDBSupporterProductDataTables]
            # NOTE(review): grants every CloudWatch action on every resource —
            # consider narrowing once the actions the service needs are confirmed.
            - Effect: Allow
              Action:
                - cloudwatch:*
              Resource: "*"
            # Log writing, scoped to this stack's own log group.
            - Effect: Allow
              Action:
                - logs:CreateLogGroup
                - logs:CreateLogStream
                - logs:PutLogEvents
              Resource: !GetAtt MembersDataApiLogGroup.Arn
            # Sending to the email queue whose ARN is exported as
            # comms-<Stage>-EmailQueueArn by another stack.
            - Effect: Allow
              Action:
                - sqs:GetQueueUrl
                - sqs:SendMessage
              Resource:
                Fn::ImportValue:
                  !Sub "comms-${Stage}-EmailQueueArn"
# Inline policy attached to MembershipRole so commands can be run on the
# instances through SSM. Version is pinned to 2012-10-17, the current IAM
# policy language, instead of relying on the legacy default.
SSMRunCommandPolicy:
  Type: AWS::IAM::Policy
  Properties:
    PolicyName: ssm-run-command-policy
    Roles:
      - !Ref MembershipRole
    PolicyDocument:
      Version: '2012-10-17'
      Statement:
        # minimal policy to allow running commands via ssm
        - Effect: Allow
          Resource: "*"
          Action:
            - ec2messages:AcknowledgeMessage
            - ec2messages:DeleteMessage
            - ec2messages:FailMessage
            - ec2messages:GetEndpoint
            - ec2messages:GetMessages
            - ec2messages:SendReply
            - ssm:UpdateInstanceInformation
            - ssm:ListInstanceAssociations
            - ssm:DescribeInstanceProperties
            - ssm:DescribeDocumentParameters
            - ssmmessages:CreateControlChannel
            - ssmmessages:CreateDataChannel
            - ssmmessages:OpenControlChannel
            - ssmmessages:OpenDataChannel
# Instance profile wrapping MembershipRole; referenced by LaunchConfig.
InstanceProfile:
  Type: AWS::IAM::InstanceProfile
  Properties:
    Roles:
      - !Ref MembershipRole
    Path: '/'
# Classic load balancer: HTTPS listener on 443 forwarding to instance port 9000.
LoadBalancer:
  Type: AWS::ElasticLoadBalancing::LoadBalancer
  Properties:
    Subnets: !Ref PublicVpcSubnets
    SecurityGroups:
      - !Ref LoadBalancerSecurityGroup
    CrossZone: "true"
    # NOTE(review): InstanceProtocol is not set, so the back-end protocol is
    # whatever the service defaults to — confirm it matches the HTTP health check.
    Listeners:
      - Protocol: HTTPS
        LoadBalancerPort: "443"
        InstancePort: "9000"
        SSLCertificateId: !Ref ELBSSLCertificate
    # Instances are probed over plain HTTP on the application port.
    HealthCheck:
      Target: HTTP:9000/healthcheck
      Interval: "10"
      Timeout: "5"
      HealthyThreshold: "2"
      UnhealthyThreshold: "3"
    ConnectionDrainingPolicy:
      Enabled: "true"
      Timeout: "60"
# Auto Scaling group that runs the service instances in the private subnets
# and registers them with the load balancer.
AutoscalingGroup:
  Type: AWS::AutoScaling::AutoScalingGroup
  Properties:
    AutoScalingGroupName: !Sub members-data-api-${Stage}
    LaunchConfigurationName: !Ref LaunchConfig
    VPCZoneIdentifier: !Ref PrivateVpcSubnets
    LoadBalancerNames:
      - !Ref LoadBalancer
    # Size limits come from the per-stage mapping.
    MinSize: !FindInMap [StageVariables, !Ref Stage, MinInstances]
    MaxSize: !FindInMap [StageVariables, !Ref Stage, MaxInstances]
    # Health is judged by the ELB health check, with a 400-second grace period.
    HealthCheckType: ELB
    HealthCheckGracePeriod: 400
    # Every tag below is propagated to the launched instances.
    Tags:
      - Key: Stage
        Value: !Ref Stage
        PropagateAtLaunch: "true"
      - Key: Name
        Value: !FindInMap [StageVariables, !Ref Stage, InstanceName]
        PropagateAtLaunch: "true"
      - Key: Stack
        Value: membership
        PropagateAtLaunch: "true"
      - Key: App
        Value: membership-attribute-service
        PropagateAtLaunch: "true"
      - Key: Role
        Value: membership-attribute-service
        PropagateAtLaunch: "true"
      - Key: Mainclass
        Value: membership-attribute-service
        PropagateAtLaunch: "true"
# Launch configuration for the service instances: AMI, instance type, security
# groups, instance profile and the boot script.
LaunchConfig:
  Type: AWS::AutoScaling::LaunchConfiguration
  Properties:
    ImageId: !Ref AmiId
    InstanceType: !Ref InstanceType
    IamInstanceProfile: !Ref InstanceProfile
    AssociatePublicIpAddress: "False"
    SecurityGroups:
      - !Ref InstanceSecurityGroup
      - !Ref SecurityGroupForPostgres
    # Instance metadata requests must carry a session token.
    MetadataOptions:
      HttpTokens: required
    # Boot script (only ${Stage} is substituted):
    #   1. download the service .deb from the dist bucket and install it;
    #   2. fetch the private config into /etc/gu, owned by the service user
    #      with mode 0600;
    #   3. run the configure-logs script for the service log file.
    # CONF_DIR is assigned but not used again within this script.
    UserData:
      Fn::Base64: !Sub |
        #!/bin/bash -ev
        CONF_DIR=/etc/membership-attribute-service
        aws s3 cp s3://gu-membership-attribute-service-dist/membership/${Stage}/membership-attribute-service/membership-attribute-service_1.0-SNAPSHOT_all.deb /tmp
        dpkg -i /tmp/membership-attribute-service_1.0-SNAPSHOT_all.deb
        mkdir -p /etc/gu
        aws s3 cp s3://gu-reader-revenue-private/membership/members-data-api/${Stage}/members-data-api.private.conf /etc/gu
        chown membership-attribute-service /etc/gu/members-data-api.private.conf
        chmod 0600 /etc/gu/members-data-api.private.conf
        /opt/cloudwatch-logs/configure-logs application members ${Stage} data-api /var/log/membership-attribute-service/membership-attribute-service.log;
# Security group for the load balancer: HTTPS (443) in from any address,
# TCP 9000 out.
LoadBalancerSecurityGroup:
  Type: AWS::EC2::SecurityGroup
  Properties:
    GroupDescription: Open up HTTPS access to load balancer
    VpcId: !Ref VpcId
    SecurityGroupIngress:
      - IpProtocol: tcp
        CidrIp: 0.0.0.0/0
        FromPort: "443"
        ToPort: "443"
    SecurityGroupEgress:
      - IpProtocol: tcp
        CidrIp: 0.0.0.0/0
        FromPort: "9000"
        ToPort: "9000"
# Security group for the service instances: TCP 9000 in from the load
# balancer's security group only, HTTPS (443) out to any address.
InstanceSecurityGroup:
  Type: AWS::EC2::SecurityGroup
  Properties:
    # NOTE(review): the text mentions the load balancer although this group is
    # attached to the instances (see LaunchConfig). Left unchanged; confirm
    # whether editing it would force the group to be replaced.
    GroupDescription: Open up HTTP access to load balancer
    VpcId: !Ref VpcId
    SecurityGroupIngress:
      - IpProtocol: tcp
        SourceSecurityGroupId: !Ref LoadBalancerSecurityGroup
        FromPort: "9000"
        ToPort: "9000"
    SecurityGroupEgress:
      - IpProtocol: tcp
        CidrIp: 0.0.0.0/0
        FromPort: "443"
        ToPort: "443"
# CloudWatch log group for the service, one per stage, kept for 14 days.
MembersDataApiLogGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: 14
    LogGroupName: !Sub "members-data-api-${Stage}"
# Counts log lines containing "Unsupported product rate plan id" as a
# CloudWatch metric (1 per match, 0 when nothing matches).
MembersDataUnsupportedProductRatePlanIdMetricFilter:
  Type: AWS::Logs::MetricFilter
  Properties:
    # Ref resolves to the log group's name and also orders creation after the
    # log group, so no separate DependsOn is needed.
    LogGroupName: !Ref MembersDataApiLogGroup
    FilterName: !Sub "members-data-api-${Stage}-unsupported-product-rate-plan-id-metric-filter"
    FilterPattern: '"Unsupported product rate plan id"'
    MetricTransformations:
      - MetricNamespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
        MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataUnsupportedProductRatePlanIdMetricFilterName]
        MetricValue: 1
        DefaultValue: 0
# Alarm on the unsupported-product-rate-plan-id metric: fires when at least one
# match is counted within a 600-second period. Only created when the
# CreateUnsupportedProductRatePlanAlarm condition holds.
MembersDataUnsupportedProductRatePlanIdAlarm:
  Type: AWS::CloudWatch::Alarm
  Condition: CreateUnsupportedProductRatePlanAlarm
  DependsOn:
    - MembersDataUnsupportedProductRatePlanIdMetricFilter
    - MembersDataApiLogGroup
  Properties:
    # Name is "<urgency prefix> <stage> <namespace> <summary>".
    AlarmName:
      Fn::Join:
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'Unsupported product rate plan id'
    # Impact statement followed by the shared process text.
    AlarmDescription:
      Fn::Join:
        - ' '
        - - "Impact - the /user-attributes/me output might be missing an attribute that should be set because a product rate plan id isn't supported."
          - !FindInMap [Constants, Alarm, Process]
    AlarmActions:
      - !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}"
    Namespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
    MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataUnsupportedProductRatePlanIdMetricFilterName]
    Statistic: Sum
    Period: 600
    EvaluationPeriods: 1
    ComparisonOperator: GreaterThanOrEqualToThreshold
    Threshold: 1
    TreatMissingData: notBreaching
# Counts log lines containing "default-payment-method-lost" as a CloudWatch
# metric (1 per match, 0 when nothing matches).
MembersDataDefaultPaymentMethodLeftEmptyMetricFilter:
  Type: AWS::Logs::MetricFilter
  Properties:
    # Ref resolves to the log group's name and also orders creation after the
    # log group, so no separate DependsOn is needed.
    LogGroupName: !Ref MembersDataApiLogGroup
    FilterName: !Sub "members-data-api-${Stage}-default-payment-method-empty-metric-filter"
    FilterPattern: '"default-payment-method-lost"'
    MetricTransformations:
      - MetricNamespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
        MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataDefaultPaymentMethodLeftEmptyMetricFilterName]
        MetricValue: 1
        DefaultValue: 0
# PROD-only alarm on the default-payment-method metric: fires when at least one
# match is counted within a 3600-second period.
MembersDataDefaultPaymentMethodLeftEmptyAlarm:
  Type: AWS::CloudWatch::Alarm
  Condition: CreateProdMonitoring
  DependsOn:
    - MembersDataDefaultPaymentMethodLeftEmptyMetricFilter
    - MembersDataApiLogGroup
  Properties:
    # Name is "<urgency prefix> <stage> <namespace> <summary>".
    AlarmName:
      Fn::Join:
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'Default Payment Method set to nothing'
    # Impact statement followed by the shared process text.
    AlarmDescription:
      Fn::Join:
        - ' '
        - - "Impact - a user has been left with no Default Payment method, so we can't take payment from them indefinitely"
          - !FindInMap [Constants, Alarm, Process]
    AlarmActions:
      - !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}"
    Namespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
    MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataDefaultPaymentMethodLeftEmptyMetricFilterName]
    Statistic: Sum
    Period: 3600
    EvaluationPeriods: 1
    ComparisonOperator: GreaterThanOrEqualToThreshold
    Threshold: 1
    TreatMissingData: notBreaching
# Counts log lines containing "Max wait queue limit of 256 reached, not
# scheduling." as a CloudWatch metric (1 per match, 0 when nothing matches).
MembersDataHttpQueuesFullMetricFilter:
  Type: AWS::Logs::MetricFilter
  Properties:
    # Ref resolves to the log group's name and also orders creation after the
    # log group, so no separate DependsOn is needed.
    LogGroupName: !Ref MembersDataApiLogGroup
    FilterName: !Sub "members-data-api-${Stage}-http-queues-full-metric-filter"
    FilterPattern: '"Max wait queue limit of 256 reached, not scheduling."'
    MetricTransformations:
      - MetricNamespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
        MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataHttpQueuesFullMetricFilterName]
        MetricValue: 1
        DefaultValue: 0
# Alarm on the http-client-queue-full metric: fires when 20 or more matches are
# counted within a 3600-second period. It has no Condition, so it is created in
# every stage.
MembersDataHttpQueuesFullAlarm:
  Type: AWS::CloudWatch::Alarm
  DependsOn:
    - MembersDataHttpQueuesFullMetricFilter
    - MembersDataApiLogGroup
  Properties:
    # Name is "<urgency prefix> <stage> <namespace> <summary>".
    AlarmName:
      Fn::Join:
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'Http Client Queue is full'
    # Impact statement followed by the shared process text.
    AlarmDescription:
      Fn::Join:
        - ' '
        - - "Impact - basically app is not responding (no longer serving traffic)"
          - !FindInMap [Constants, Alarm, Process]
    AlarmActions:
      - !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}"
    Namespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
    MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataHttpQueuesFullMetricFilterName]
    Statistic: Sum
    Period: 3600
    EvaluationPeriods: 1
    ComparisonOperator: GreaterThanOrEqualToThreshold
    Threshold: 20
    TreatMissingData: notBreaching
# Alarm on the share of backend 4XX responses at the load balancer: fires when
# the ratio is above 20 (percent) for 3 evaluation periods of 60 seconds. Only
# created when the Create4XXRatioAlarm condition holds.
4XXRatioAlarm:
  Type: AWS::CloudWatch::Alarm
  Condition: Create4XXRatioAlarm
  Properties:
    # Name is "<urgency prefix> <stage> <namespace> <summary>".
    AlarmName:
      Fn::Join:
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - '4XX Ratio has exceeded 20%'
    # Impact statement followed by the shared process text.
    AlarmDescription:
      Fn::Join:
        - ' '
        - - "Impact - we're serving 4XX for significant proportion of requests - indicative of an issue interacting with identity"
          - !FindInMap [Constants, Alarm, Process]
    AlarmActions:
      - !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}"
    ComparisonOperator: GreaterThanThreshold
    Threshold: 20
    EvaluationPeriods: "3"
    TreatMissingData: notBreaching
    Metrics:
      # The evaluated series: 4XX count as a percentage of all requests.
      - Id: ratio4XX
        Label: '4XX Ratio'
        Expression: "( http4XX / httpTotalRequests) * 100"
      # Inputs to the expression; ReturnData is false so they are not
      # themselves evaluated by the alarm.
      - Id: http4XX
        ReturnData: false
        MetricStat:
          Stat: Sum
          Unit: Count
          Period: 60
          Metric:
            Namespace: AWS/ELB
            MetricName: HTTPCode_Backend_4XX
            Dimensions:
              - Name: LoadBalancerName
                Value: !Ref LoadBalancer
      - Id: httpTotalRequests
        ReturnData: false
        MetricStat:
          Stat: Sum
          Unit: Count
          Period: 60
          Metric:
            Namespace: AWS/ELB
            MetricName: RequestCount
            Dimensions:
              - Name: LoadBalancerName
                Value: !Ref LoadBalancer
# PROD-only alarm: fires when the load balancer's average HealthyHostCount is
# at or below 0.5 for 10 evaluation periods of 60 seconds.
# The !Ref LoadBalancer in Dimensions already orders this resource after the
# load balancer, so no explicit DependsOn is listed.
NoHealthyInstancesAlarm:
  Type: AWS::CloudWatch::Alarm
  Condition: CreateProdMonitoring
  Properties:
    # Name is "<urgency prefix> <stage> <namespace> <summary>".
    AlarmName:
      Fn::Join:
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'No healthy instances'
    # Impact statement followed by the shared process text.
    AlarmDescription:
      Fn::Join:
        - ' '
        - - "Impact - members-data-api is DOWN"
          - !FindInMap [Constants, Alarm, Process]
    AlarmActions:
      - !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}"
    Namespace: AWS/ELB
    MetricName: HealthyHostCount
    Dimensions:
      - Name: LoadBalancerName
        Value: !Ref LoadBalancer
    Statistic: Average
    Period: 60
    EvaluationPeriods: 10
    ComparisonOperator: LessThanOrEqualToThreshold
    Threshold: 0.5
# PROD-only alarm on 5XX responses: fires when the combined backend + ELB 5XX
# count per 300-second period is above 50 in 2 of the last 4 periods.
# The !Ref LoadBalancer in Dimensions already orders this resource after the
# load balancer, so no explicit DependsOn is listed.
High5XXRateAlarm:
  Type: AWS::CloudWatch::Alarm
  Condition: CreateProdMonitoring
  Properties:
    # Name is "<urgency prefix> <stage> <namespace> <summary>".
    AlarmName:
      Fn::Join:
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'High 5XX rate'
    # Impact statement followed by the shared process text.
    AlarmDescription:
      Fn::Join:
        - ' '
        - - "Impact - we're serving errors to too many people, often a Zuora issue, but could be very serious"
          - !FindInMap [Constants, Alarm, Process]
    AlarmActions:
      - !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}"
    ComparisonOperator: GreaterThanThreshold
    Threshold: 50
    EvaluationPeriods: 4
    DatapointsToAlarm: 2
    TreatMissingData: notBreaching
    Metrics:
      # The evaluated series: sum of the two 5XX counts below.
      - Id: total5XX
        Label: 'Count of Backend AND ELB 5XX'
        Expression: backend5XX + elb5XX
      # Inputs to the expression; ReturnData is false so they are not
      # themselves evaluated by the alarm.
      - Id: backend5XX
        ReturnData: false
        MetricStat:
          Stat: Sum
          Unit: Count
          Period: 300
          Metric:
            Namespace: AWS/ELB
            MetricName: HTTPCode_Backend_5XX
            Dimensions:
              - Name: LoadBalancerName
                Value: !Ref LoadBalancer
      - Id: elb5XX
        ReturnData: false
        MetricStat:
          Stat: Sum
          Unit: Count
          # ELB sample rate appears to not work with a Period of 60 (i.e. 1min)
          Period: 300
          Metric:
            Namespace: AWS/ELB
            MetricName: HTTPCode_ELB_5XX
            Dimensions:
              - Name: LoadBalancerName
                Value: !Ref LoadBalancer
# PROD-only alarm on the SupporterProductDataDynamoError metric: fires when the
# sum over a 60-second period is at least 1.
# NOTE(review): unlike the other Sum-based alarms in this template, no
# TreatMissingData is set here — confirm how the alarm should behave while the
# metric has no datapoints.
SupporterProductDataDynamoErrorAlarm:
  Type: AWS::CloudWatch::Alarm
  Condition: CreateProdMonitoring
  Properties:
    # Fixed text followed by a space and the stage name.
    AlarmName: !Sub 'A DynamoReadError occurred while reading from the SupporterProductData DynamoDB table ${Stage}'
    AlarmDescription: 'There was at least one DynamoReadError returned while fetching supporter rate plan items from the SupporterProductData DynamoDB table, check logs for details.'
    AlarmActions:
      - !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}"
    Namespace: members-data-api
    MetricName: SupporterProductDataDynamoError
    Dimensions:
      - Name: Services
        Value: SupporterProductDataService
      - Name: Stage
        Value: !Ref Stage
    Statistic: Sum
    Period: 60
    EvaluationPeriods: 1
    ComparisonOperator: GreaterThanOrEqualToThreshold
    Threshold: 1
# Values exposed by the stack once it has been created.
Outputs:
  # DNS name of the load balancer.
  LoadBalancerUrl:
    Value: !GetAtt LoadBalancer.DNSName