cloudformation/membership-attribute-service.yaml (552 lines of code) (raw):

AWSTemplateFormatVersion: '2010-09-09'
Description: Membership Attributes service

# ---------------------------------------------------------------------------
# Stack inputs
# ---------------------------------------------------------------------------
Parameters:
  Stage:
    Type: String
    Description: Environment name
    Default: PROD
  InstanceType:
    Type: String
    Description: EC2 instance type
    Default: t4g.small
    AllowedValues:
      - t4g.small
    ConstraintDescription: must be a valid EC2 instance type.
  VpcId:
    Type: AWS::EC2::VPC::Id
    Description: ID of the VPC onto which to launch the application
    Default: vpc-e6e00183
  PrivateVpcSubnets:
    Type: List<AWS::EC2::Subnet::Id>
    Description: Private subnets to use for EC2 instances
  PublicVpcSubnets:
    Type: List<AWS::EC2::Subnet::Id>
    Description: Public subnets to use for the ELB
  AmiId:
    Type: String
    Description: Custom AMI to use for instances
  ELBSSLCertificate:
    Type: String
    Description: ELB SSL Certificate ARN
  SecurityGroupForPostgres:
    Type: String
    Description: Security group for querying the postgres database

# ---------------------------------------------------------------------------
# Lookup tables: stage-independent constants and per-stage settings
# ---------------------------------------------------------------------------
Mappings:
  # Values shared by every stage.
  Constants:
    Alarm:
      Process: Follow the process in https://docs.google.com/document/d/1_3El3cly9d7u_jPgTcRjLxmdG2e919zCLvmcFCLOYAk/edit
    MetricFilters:
      MetricNamespace: "members-data-api"

  # Values looked up with the Stage parameter as the second-level key.
  StageVariables:
    PROD:
      MaxInstances: 12 # This should be (at least) double the desired capacity.
      MinInstances: 3
      NotificationAlarmPeriod: 1200
      InstanceName: PROD:membership-attribute-service
      DynamoDBSupporterProductDataTables:
        - arn:aws:dynamodb:*:*:table/SupporterProductData-PROD
        - arn:aws:dynamodb:*:*:table/SupporterProductData-CODE
      ReadableS3Resources:
        - arn:aws:s3:::gu-membership-attribute-service-dist/membership/PROD/*
        - arn:aws:s3:::gu-reader-revenue-private/membership/members-data-api/PROD/*
      # The stage prefix is spelled out in the three metric names below because
      # !Sub cannot be used for Stage inside a mapping definition.
      MembersDataDefaultPaymentMethodLeftEmptyMetricFilterName: "PROD-default-payment-method-set-to-nothing"
      MembersDataUnsupportedProductRatePlanIdMetricFilterName: "PROD-unsupported-product-rate-plan-id-encountered"
      MembersDataHttpQueuesFullMetricFilterName: "PROD-http-client-queue-full"
      Urgent: "URGENT 9-5 -"
    CODE:
      MaxInstances: 2
      MinInstances: 1
      NotificationAlarmPeriod: 1200
      InstanceName: CODE:membership-attribute-service
      DynamoDBSupporterProductDataTables:
        - arn:aws:dynamodb:*:*:table/SupporterProductData-CODE
      ReadableS3Resources:
        - arn:aws:s3:::gu-membership-attribute-service-dist/membership/CODE/*
        - arn:aws:s3:::gu-reader-revenue-private/membership/members-data-api/CODE/*
      MembersDataDefaultPaymentMethodLeftEmptyMetricFilterName: "CODE-default-payment-method-set-to-nothing"
      MembersDataUnsupportedProductRatePlanIdMetricFilterName: "CODE-unsupported-product-rate-plan-id-encountered"
      MembersDataHttpQueuesFullMetricFilterName: "CODE-http-client-queue-full"
      Urgent: "Warning"

# ---------------------------------------------------------------------------
# Switches that decide which monitoring resources get created
# ---------------------------------------------------------------------------
Conditions:
  # True only when the stack is deployed with Stage=PROD.
  CreateProdMonitoring: !Equals [!Ref Stage, PROD]
  # Always false (1 != 2): the alarm is disabled until we can ensure promotions
  # are no longer put in Dynamo (which would trigger the alarm).
  CreateUnsupportedProductRatePlanAlarm: !Equals [1, 2]
  # Always false (1 != 2): disabled while we have a high rate of 401s for
  # android app requests.
  Create4XXRatioAlarm: !Equals [1, 2]

Resources:
  # -------------------------------------------------------------------------
  # IAM
  # -------------------------------------------------------------------------

  # Role assumed by the EC2 instances through InstanceProfile.
  MembershipRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: !Sub members-data-api-${Stage}
      Path: "/"
      AssumeRolePolicyDocument:
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - ec2.amazonaws.com
            Action:
              - sts:AssumeRole
      ManagedPolicyArns:
        - !Sub arn:aws:iam::${AWS::AccountId}:policy/guardian-ec2-role-for-ssm
      Policies:
        - PolicyName: root
          PolicyDocument:
            Statement:
              # Explicitly deny access to all S3 resources except for those defined in ReadableS3Resources
              # https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_evaluation-logic.html#policy-eval-denyallow
              - Effect: Deny
                Action: s3:*
                NotResource: !FindInMap [StageVariables, !Ref Stage, ReadableS3Resources]
              - Effect: Allow
                Action: s3:GetObject
                Resource: !FindInMap [StageVariables, !Ref Stage, ReadableS3Resources]
              # Read access to the SupporterProductData tables listed for this stage.
              - Effect: Allow
                Action:
                  - dynamodb:Query
                  - dynamodb:DescribeTable
                Resource: !FindInMap [StageVariables, !Ref Stage, DynamoDBSupporterProductDataTables]
              - Effect: Allow
                Action:
                  - cloudwatch:*
                Resource: "*"
              # Log writing is limited to this stack's own log group.
              - Effect: Allow
                Action:
                  - logs:CreateLogGroup
                  - logs:CreateLogStream
                  - logs:PutLogEvents
                Resource: !GetAtt MembersDataApiLogGroup.Arn
              # The queue ARN is imported from another stack's export; the long
              # form of ImportValue is kept because it wraps a !Sub.
              - Effect: Allow
                Action:
                  - sqs:GetQueueUrl
                  - sqs:SendMessage
                Resource:
                  Fn::ImportValue:
                    !Sub "comms-${Stage}-EmailQueueArn"

  SSMRunCommandPolicy:
    Type: AWS::IAM::Policy
    Properties:
      PolicyName: ssm-run-command-policy
      Roles:
        - !Ref MembershipRole
      PolicyDocument:
        Statement:
          # minimal policy to allow running commands via ssm
          - Effect: Allow
            Resource: "*"
            Action:
              - ec2messages:AcknowledgeMessage
              - ec2messages:DeleteMessage
              - ec2messages:FailMessage
              - ec2messages:GetEndpoint
              - ec2messages:GetMessages
              - ec2messages:SendReply
              - ssm:UpdateInstanceInformation
              - ssm:ListInstanceAssociations
              - ssm:DescribeInstanceProperties
              - ssm:DescribeDocumentParameters
              - ssmmessages:CreateControlChannel
              - ssmmessages:CreateDataChannel
              - ssmmessages:OpenControlChannel
              - ssmmessages:OpenDataChannel

  InstanceProfile:
    Type: AWS::IAM::InstanceProfile
    Properties:
      Path: "/"
      Roles:
        - !Ref MembershipRole

  # -------------------------------------------------------------------------
  # Load balancing and compute
  # -------------------------------------------------------------------------

  # Classic ELB: listens on HTTPS 443 and forwards to instance port 9000.
  LoadBalancer:
    Type: AWS::ElasticLoadBalancing::LoadBalancer
    Properties:
      Subnets: !Ref PublicVpcSubnets
      SecurityGroups:
        - !Ref LoadBalancerSecurityGroup
      CrossZone: 'true'
      Listeners:
        - Protocol: HTTPS
          LoadBalancerPort: '443'
          InstancePort: '9000'
          SSLCertificateId: !Ref ELBSSLCertificate
      ConnectionDrainingPolicy:
        Enabled: 'true'
        Timeout: '60'
      HealthCheck:
        Target: HTTP:9000/healthcheck
        HealthyThreshold: '2'
        UnhealthyThreshold: '3'
        Interval: '10'
        Timeout: '5'

  AutoscalingGroup:
    Type: AWS::AutoScaling::AutoScalingGroup
    Properties:
      AutoScalingGroupName: !Sub members-data-api-${Stage}
      LaunchConfigurationName: !Ref LaunchConfig
      VPCZoneIdentifier: !Ref PrivateVpcSubnets
      LoadBalancerNames:
        - !Ref LoadBalancer
      # Group size bounds come from the per-stage mapping.
      MinSize: !FindInMap [StageVariables, !Ref Stage, MinInstances]
      MaxSize: !FindInMap [StageVariables, !Ref Stage, MaxInstances]
      HealthCheckType: ELB
      HealthCheckGracePeriod: 400
      Tags:
        - Key: Stage
          Value: !Ref Stage
          PropagateAtLaunch: 'true'
        - Key: Name
          Value: !FindInMap [StageVariables, !Ref Stage, InstanceName]
          PropagateAtLaunch: 'true'
        - Key: Stack
          Value: membership
          PropagateAtLaunch: 'true'
        - Key: App
          Value: membership-attribute-service
          PropagateAtLaunch: 'true'
        - Key: Role
          Value: membership-attribute-service
          PropagateAtLaunch: 'true'
        - Key: Mainclass
          Value: membership-attribute-service
          PropagateAtLaunch: 'true'

  LaunchConfig:
    Type: AWS::AutoScaling::LaunchConfiguration
    Properties:
      ImageId: !Ref AmiId
      InstanceType: !Ref InstanceType
      IamInstanceProfile: !Ref InstanceProfile
      AssociatePublicIpAddress: 'False'
      SecurityGroups:
        - !Ref InstanceSecurityGroup
        - !Ref SecurityGroupForPostgres
      MetadataOptions:
        HttpTokens: required
      # Boot script: installs the application package from S3, fetches the
      # private config for this stage, then configures log shipping.
      # NOTE(review): the line breaks of this script were reconstructed from a
      # whitespace-collapsed copy of the template - confirm them against the
      # deployed version before applying.
      UserData:
        Fn::Base64: !Sub |
          #!/bin/bash -ev
          CONF_DIR=/etc/membership-attribute-service
          aws s3 cp s3://gu-membership-attribute-service-dist/membership/${Stage}/membership-attribute-service/membership-attribute-service_1.0-SNAPSHOT_all.deb /tmp
          dpkg -i /tmp/membership-attribute-service_1.0-SNAPSHOT_all.deb
          mkdir -p /etc/gu
          aws s3 cp s3://gu-reader-revenue-private/membership/members-data-api/${Stage}/members-data-api.private.conf /etc/gu
          chown membership-attribute-service /etc/gu/members-data-api.private.conf
          chmod 0600 /etc/gu/members-data-api.private.conf
          /opt/cloudwatch-logs/configure-logs application members ${Stage} data-api /var/log/membership-attribute-service/membership-attribute-service.log;

  # -------------------------------------------------------------------------
  # Networking
  # -------------------------------------------------------------------------

  # Inbound 443 from anywhere; outbound only on the application port 9000.
  LoadBalancerSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      VpcId: !Ref VpcId
      GroupDescription: Open up HTTPS access to load balancer
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: '443'
          ToPort: '443'
          CidrIp: 0.0.0.0/0
      SecurityGroupEgress:
        - IpProtocol: tcp
          FromPort: '9000'
          ToPort: '9000'
          CidrIp: 0.0.0.0/0

  # Inbound 9000 from the load balancer's security group only; outbound 443.
  InstanceSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      VpcId: !Ref VpcId
      GroupDescription: Open up HTTP access to load balancer
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: '9000'
          ToPort: '9000'
          SourceSecurityGroupId: !Ref LoadBalancerSecurityGroup
      SecurityGroupEgress:
        - IpProtocol: tcp
          FromPort: '443'
          ToPort: '443'
          CidrIp: 0.0.0.0/0

  # -------------------------------------------------------------------------
  # Logging, metric filters and log-derived alarms
  # -------------------------------------------------------------------------

  MembersDataApiLogGroup:
    Type: AWS::Logs::LogGroup
    Properties:
      LogGroupName: !Sub members-data-api-${Stage}
      RetentionInDays: 14

  # Emits 1 for each log event containing the quoted phrase, 0 otherwise.
  MembersDataUnsupportedProductRatePlanIdMetricFilter:
    Type: AWS::Logs::MetricFilter
    DependsOn: MembersDataApiLogGroup
    Properties:
      FilterName: !Sub members-data-api-${Stage}-unsupported-product-rate-plan-id-metric-filter
      LogGroupName: !Sub members-data-api-${Stage}
      FilterPattern: '"Unsupported product rate plan id"'
      MetricTransformations:
        - MetricNamespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
          MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataUnsupportedProductRatePlanIdMetricFilterName]
          MetricValue: 1
          DefaultValue: 0

  # Currently never created: its condition is hard-wired to false.
  MembersDataUnsupportedProductRatePlanIdAlarm:
    Type: AWS::CloudWatch::Alarm
    Condition: CreateUnsupportedProductRatePlanAlarm
    DependsOn:
      - MembersDataUnsupportedProductRatePlanIdMetricFilter
      - MembersDataApiLogGroup
    Properties:
      AlarmName: !Join
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'Unsupported product rate plan id'
      AlarmDescription: !Join
        - ' '
        - - "Impact - the /user-attributes/me output might be missing an attribute that should be set because a product rate plan id isn't supported."
          - !FindInMap [Constants, Alarm, Process]
      AlarmActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}
      Namespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
      MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataUnsupportedProductRatePlanIdMetricFilterName]
      Statistic: Sum
      Period: 600
      EvaluationPeriods: 1
      ComparisonOperator: GreaterThanOrEqualToThreshold
      Threshold: 1
      TreatMissingData: notBreaching

  MembersDataDefaultPaymentMethodLeftEmptyMetricFilter:
    Type: AWS::Logs::MetricFilter
    DependsOn: MembersDataApiLogGroup
    Properties:
      FilterName: !Sub members-data-api-${Stage}-default-payment-method-empty-metric-filter
      LogGroupName: !Sub members-data-api-${Stage}
      FilterPattern: '"default-payment-method-lost"'
      MetricTransformations:
        - MetricNamespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
          MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataDefaultPaymentMethodLeftEmptyMetricFilterName]
          MetricValue: 1
          DefaultValue: 0

  # PROD only: fires when at least one matching log event is seen in an hour.
  MembersDataDefaultPaymentMethodLeftEmptyAlarm:
    Type: AWS::CloudWatch::Alarm
    Condition: CreateProdMonitoring
    DependsOn:
      - MembersDataDefaultPaymentMethodLeftEmptyMetricFilter
      - MembersDataApiLogGroup
    Properties:
      AlarmName: !Join
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'Default Payment Method set to nothing'
      AlarmDescription: !Join
        - ' '
        - - "Impact - a user has been left with no Default Payment method, so we can't take payment from them indefinitely"
          - !FindInMap [Constants, Alarm, Process]
      AlarmActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}
      Namespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
      MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataDefaultPaymentMethodLeftEmptyMetricFilterName]
      Statistic: Sum
      Period: 3600
      EvaluationPeriods: 1
      ComparisonOperator: GreaterThanOrEqualToThreshold
      Threshold: 1
      TreatMissingData: notBreaching

  MembersDataHttpQueuesFullMetricFilter:
    Type: AWS::Logs::MetricFilter
    DependsOn: MembersDataApiLogGroup
    Properties:
      FilterName: !Sub members-data-api-${Stage}-http-queues-full-metric-filter
      LogGroupName: !Sub members-data-api-${Stage}
      FilterPattern: '"Max wait queue limit of 256 reached, not scheduling."'
      MetricTransformations:
        - MetricNamespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
          MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataHttpQueuesFullMetricFilterName]
          MetricValue: 1
          DefaultValue: 0

  # No Condition on this alarm, so it is created for every stage.
  # Fires at 20 or more matching log events within an hour.
  MembersDataHttpQueuesFullAlarm:
    Type: AWS::CloudWatch::Alarm
    DependsOn:
      - MembersDataHttpQueuesFullMetricFilter
      - MembersDataApiLogGroup
    Properties:
      AlarmName: !Join
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'Http Client Queue is full'
      AlarmDescription: !Join
        - ' '
        - - "Impact - basically app is not responding (no longer serving traffic)"
          - !FindInMap [Constants, Alarm, Process]
      AlarmActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}
      Namespace: !FindInMap [Constants, MetricFilters, MetricNamespace]
      MetricName: !FindInMap [StageVariables, !Ref Stage, MembersDataHttpQueuesFullMetricFilterName]
      Statistic: Sum
      Period: 3600
      EvaluationPeriods: 1
      ComparisonOperator: GreaterThanOrEqualToThreshold
      Threshold: 20
      TreatMissingData: notBreaching

  # -------------------------------------------------------------------------
  # Load balancer alarms
  # -------------------------------------------------------------------------

  # Currently never created: its condition is hard-wired to false.
  # Metric math: backend 4XX count as a percentage of all requests, per minute.
  4XXRatioAlarm:
    Type: AWS::CloudWatch::Alarm
    Condition: Create4XXRatioAlarm
    Properties:
      AlarmName: !Join
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - '4XX Ratio has exceeded 20%'
      AlarmDescription: !Join
        - ' '
        - - "Impact - we're serving 4XX for significant proportion of requests - indicative of an issue interacting with identity"
          - !FindInMap [Constants, Alarm, Process]
      AlarmActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}
      ComparisonOperator: GreaterThanThreshold
      Threshold: 20
      EvaluationPeriods: '3'
      TreatMissingData: notBreaching
      Metrics:
        # The expression is the series the alarm evaluates; the two inputs
        # below set ReturnData to false.
        - Id: ratio4XX
          Label: "4XX Ratio"
          Expression: "( http4XX / httpTotalRequests) * 100"
        - Id: http4XX
          ReturnData: false
          MetricStat:
            Period: 60
            Stat: Sum
            Unit: Count
            Metric:
              Namespace: AWS/ELB
              MetricName: HTTPCode_Backend_4XX
              Dimensions:
                - Name: LoadBalancerName
                  Value: !Ref LoadBalancer
        - Id: httpTotalRequests
          ReturnData: false
          MetricStat:
            Period: 60
            Stat: Sum
            Unit: Count
            Metric:
              Namespace: AWS/ELB
              MetricName: RequestCount
              Dimensions:
                - Name: LoadBalancerName
                  Value: !Ref LoadBalancer

  # PROD only: average healthy host count at or below 0.5 for ten
  # consecutive one-minute periods.
  NoHealthyInstancesAlarm:
    Type: AWS::CloudWatch::Alarm
    Condition: CreateProdMonitoring
    DependsOn:
      - LoadBalancer
    Properties:
      AlarmName: !Join
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'No healthy instances'
      AlarmDescription: !Join
        - ' '
        - - "Impact - members-data-api is DOWN"
          - !FindInMap [Constants, Alarm, Process]
      AlarmActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}
      Namespace: AWS/ELB
      MetricName: HealthyHostCount
      Dimensions:
        - Name: LoadBalancerName
          Value: !Ref LoadBalancer
      Statistic: Average
      Period: 60
      EvaluationPeriods: 10
      ComparisonOperator: LessThanOrEqualToThreshold
      Threshold: 0.5

  # PROD only: backend + ELB 5XX responses summed over five-minute periods;
  # alarms when 2 of the last 4 periods exceed 50.
  High5XXRateAlarm:
    Type: AWS::CloudWatch::Alarm
    Condition: CreateProdMonitoring
    DependsOn:
      - LoadBalancer
    Properties:
      AlarmName: !Join
        - ' '
        - - !FindInMap [StageVariables, !Ref Stage, Urgent]
          - !Ref Stage
          - !FindInMap [Constants, MetricFilters, MetricNamespace]
          - 'High 5XX rate'
      AlarmDescription: !Join
        - ' '
        - - "Impact - we're serving errors to too many people, often a Zuora issue, but could be very serious"
          - !FindInMap [Constants, Alarm, Process]
      AlarmActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}
      ComparisonOperator: GreaterThanThreshold
      Threshold: 50
      EvaluationPeriods: 4
      DatapointsToAlarm: 2
      TreatMissingData: notBreaching
      Metrics:
        - Id: total5XX
          Label: "Count of Backend AND ELB 5XX"
          Expression: backend5XX + elb5XX
        - Id: backend5XX
          ReturnData: false
          MetricStat:
            Period: 300
            Stat: Sum
            Unit: Count
            Metric:
              Namespace: AWS/ELB
              MetricName: HTTPCode_Backend_5XX
              Dimensions:
                - Name: LoadBalancerName
                  Value: !Ref LoadBalancer
        - Id: elb5XX
          ReturnData: false
          MetricStat:
            Period: 300 # ELB sample rate appears to not work with a Period of 60 (i.e. 1min)
            Stat: Sum
            Unit: Count
            Metric:
              Namespace: AWS/ELB
              MetricName: HTTPCode_ELB_5XX
              Dimensions:
                - Name: LoadBalancerName
                  Value: !Ref LoadBalancer

  # -------------------------------------------------------------------------
  # Application-emitted metric alarms
  # -------------------------------------------------------------------------

  # PROD only: fires on a one-minute sum of 1 or more for the
  # SupporterProductDataDynamoError metric in the members-data-api namespace.
  SupporterProductDataDynamoErrorAlarm:
    Type: AWS::CloudWatch::Alarm
    Condition: CreateProdMonitoring
    Properties:
      AlarmName: !Join
        - ' '
        - - 'A DynamoReadError occurred while reading from the SupporterProductData DynamoDB table'
          - !Ref Stage
      AlarmDescription: !Join
        - ' '
        - - 'There was at least one DynamoReadError returned while fetching supporter rate plan items from the SupporterProductData DynamoDB table, check logs for details.'
      AlarmActions:
        - !Sub arn:aws:sns:${AWS::Region}:${AWS::AccountId}:alarms-handler-topic-${Stage}
      Namespace: members-data-api
      MetricName: SupporterProductDataDynamoError
      Dimensions:
        - Name: Services
          Value: SupporterProductDataService
        - Name: Stage
          Value: !Ref Stage
      Statistic: Sum
      Period: 60
      EvaluationPeriods: 1
      ComparisonOperator: GreaterThanOrEqualToThreshold
      Threshold: 1

# ---------------------------------------------------------------------------
# Stack outputs
# ---------------------------------------------------------------------------
Outputs:
  # DNS name of the classic load balancer.
  LoadBalancerUrl:
    Value: !GetAtt LoadBalancer.DNSName