blogs/eks-canary-deployments-pipeline/shared_stack/lambda_functions/gather_healthcheck_status/main.py (40 lines of code) (raw):
import boto3
import datetime
# Module-level CloudWatch client, created once when the module is imported;
# get_healthcheck_status uses it for its get_metric_data call.
cw = boto3.client('cloudwatch')
def lambda_handler(event, context):
    """Lambda entry point: run the 5XX health check for one microservice.

    Keys read from ``event``:

    * ``microservice_name`` (required): passed to get_healthcheck_status,
      which builds the CloudWatch metric name ``'5xx-<name>'`` from it.
    * ``failure_threshold_value`` (optional): the summed 5XX count must
      exceed this for the check to fail. Defaults to 0 when missing or falsy.
    * ``failure_threshold_time`` (optional): length, in seconds, of the
      look-back window. Defaults to 600 when missing or falsy.

    ``context`` is not used.

    Returns:
        The result of get_healthcheck_status: True when the check passes,
        False when the summed 5XX count exceeds the threshold.

    Raises:
        ValueError: if ``microservice_name`` is missing or empty.
    """
    microservice_name = event.get('microservice_name')
    if not microservice_name:
        # Without a name the lookup would target the metric '5xx-None',
        # which presumably has no datapoints, so the check would report
        # "healthy" for a malformed event. Fail loudly instead.
        raise ValueError("event must contain a non-empty 'microservice_name'")

    # `or` keeps the original semantics: any falsy value (missing key, None,
    # 0, '') falls back to the default.
    failure_threshold_value = event.get('failure_threshold_value') or 0
    failure_threshold_time = event.get('failure_threshold_time') or 600

    return get_healthcheck_status(microservice_name, failure_threshold_value, failure_threshold_time)
def get_healthcheck_status(microservice_name, failure_threshold_value, failure_threshold_time):
    """Return True if the microservice's recent 5XX count is within the threshold.

    Asks CloudWatch (through the module-level ``cw`` client) for the metric
    ``'5xx-<microservice_name>'`` in the ``EnvoyPrometheus/ResponseCode``
    namespace, summed per 60-second period, over the last
    ``failure_threshold_time`` seconds ending now.

    Args:
        microservice_name: suffix used to build the metric name.
        failure_threshold_value: the total of the returned values must be
            strictly greater than this for the check to fail.
        failure_threshold_time: length of the look-back window in seconds.

    Returns:
        False (after printing the total) when the sum of the returned values
        exceeds ``failure_threshold_value``; True otherwise, including when
        no values are returned.
    """
    window_end = datetime.datetime.now()
    window_start = window_end - datetime.timedelta(seconds=failure_threshold_time)
    end_epoch = window_end.timestamp()
    start_epoch = window_start.timestamp()

    # Single query: per-minute sums of the service's 5XX metric.
    error_count_query = {
        'Id': 'id1',
        'MetricStat': {
            'Metric': {
                'Namespace': 'EnvoyPrometheus/ResponseCode',
                'MetricName': '5xx-%s' % microservice_name,
            },
            'Period': 60,
            'Stat': 'Sum',
        },
        'ReturnData': True,
    }
    result = cw.get_metric_data(
        MetricDataQueries=[error_count_query],
        StartTime=start_epoch,
        EndTime=end_epoch,
        ScanBy='TimestampDescending',
    )

    # Only one query was sent, so only the first result entry is read.
    datapoints = result.get('MetricDataResults')[0].get('Values')
    if not datapoints:
        # No values in the window: the check passes.
        return True

    total_errors = sum(datapoints)
    if total_errors > failure_threshold_value:
        print('Found [%s] 5XX HTTP response code' % total_errors)
        return False
    return True