netbench-cdk/netbench-monitor/handler.py (55 lines of code) (raw):
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
import boto3
import logging
from datetime import datetime, timezone
from os import getenv
# Module-wide logger: the root logger, emitting INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Maximum instance lifetime in seconds. Defaults to one day and can be
# overridden by setting the MAX_LIFETIME environment variable.
MAX_LIFETIME: int = int(getenv("MAX_LIFETIME", 24 * 60 * 60))
def get_ebs_age(date_obj: datetime|str) -> int:
"""
Convert a date object or str to age in seconds
"""
# String to Date object conversion is done automajically with boto3
# but not with a standard json.load() from tests
if type(date_obj) == str:
date_format = '%Y-%m-%dT%H:%M:%S%z'
try:
date_obj = datetime.strptime(date_obj, date_format)
except ValueError as e:
print(f"The date format was unexpected: {e} ")
raise
now = datetime.now(tz=timezone.utc)
delta = now - date_obj
if delta.total_seconds() < 1:
raise ValueError(f"Date is in the future:{delta}")
else:
return int(delta.total_seconds())
def lambda_handler(event, context):
    """
    Use the ec2 describe-instances call to determine the age
    of all running instances. Emit whether an alarm is true,
    and a list of instances above and below threshold.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda runtime context (unused).

    Returns:
        The dict produced by process_describe_instances() for all
        running instances.
    """
    ec2_client = boto3.client('ec2')
    # Page through the results so that instances beyond the first page
    # of the API response are not missed.
    paginator = ec2_client.get_paginator('describe_instances')
    pages = paginator.paginate(
        Filters=[
            {'Name': 'instance-state-name',
             'Values': ["running"]
             }
        ])
    reservations: list = []
    for page in pages:
        reservations.extend(page.get('Reservations', []))

    # TODO: feat: do the instance cleanup, now.
    # if SOME_SAFETY_CONDITION:
    #     for instance in result["instances_above_max"]:
    #         terminate_instance(instance)
    return process_describe_instances({'Reservations': reservations})
def process_describe_instances(response: dict) -> dict:
    """
    Classify running instances by the age of their first block device.

    Walks every instance of every reservation in a describe-instances
    response and compares the attach time of its first block device
    mapping against MAX_LIFETIME.

    Args:
        response: A describe-instances style dict with a 'Reservations'
            list, each entry holding an 'Instances' list.

    Returns:
        A dict with the keys 'alarm_threshold' (MAX_LIFETIME),
        'overall_alarm' (True if any instance exceeds the threshold),
        'instances_below_max' and 'instances_above_max' (both mapping
        instance id to age in seconds).

    Raises:
        ValueError: If an instance has no 'BlockDeviceMappings' field.
    """
    instance_above_max: dict[str, int] = {}
    instance_below_max: dict[str, int] = {}
    alarm: bool = False
    for group in response['Reservations']:
        # A reservation may hold several instances; check each of them,
        # not just the first.
        for instance in group['Instances']:
            # This should never happen in the running state, so treat a
            # missing field as an error rather than skipping silently.
            if 'BlockDeviceMappings' not in instance:
                raise ValueError("Missing expected field BlockDeviceMapping; "
                                 + "running instances should have at least one "
                                 + "block device attached.")
            mappings = instance['BlockDeviceMappings']
            if not mappings:
                # No device attached: there is no attach time to measure.
                continue
            age = get_ebs_age(mappings[0]['Ebs']['AttachTime'])
            if age > MAX_LIFETIME:
                alarm = True
                instance_above_max[instance['InstanceId']] = age
            else:
                instance_below_max[instance['InstanceId']] = age
    return {"alarm_threshold": MAX_LIFETIME,
            "overall_alarm": alarm,
            "instances_below_max": instance_below_max,
            "instances_above_max": instance_above_max}
def terminate_instance(instance_id: str) -> int:
    """
    Terminate the given EC2 instance (placeholder, not yet implemented).

    Args:
        instance_id: Id of the instance to terminate.

    Raises:
        NotImplementedError: Always, until the cleanup feature exists.
    """
    # NotImplemented is a comparison sentinel, not an exception; raising
    # it would produce a TypeError instead of the intended signal.
    raise NotImplementedError("terminate_instance is not implemented yet")