in source/soca/cluster_web_ui/scheduled_tasks/manage_dcv_instances_lifecycle.py [0:0]
def linux_auto_stop_instance(instances_to_check):
    """Stop or hibernate idle Linux DCV instances.

    For every running host returned by ``retrieve_host`` the DCV session
    state and a short CPU average are collected on the instance through SSM.
    The instance is stopped (or hibernated when ``hibernate_enabled`` is
    True) only when all of the following hold:

    * the configured idle limit for the chosen action is greater than 0,
    * the reported CPU average is below ``DCV_IDLE_CPU_THRESHOLD``,
    * the DCV session reports zero connections,
    * the last disconnection (or the session creation when nobody ever
      connected) is older than the idle limit, expressed in hours.

    A failure on one instance is logged and the sweep moves on to the next
    instance.

    Args:
        instances_to_check: forwarded unchanged to ``retrieve_host``.
            NOTE(review): its exact shape is defined by ``retrieve_host``,
            which is not visible from this function.
    """
    with db.app.app_context():
        logger.info(f"Scheduled Task: linux_auto_stop_instance {instances_to_check}")
        get_host_to_stop = retrieve_host(instances_to_check, "running")
        logger.info("linux_auto_stop_instance: List of Linux DCV hosts subject to stop/hibernate {}".format(get_host_to_stop))

        # Each host is evaluated exactly once, with its own session uuid,
        # action and idle limit.
        for instance_id, instance_data in get_host_to_stop.items():
            if instance_data["hibernate_enabled"] is True:
                action = "hibernate"
                stop_instance_after = config.Config.DCV_LINUX_HIBERNATE_IDLE_SESSION
            else:
                action = "stop"
                stop_instance_after = config.Config.DCV_LINUX_STOP_IDLE_SESSION

            logger.info("linux_auto_stop_instance: Trying to {} instance {} if idle for more than {} hours and CPU % is below {}".format(action, instance_id, stop_instance_after, config.Config.DCV_IDLE_CPU_THRESHOLD))
            if stop_instance_after <= 0:
                # A non-positive idle limit means this action is disabled.
                continue

            logger.info("Checking Instance ID: {}".format(instance_id))
            session_info = _linux_dcv_query_session(instance_id, instance_data["session_uuid"])
            if session_info is None:
                # The helper already logged why the probe failed.
                continue
            logger.info(session_info)

            try:
                dcv_info = session_info["DCV"]
                session_current_connection = dcv_info["num-of-connections"]
                if dcv_info["last-disconnection-time"] == "":
                    # handle case where user launched DCV but never accessed it
                    last_dcv_disconnect = parse(dcv_info["creation-time"])
                else:
                    last_dcv_disconnect = parse(dcv_info["last-disconnection-time"])
                session_cpu_average = session_info["CPUAveragePerformanceLast10Secs"]
            except (KeyError, TypeError, ValueError) as e:
                logger.error("linux_auto_stop_instance: Unable to parse DCV session information for {} : {}".format(instance_id, e))
                continue

            if session_cpu_average >= config.Config.DCV_IDLE_CPU_THRESHOLD:
                logger.info("linux_auto_stop_instance: CPU usage {} is above threshold {} so this host won't be subject to {}.".format(session_cpu_average, config.Config.DCV_IDLE_CPU_THRESHOLD, action))
                continue

            if session_current_connection != 0:
                logger.info("linux_auto_stop_instance: {} currently has active DCV sessions".format(instance_id))
                continue

            # Real UTC "now"; labelling local wall-clock time as UTC would skew
            # the comparison on hosts whose timezone is not UTC.
            current_time = datetime.now(timezone.utc).replace(microsecond=0)
            if (last_dcv_disconnect + timedelta(hours=stop_instance_after)) >= current_time:
                logger.info("linux_auto_stop_instance: {} NOT ready for {}. Last access time {}".format(instance_id, action, last_dcv_disconnect))
                continue

            logger.info("linux_auto_stop_instance: {} is ready for {}. Last access time {}".format(instance_id, action, last_dcv_disconnect))
            if _linux_dcv_stop_instance(instance_id, action):
                _linux_dcv_mark_session_stopped(instance_id)


def _linux_dcv_query_session(instance_id, session_uuid):
    """Run the DCV/CPU probe on ``instance_id`` via SSM and return its parsed JSON output.

    Returns the decoded dict on success, or None (after logging the reason)
    when the command could not be sent, did not succeed in time, or produced
    output that is not valid JSON.
    """
    shell_commands = [
        "DCV_Describe_Session=$(dcv describe-session " + str(session_uuid) + " -j)",
        "CPUAveragePerformanceLast10Secs=$(top -d 5 -b -n2 | grep 'Cpu(s)' |tail -n 1 | awk '{print $2 + $4}')",
        # The shell variables sit outside the single quotes so they are expanded
        # into the JSON document printed on stdout.
        "echo '{\"DCV\": '$DCV_Describe_Session' , \"CPUAveragePerformanceLast10Secs\": '$CPUAveragePerformanceLast10Secs'}'"]

    # Stays None when send_command fails so the final error log never hits an
    # unbound (or stale) command id.
    ssm_command_id = None
    try:
        check_dcv_session = client_ssm.send_command(InstanceIds=[instance_id],
                                                    DocumentName='AWS-RunShellScript',
                                                    Parameters={"commands": shell_commands},
                                                    TimeoutSeconds=30)
    except ClientError as e:
        logger.error("Unable to query SSM for {} : {}".format(instance_id, e))
        if "InvalidInstanceId" in str(e):
            logger.error("linux_auto_stop_instance: Instance is not in Running state or SSM daemon is not running. This instance is probably still starting up ...")
    else:
        ssm_command_id = check_dcv_session["Command"]["CommandId"]
        if _linux_dcv_wait_for_ssm_command(ssm_command_id, instance_id):
            try:
                ssm_output = client_ssm.get_command_invocation(CommandId=ssm_command_id, InstanceId=instance_id)
                return json.loads(ssm_output["StandardOutputContent"])
            except (ClientError, KeyError, ValueError) as e:
                logger.error("linux_auto_stop_instance: Unable to read SSM output for {} with ssm_id {} : {}".format(instance_id, ssm_command_id, e))

    logger.error("linux_auto_stop_instance: SSM failed for {} with ssm_id {}".format(instance_id, ssm_command_id))
    return None


def _linux_dcv_wait_for_ssm_command(ssm_command_id, instance_id):
    """Poll SSM until the command reports "Success"; return True on success, False otherwise.

    The status is checked up to 6 times, 20 seconds apart (2 minutes total).
    """
    max_polls = 6
    poll_delay_seconds = 20
    # Statuses after which the command will never succeed, so waiting is pointless.
    terminal_failures = ("Failed", "Cancelled", "TimedOut")

    commands = []
    for _ in range(max_polls):
        commands = client_ssm.list_commands(CommandId=ssm_command_id)['Commands']
        # NOTE(review): an empty list is treated as "not finished yet" on the
        # assumption that a freshly sent command may not be listed immediately.
        check_command_status = commands[0]['Status'] if commands else "Pending"
        if check_command_status == "Success":
            return True

        logger.info("linux_auto_stop_instance: SSM command ({}) executed but did not succeed or failed yet. Waiting 20 seconds ... {} ".format(ssm_command_id, commands))
        if check_command_status in terminal_failures:
            logger.error("linux_auto_stop_instance: Unable to query DCV for {} with SSM id {}".format(instance_id, ssm_command_id))
            return False
        time.sleep(poll_delay_seconds)

    logger.error("linux_auto_stop_instance: Unable to determine status SSM responses after 2 minutes timeout for {} : {} ".format(ssm_command_id, commands))
    return False


def _linux_dcv_stop_instance(instance_id, action):
    """Stop (or hibernate when ``action`` is "hibernate") one EC2 instance; return True when the request was accepted."""
    stop_parameters = {"InstanceIds": [instance_id]}
    if action == "hibernate":
        stop_parameters["Hibernate"] = True

    # The dry run is a permission check: it is expected to raise a ClientError
    # whose code is "DryRunOperation" when the real call would be allowed.
    try:
        client_ec2.stop_instances(DryRun=True, **stop_parameters)
    except ClientError as e:
        if e.response['Error'].get('Code') != 'DryRunOperation':
            logger.error("linux_auto_stop_instance: Unable to {} instance ({}) due to {}".format(action, instance_id, e))
            return False
    else:
        # A dry run that does not raise gives no confirmation, so nothing is stopped.
        return False

    try:
        client_ec2.stop_instances(**stop_parameters)
    except ClientError as e:
        logger.error("linux_auto_stop_instance: Unable to {} instance ({}) due to {}".format(action, instance_id, e))
        return False

    logger.info("linux_auto_stop_instance: Stopped {}".format(instance_id))
    return True


def _linux_dcv_mark_session_stopped(instance_id):
    """Flag the active, running DCV session row of ``instance_id`` as "stopped" in the database."""
    try:
        check_session = LinuxDCVSessions.query.filter_by(session_instance_id=instance_id,
                                                         session_state="running",
                                                         is_active=True).first()
        if check_session:
            check_session.session_state = "stopped"
            db.session.commit()
            logger.info("linux_auto_stop_instance: DB entry updated")
        else:
            logger.error("linux_auto_stop_instance: Instance ({}) has been stopped but could not find associated database entry".format(instance_id))
    except Exception as e:
        # Best effort: the instance is already stopped, so a database failure
        # is reported but must not abort the sweep over the remaining hosts.
        logger.error("linux_auto_stop_instance: SQL Query error: {}".format(e))