def linux_auto_stop_instance()

in source/soca/cluster_web_ui/scheduled_tasks/manage_dcv_instances_lifecycle.py [0:0]


def _linux_auto_stop_wait_for_ssm(ssm_command_id, instance_id, max_attempts=6, poll_interval=20):
    """Poll SSM until the command succeeds, fails terminally, or attempts run out.

    Args:
        ssm_command_id: Command id returned by ``client_ssm.send_command``.
        instance_id: EC2 instance the command was sent to (used for logging only).
        max_attempts: Maximum number of status checks.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        True only when SSM reports the command status as "Success".
    """
    commands = []
    for _ in range(max_attempts):
        commands = client_ssm.list_commands(CommandId=ssm_command_id)['Commands']
        # An empty list would raise IndexError on [0]; treat it as still pending.
        check_command_status = commands[0]['Status'] if commands else "Pending"
        if check_command_status == "Success":
            return True

        # Terminal failure states: no point waiting for the full timeout.
        if check_command_status in ("Failed", "Cancelled", "TimedOut"):
            logger.error("linux_auto_stop_instance: Unable to query DCV for {} with SSM id {} (status: {})".format(instance_id, ssm_command_id, check_command_status))
            return False

        logger.info("linux_auto_stop_instance: SSM command ({}) executed but did not succeed or failed yet. Waiting {} seconds ... {} ".format(ssm_command_id, poll_interval, commands))
        time.sleep(poll_interval)

    logger.error("linux_auto_stop_instance: Unable to determine status SSM responses after {} seconds timeout for {} : {} ".format(max_attempts * poll_interval, ssm_command_id, commands))
    return False


def _linux_auto_stop_query_session(instance_id, session_uuid):
    """Run the DCV/CPU probe on the instance through SSM and return its parsed JSON output.

    Returns:
        The decoded JSON document printed by the probe (a dict with the keys
        "DCV" and "CPUAveragePerformanceLast10Secs"), or None when the SSM
        command could not be sent, did not succeed, or returned invalid JSON.
    """
    # NOTE(review): session_uuid is concatenated into a shell command; it is
    # presumably a trusted value coming from retrieve_host() -- confirm.
    shell_commands = [
        "DCV_Describe_Session=$(dcv describe-session " + str(session_uuid) + " -j)",
        "CPUAveragePerformanceLast10Secs=$(top -d 5 -b -n2 | grep 'Cpu(s)' |tail -n 1 | awk '{print $2 + $4}')",
        "echo '{\"DCV\": '"'$DCV_Describe_Session'"' , \"CPUAveragePerformanceLast10Secs\": '"'$CPUAveragePerformanceLast10Secs'"'}'"]

    try:
        check_dcv_session = client_ssm.send_command(InstanceIds=[instance_id],
                                                    DocumentName='AWS-RunShellScript',
                                                    Parameters={"commands": shell_commands},
                                                    TimeoutSeconds=30)
    except ClientError as e:
        logger.error("Unable to query SSM for {} : {}".format(instance_id, e))
        if "InvalidInstanceId" in str(e):
            logger.error("linux_auto_stop_instance: Instance is not in Running state or SSM daemon is not running. This instance is probably still starting up ...")
        return None

    ssm_command_id = check_dcv_session["Command"]["CommandId"]
    try:
        if not _linux_auto_stop_wait_for_ssm(ssm_command_id, instance_id):
            logger.error("linux_auto_stop_instance: SSM failed for {} with ssm_id {}".format(instance_id, ssm_command_id))
            return None
        ssm_output = client_ssm.get_command_invocation(CommandId=ssm_command_id, InstanceId=instance_id)
    except ClientError as e:
        # Keep one misbehaving instance from aborting the whole scheduled task.
        logger.error("linux_auto_stop_instance: SSM failed for {} with ssm_id {} : {}".format(instance_id, ssm_command_id, e))
        return None

    try:
        return json.loads(ssm_output["StandardOutputContent"])
    except (KeyError, TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; raised e.g. when one of the shell variables is empty.
        logger.error("linux_auto_stop_instance: Unable to parse SSM output for {} with ssm_id {} : {}".format(instance_id, ssm_command_id, e))
        return None


def _linux_auto_stop_stop_instance(instance_id, action):
    """Stop (or hibernate when action == "hibernate") the instance and mark its DB session as stopped."""
    stop_parameters = {"InstanceIds": [instance_id]}
    if action == "hibernate":
        stop_parameters["Hibernate"] = True

    # Dry run first: it is expected to raise ClientError, with the code
    # "DryRunOperation" when the real call would be allowed.
    try:
        client_ec2.stop_instances(DryRun=True, **stop_parameters)
    except ClientError as e:
        if e.response['Error'].get('Code') != 'DryRunOperation':
            logger.error("linux_auto_stop_instance: Unable to {} instance ({}) due to {}".format(action, instance_id, e))
            return
    else:
        logger.error("linux_auto_stop_instance: Dry run for {} of instance ({}) did not return DryRunOperation, skipping".format(action, instance_id))
        return

    try:
        client_ec2.stop_instances(**stop_parameters)
    except ClientError as e:
        logger.error("linux_auto_stop_instance: Unable to {} instance ({}) due to {}".format(action, instance_id, e))
        return

    logger.info("linux_auto_stop_instance: Stopped {}".format(instance_id))
    try:
        check_session = LinuxDCVSessions.query.filter_by(session_instance_id=instance_id,
                                                         session_state="running",
                                                         is_active=True).first()
        if check_session:
            check_session.session_state = "stopped"
            db.session.commit()
            logger.info("linux_auto_stop_instance: DB entry updated")
        else:
            logger.error("linux_auto_stop_instance: Instance ({}) has been stopped but could not find associated database entry".format(instance_id))
    except Exception as e:
        # Leave the session usable for the next instance after a failed query/commit.
        db.session.rollback()
        logger.error("linux_auto_stop_instance: SQL Query error: {}".format(e))


def linux_auto_stop_instance(instances_to_check):
    """Stop or hibernate idle Linux DCV instances.

    For every running host returned by ``retrieve_host(instances_to_check, "running")``
    the DCV session and the CPU usage are queried through SSM. The instance is
    stopped (or hibernated when ``hibernate_enabled`` is True) when all of these hold:

    * the matching idle threshold (``DCV_LINUX_HIBERNATE_IDLE_SESSION`` or
      ``DCV_LINUX_STOP_IDLE_SESSION``, in hours) is greater than 0,
    * the measured CPU usage is below ``DCV_IDLE_CPU_THRESHOLD``,
    * the DCV session has no active connection,
    * the last disconnection (or the session creation when the session was
      never accessed) is older than the idle threshold.

    Args:
        instances_to_check: Forwarded as-is to ``retrieve_host``.

    Returns:
        None. Failures on one instance are logged and the next instance is processed.
    """
    with db.app.app_context():
        logger.info(f"Scheduled Task: linux_auto_stop_instance {instances_to_check}")
        get_host_to_stop = retrieve_host(instances_to_check, "running")
        logger.info("linux_auto_stop_instance: List of Linux DCV hosts subject to stop/hibernate {}".format(get_host_to_stop))
        for instance_id, instance_data in get_host_to_stop.items():
            if instance_data["hibernate_enabled"] is True:
                action = "hibernate"
                stop_instance_after = config.Config.DCV_LINUX_HIBERNATE_IDLE_SESSION
            else:
                action = "stop"
                stop_instance_after = config.Config.DCV_LINUX_STOP_IDLE_SESSION

            logger.info("linux_auto_stop_instance: Trying to {} instance {} if idle for more than {} hours and  CPU % is below {}".format(action, instance_id, stop_instance_after, config.Config.DCV_IDLE_CPU_THRESHOLD))
            if stop_instance_after <= 0:
                # Auto stop/hibernate is disabled for this kind of session.
                continue

            # Each host is checked exactly once, with its own session_uuid/action.
            logger.info("Checking Instance ID: {}".format(instance_id))
            session_info = _linux_auto_stop_query_session(instance_id, instance_data["session_uuid"])
            if session_info is None:
                continue

            logger.info(session_info)
            try:
                dcv_session = session_info["DCV"]
                session_current_connection = dcv_session["num-of-connections"]
                # An empty last-disconnection-time means the user launched DCV but never accessed it.
                last_dcv_disconnect = parse(dcv_session.get("last-disconnection-time") or dcv_session["creation-time"])
                session_cpu_average = session_info["CPUAveragePerformanceLast10Secs"]
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                logger.error("linux_auto_stop_instance: Unexpected DCV session information for {} : {}".format(instance_id, e))
                continue

            if session_cpu_average >= config.Config.DCV_IDLE_CPU_THRESHOLD:
                logger.info("linux_auto_stop_instance: CPU usage {} is above threshold {} so this host won't be subject to {}.".format(session_cpu_average, config.Config.DCV_IDLE_CPU_THRESHOLD, action))
                continue

            if session_current_connection != 0:
                logger.info("linux_auto_stop_instance: {} currently has active DCV sessions".format(instance_id))
                continue

            # Real UTC time; relabelling local wall-clock time as UTC skews the
            # idle computation on hosts that do not run in UTC.
            current_time = datetime.now(timezone.utc).replace(microsecond=0)
            if (last_dcv_disconnect + timedelta(hours=stop_instance_after)) >= current_time:
                logger.info("linux_auto_stop_instance: {} NOT ready for {}. Last access time {}".format(instance_id, action, last_dcv_disconnect))
                continue

            logger.info("linux_auto_stop_instance: {} is ready for {}. Last access time {}".format(instance_id, action, last_dcv_disconnect))
            _linux_auto_stop_stop_instance(instance_id, action)