azurelinuxagent/agent.py (311 lines of code) (raw):

# Microsoft Azure Linux Agent # # Copyright 2018 Microsoft Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Requires Python 2.6+ and Openssl 1.0+ # """ Module agent """ from __future__ import print_function import json import os import re import subprocess import sys import threading from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.ga.cgroupcontroller import AGENT_LOG_COLLECTOR from azurelinuxagent.ga.cpucontroller import _CpuController from azurelinuxagent.ga.cgroupapi import create_cgroup_api, InvalidCgroupMountpointException from azurelinuxagent.ga.firewall_manager import FirewallManager import azurelinuxagent.common.conf as conf import azurelinuxagent.common.event as event import azurelinuxagent.common.logger as logger from azurelinuxagent.common.event import WALAEventOperation from azurelinuxagent.common.future import ustr from azurelinuxagent.ga.logcollector import LogCollector, OUTPUT_RESULTS_FILE_PATH from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.utils import fileutil, textutil from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.version import AGENT_NAME, AGENT_LONG_VERSION, AGENT_VERSION, \ DISTRO_NAME, DISTRO_VERSION, \ PY_VERSION_MAJOR, PY_VERSION_MINOR, \ PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION, \ get_daemon_version, set_daemon_version from azurelinuxagent.ga.collect_logs import CollectLogsHandler, get_log_collector_monitor_handler from azurelinuxagent.pa.provision.default import ProvisionHandler class AgentCommands(object): """ This is the list of all commands that the Linux Guest Agent supports """ DeprovisionUser = "deprovision+user" Deprovision = "deprovision" Daemon = "daemon" Start = "start" RegisterService = "register-service" RunExthandlers = "run-exthandlers" Version = "version" ShowConfig = "show-configuration" Help = "help" CollectLogs = "collect-logs" SetupFirewall = "setup-firewall" Provision = "provision" class Agent(object): def __init__(self, verbose, conf_file_path=None): """ Initialize agent running environment. """ self.conf_file_path = conf_file_path self.osutil = get_osutil() # Init stdout log level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO logger.add_logger_appender(logger.AppenderType.STDOUT, level) # Init config conf_file_path = self.conf_file_path \ if self.conf_file_path is not None \ else self.osutil.get_agent_conf_file_path() conf.load_conf_from_file(conf_file_path) # Init log verbose = verbose or conf.get_logs_verbose() level = logger.LogLevel.VERBOSE if verbose else logger.LogLevel.INFO logger.add_logger_appender(logger.AppenderType.FILE, level, path=conf.get_agent_log_file()) # echo the log to /dev/console if the machine will be provisioned if conf.get_logs_console() and not ProvisionHandler.is_provisioned(): self.__add_console_appender(level) if event.send_logs_to_telemetry(): logger.add_logger_appender(logger.AppenderType.TELEMETRY, logger.LogLevel.WARNING, path=event.add_log_event) ext_log_dir = conf.get_ext_log_dir() try: if os.path.isfile(ext_log_dir): raise Exception("{0} is a file".format(ext_log_dir)) if not os.path.isdir(ext_log_dir): fileutil.mkdir(ext_log_dir, mode=0o755, owner=self.osutil.get_root_username()) except Exception as e: logger.error( "Exception occurred while creating extension " "log directory {0}: {1}".format(ext_log_dir, e)) # Init event reporter # Note that the reporter is not fully initialized here yet. Some telemetry fields are filled with data # originating from the goal state or IMDS, which requires a WireProtocol instance. Once a protocol # has been established, those fields must be explicitly initialized using # initialize_event_logger_vminfo_common_parameters(). Any events created before that initialization # will contain dummy values on those fields. event.init_event_status(conf.get_lib_dir()) event_dir = os.path.join(conf.get_lib_dir(), event.EVENTS_DIRECTORY) event.init_event_logger(event_dir) event.enable_unhandled_err_dump("WALA") def __add_console_appender(self, level): logger.add_logger_appender(logger.AppenderType.CONSOLE, level, path="/dev/console") def daemon(self): """ Run agent daemon """ set_daemon_version(AGENT_VERSION) logger.set_prefix("Daemon") threading.current_thread().name = "Daemon" child_args = None \ if self.conf_file_path is None \ else "-configuration-path:{0}".format(self.conf_file_path) from azurelinuxagent.daemon import get_daemon_handler daemon_handler = get_daemon_handler() daemon_handler.run(child_args=child_args) def provision(self): """ Run provision command """ from azurelinuxagent.pa.provision import get_provision_handler provision_handler = get_provision_handler() provision_handler.run() def deprovision(self, force=False, deluser=False): """ Run deprovision command """ from azurelinuxagent.pa.deprovision import get_deprovision_handler deprovision_handler = get_deprovision_handler() deprovision_handler.run(force=force, deluser=deluser) def register_service(self): """ Register agent as a service """ print("Register {0} service".format(AGENT_NAME)) self.osutil.register_agent_service() print("Stop {0} service".format(AGENT_NAME)) self.osutil.stop_agent_service() print("Start {0} service".format(AGENT_NAME)) self.osutil.start_agent_service() def run_exthandlers(self, debug=False): """ Run the update and extension handler """ logger.set_prefix("ExtHandler") threading.current_thread().name = "ExtHandler" # # Agents < 2.2.53 used to echo the log to the console. Since the extension handler could have been started by # one of those daemons, output a message indicating that output to the console will stop, otherwise users # may think that the agent died if they noticed that output to the console stops abruptly. # # Feel free to remove this code if telemetry shows there are no more agents <= 2.2.53 in the field. # if conf.get_logs_console() and get_daemon_version() < FlexibleVersion("2.2.53"): self.__add_console_appender(logger.LogLevel.INFO) try: logger.info(u"The agent will now check for updates and then will process extensions. Output to /dev/console will be suspended during those operations.") finally: logger.disable_console_output() from azurelinuxagent.ga.update import get_update_handler update_handler = get_update_handler() update_handler.run(debug) def show_configuration(self): configuration = conf.get_configuration() for k in sorted(configuration.keys()): print("{0} = {1}".format(k, configuration[k])) def collect_logs(self, is_full_mode): logger.set_prefix("LogCollector") if is_full_mode: logger.info("Running log collector mode full") else: logger.info("Running log collector mode normal") LogCollector.initialize_telemetry() # Check the cgroups unit log_collector_monitor = None tracked_controllers = [] if CollectLogsHandler.is_enabled_monitor_cgroups_check(): try: cgroup_api = create_cgroup_api() logger.info("Using cgroup {0} for resource enforcement and monitoring".format(cgroup_api.get_cgroup_version())) except InvalidCgroupMountpointException as e: event.warn(WALAEventOperation.LogCollection, "The agent does not support cgroups if the default systemd mountpoint is not being used: {0}", ustr(e)) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) except CGroupsException as e: event.warn(WALAEventOperation.LogCollection, "Unable to determine which cgroup version to use: {0}", ustr(e)) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) log_collector_cgroup = cgroup_api.get_process_cgroup(process_id="self", cgroup_name=AGENT_LOG_COLLECTOR) tracked_controllers = log_collector_cgroup.get_controllers() for controller in tracked_controllers: logger.info("{0} controller for cgroup: {1}".format(controller.get_controller_type(), controller)) if len(tracked_controllers) != len(log_collector_cgroup.get_supported_controller_names()): event.warn(WALAEventOperation.LogCollection, "At least one required controller is missing. The following controllers are required for the log collector to run: {0}", log_collector_cgroup.get_supported_controller_names()) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) expected_slice = cgroupconfigurator.LOGCOLLECTOR_SLICE if not log_collector_cgroup.check_in_expected_slice(expected_slice): event.warn(WALAEventOperation.LogCollection, "The Log Collector process is not in the proper cgroups. Expected slice: {0}", expected_slice) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) try: log_collector = LogCollector(is_full_mode) # Running log collector resource monitoring only if agent starts the log collector. # If Log collector start by any other means, then it will not be monitored. if CollectLogsHandler.is_enabled_monitor_cgroups_check(): for controller in tracked_controllers: if isinstance(controller, _CpuController): controller.initialize_cpu_usage() break log_collector_monitor = get_log_collector_monitor_handler(tracked_controllers) log_collector_monitor.run() archive, total_uncompressed_size = log_collector.collect_logs_and_get_archive() logger.info("Log collection successfully completed. Archive can be found at {0} " "and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH)) if log_collector_monitor is not None: log_collector_monitor.stop() try: metrics_summary = log_collector_monitor.get_max_recorded_metrics() metrics_summary['Total Uncompressed File Size (B)'] = total_uncompressed_size msg = json.dumps(metrics_summary) logger.info(msg) event.add_event(op=event.WALAEventOperation.LogCollection, message=msg, log_event=False) except Exception as e: msg = "An error occurred while reporting log collector resource usage summary: {0}".format(ustr(e)) logger.warn(msg) event.add_event(op=event.WALAEventOperation.LogCollection, is_success=False, message=msg, log_event=False) except Exception as e: logger.error("Log collection completed unsuccessfully. Error: {0}".format(ustr(e))) logger.info("Detailed log output can be found at {0}".format(OUTPUT_RESULTS_FILE_PATH)) sys.exit(1) finally: if log_collector_monitor is not None: log_collector_monitor.stop() @staticmethod def setup_firewall(endpoint): logger.set_prefix("Firewall") threading.current_thread().name = "Firewall" event.info(event.WALAEventOperation.Firewall, "Setting up firewall after boot. Endpoint: {0}", ustr(endpoint)) try: firewall_manager = FirewallManager.create(endpoint) firewall_manager.setup() event.info(event.WALAEventOperation.Firewall, "Successfully set the firewall rules") except Exception as error: event.error(event.WALAEventOperation.Firewall, "Unable to add firewall rules. Error: {0}", ustr(error)) sys.exit(1) def main(args=None): """ Parse command line arguments, exit with usage() on error. Invoke different methods according to different command """ if args is None: args = [] if len(args) <= 0: args = sys.argv[1:] command, force, verbose, debug, conf_file_path, log_collector_full_mode, firewall_endpoint = parse_args(args) if command == AgentCommands.Version: version() elif command == AgentCommands.Help: print(usage()) elif command == AgentCommands.Start: start(conf_file_path=conf_file_path) else: try: agent = Agent(verbose, conf_file_path=conf_file_path) if command == AgentCommands.DeprovisionUser: agent.deprovision(force, deluser=True) elif command == AgentCommands.Deprovision: agent.deprovision(force, deluser=False) elif command == AgentCommands.Provision: agent.provision() elif command == AgentCommands.RegisterService: agent.register_service() elif command == AgentCommands.Daemon: agent.daemon() elif command == AgentCommands.RunExthandlers: agent.run_exthandlers(debug) elif command == AgentCommands.ShowConfig: agent.show_configuration() elif command == AgentCommands.CollectLogs: agent.collect_logs(log_collector_full_mode) elif command == AgentCommands.SetupFirewall: agent.setup_firewall(firewall_endpoint) except Exception as e: logger.error(u"Failed to run '{0}': {1}", command, textutil.format_exception(e)) def parse_args(sys_args): """ Parse command line arguments """ cmd = AgentCommands.Help force = False verbose = False debug = False conf_file_path = None log_collector_full_mode = False endpoint = None regex_cmd_format = "^([-/]*){0}" for arg in sys_args: if arg == "": # Don't parse an empty parameter continue m = re.match(r"^(?:[-/]*)configuration-path:([\w/\.\-_]+)", arg) if not m is None: conf_file_path = m.group(1) if not os.path.exists(conf_file_path): print("Error: Configuration file {0} does not exist".format( conf_file_path), file=sys.stderr) print(usage()) sys.exit(1) elif re.match("^([-/]*)deprovision\\+user", arg): cmd = AgentCommands.DeprovisionUser elif re.match(regex_cmd_format.format(AgentCommands.Deprovision), arg): cmd = AgentCommands.Deprovision elif re.match(regex_cmd_format.format(AgentCommands.Daemon), arg): cmd = AgentCommands.Daemon elif re.match(regex_cmd_format.format(AgentCommands.Start), arg): cmd = AgentCommands.Start elif re.match(regex_cmd_format.format(AgentCommands.RegisterService), arg): cmd = AgentCommands.RegisterService elif re.match(regex_cmd_format.format(AgentCommands.RunExthandlers), arg): cmd = AgentCommands.RunExthandlers elif re.match(regex_cmd_format.format(AgentCommands.Version), arg): cmd = AgentCommands.Version elif re.match(regex_cmd_format.format("verbose"), arg): verbose = True elif re.match(regex_cmd_format.format("debug"), arg): debug = True elif re.match(regex_cmd_format.format("force"), arg): force = True elif re.match(regex_cmd_format.format(AgentCommands.ShowConfig), arg): cmd = AgentCommands.ShowConfig elif re.match("^([-/]*)(help|usage|\\?)", arg): cmd = AgentCommands.Help elif re.match(regex_cmd_format.format(AgentCommands.CollectLogs), arg): cmd = AgentCommands.CollectLogs elif re.match(regex_cmd_format.format("full"), arg): log_collector_full_mode = True else: regex_cmd = regex_cmd_format.format("{0}=(?P<endpoint>[\\d.]{{7,}})".format(AgentCommands.SetupFirewall)) match = re.match(regex_cmd, arg) if match is not None: cmd = AgentCommands.SetupFirewall endpoint = match.group('endpoint') else: cmd = AgentCommands.Help break return cmd, force, verbose, debug, conf_file_path, log_collector_full_mode, endpoint def version(): """ Show agent version """ print(("{0} running on {1} {2}".format(AGENT_LONG_VERSION, DISTRO_NAME, DISTRO_VERSION))) print("Python: {0}.{1}.{2}".format(PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)) print("Goal state agent: {0}".format(GOAL_STATE_AGENT_VERSION)) def usage(): """ Return agent usage message """ s = "\n" s += ("usage: {0} [-verbose] [-force] [-help] " "-configuration-path:<path to configuration file>" "-deprovision[+user]|-register-service|-version|-daemon|-start|" "-run-exthandlers|-show-configuration|-collect-logs [-full]|-setup-firewall=<IP>]" "").format(sys.argv[0]) s += "\n" return s def start(conf_file_path=None): """ Start agent daemon in a background process and set stdout/stderr to /dev/null """ args = [sys.argv[0], '-daemon'] if conf_file_path is not None: args.append('-configuration-path:{0}'.format(conf_file_path)) with open(os.devnull, 'w') as devnull: subprocess.Popen(args, stdout=devnull, stderr=devnull) if __name__ == '__main__' : main()