AzureMonitorAgent/agent.py

#!/usr/bin/env python # # AzureMonitoringLinuxAgent Extension # # Copyright 2021 Microsoft Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function import sys import os import os.path import datetime import signal import pwd import grp import re import filecmp import stat import traceback import time import platform import subprocess import json import base64 import inspect import shutil import re import hashlib import fileinput import contextlib import ama_tst.modules.install.supported_distros as supported_distros from collections import OrderedDict from hashlib import sha256 from shutil import copyfile from shutil import copytree from shutil import rmtree from threading import Thread import telegraf_utils.telegraf_config_handler as telhandler import metrics_ext_utils.metrics_constants as metrics_constants import metrics_ext_utils.metrics_ext_handler as me_handler import metrics_ext_utils.metrics_common_utils as metrics_utils try: import urllib.request as urllib # Python 3+ except ImportError: import urllib2 as urllib # Python 2 try: from urllib.parse import urlparse # Python 3+ except ImportError: from urlparse import urlparse # Python 2 try: import urllib.error as urlerror # Python 3+ except ImportError: import urllib2 as urlerror # Python 2 # python shim can only make IMDS calls which shouldn't go through proxy try: urllib.getproxies = lambda x = None: {} except Exception as e: print('Resetting proxies failed with error: {0}'.format(e)) try: from Utils.WAAgentUtil import waagent import Utils.HandlerUtil as HUtil except Exception as e: # These utils have checks around the use of them; this is not an exit case print('Importing utils failed with error: {0}'.format(e)) # This code is taken from the omsagent's extension wrapper. # This same monkey patch fix is relevant for AMA extension as well. # This monkey patch duplicates the one made in the waagent import above. # It is necessary because on 2.6, the waagent monkey patch appears to be overridden # by the python-future subprocess.check_output backport. if sys.version_info < (2,7): def check_output(*popenargs, **kwargs): r"""Backport from subprocess module from python 2.7""" if 'stdout' in kwargs: raise ValueError('stdout argument not allowed, it will be overridden.') process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) output, unused_err = process.communicate() retcode = process.poll() if retcode: cmd = kwargs.get("args") if cmd is None: cmd = popenargs[0] raise subprocess.CalledProcessError(retcode, cmd, output=output) return output # Exception classes used by this module. class CalledProcessError(Exception): def __init__(self, returncode, cmd, output=None): self.returncode = returncode self.cmd = cmd self.output = output def __str__(self): return "Command '%s' returned non-zero exit status %d" % (self.cmd, self.returncode) subprocess.check_output = check_output subprocess.CalledProcessError = CalledProcessError # Global Variables PackagesDirectory = 'packages' # The BundleFileName values will be replaced by actual values in the release pipeline. See apply_version.sh. BundleFileNameDeb = 'azuremonitoragent.deb' BundleFileNameRpm = 'azuremonitoragent.rpm' BundleFileName = '' TelegrafBinName = 'telegraf' InitialRetrySleepSeconds = 30 PackageManager = '' PackageManagerOptions = '' MdsdCounterJsonPath = '/etc/opt/microsoft/azuremonitoragent/config-cache/metricCounters.json' FluentCfgPath = '/etc/opt/microsoft/azuremonitoragent/config-cache/fluentbit/td-agent.conf' AMASyslogConfigMarkerPath = '/etc/opt/microsoft/azuremonitoragent/config-cache/syslog.marker' AMASyslogPortFilePath = '/etc/opt/microsoft/azuremonitoragent/config-cache/syslog.port' AMAFluentPortFilePath = '/etc/opt/microsoft/azuremonitoragent/config-cache/fluent.port' PreviewFeaturesDirectory = '/etc/opt/microsoft/azuremonitoragent/config-cache/previewFeatures/' ArcSettingsFile = '/var/opt/azcmagent/localconfig.json' AMAAstTransformConfigMarkerPath = '/etc/opt/microsoft/azuremonitoragent/config-cache/agenttransform.marker' AMAExtensionLogRotateFilePath = '/etc/logrotate.d/azuremonitoragentextension' SupportedArch = set(['x86_64', 'aarch64']) # Error codes GenericErrorCode = 1 UnsupportedOperatingSystem = 51 IndeterminateOperatingSystem = 51 MissingorInvalidParameterErrorCode = 53 DPKGOrRPMLockedErrorCode = 56 MissingDependency = 52 # Settings GenevaConfigKey = "genevaConfiguration" AzureMonitorConfigKey = "azureMonitorConfiguration" # Configuration HUtilObject = None SettingsSequenceNumber = None HandlerEnvironment = None SettingsDict = None def main(): """ Main method Parse out operation from argument, invoke the operation, and finish. """ init_waagent_logger() waagent_log_info('Azure Monitoring Agent for Linux started to handle.') # Determine the operation being executed operation = None try: option = sys.argv[1] if re.match('^([-/]*)(disable)', option): operation = 'Disable' elif re.match('^([-/]*)(uninstall)', option): operation = 'Uninstall' elif re.match('^([-/]*)(install)', option): operation = 'Install' elif re.match('^([-/]*)(enable)', option): operation = 'Enable' elif re.match('^([-/]*)(update)', option): operation = 'Update' elif re.match('^([-/]*)(metrics)', option): operation = 'Metrics' elif re.match('^([-/]*)(syslogconfig)', option): operation = 'Syslogconfig' elif re.match('^([-/]*)(transformconfig)', option): operation = 'Transformconfig' except Exception as e: waagent_log_error(str(e)) if operation is None: log_and_exit('Unknown', GenericErrorCode, 'No valid operation provided') # Set up for exit code and any error messages exit_code = 0 message = '{0} succeeded'.format(operation) # Avoid entering broken state where manual purge actions are necessary in low disk space scenario destructive_operations = ['Disable', 'Uninstall'] if operation not in destructive_operations: exit_code = check_disk_space_availability() if exit_code != 0: message = '{0} failed due to low disk space'.format(operation) log_and_exit(operation, exit_code, message) # Invoke operation try: global HUtilObject HUtilObject = parse_context(operation) exit_code, output = operations[operation]() # Exit code 1 indicates a general problem that doesn't have a more # specific error code; it often indicates a missing dependency if exit_code == 1 and operation == 'Install': message = 'Install failed with exit code 1. For error details, check logs ' \ 'in /var/log/azure/Microsoft.Azure.Monitor' \ '.AzureMonitorLinuxAgent' elif exit_code is DPKGOrRPMLockedErrorCode and operation == 'Install': message = 'Install failed with exit code {0} because the ' \ 'package manager on the VM is currently locked: ' \ 'please wait and try again'.format(DPKGOrRPMLockedErrorCode) elif exit_code != 0: message = '{0} failed with exit code {1} {2}'.format(operation, exit_code, output) except AzureMonitorAgentForLinuxException as e: exit_code = e.error_code message = e.get_error_message(operation) except Exception as e: exit_code = GenericErrorCode message = '{0} failed with error: {1}\n' \ 'Stacktrace: {2}'.format(operation, e, traceback.format_exc()) # Finish up and log messages log_and_exit(operation, exit_code, message) def check_disk_space_availability(): """ Check if there is the required space on the machine. """ try: if get_free_space_mb("/var") < 500 or get_free_space_mb("/etc") < 500 or get_free_space_mb("/opt") < 500 : # 52 is the exit code for missing dependency i.e. disk space # https://github.com/Azure/azure-marketplace/wiki/Extension-Build-Notes-Best-Practices#error-codes-and-messages-output-to-stderr return MissingDependency else: return 0 except: print('Failed to check disk usage.') return 0 def get_free_space_mb(dirname): """ Get the free space in MB in the directory path. """ st = os.statvfs(dirname) return (st.f_bavail * st.f_frsize) // (1024 * 1024) def is_systemd(): """ Check if the system is using systemd """ return os.path.isdir("/run/systemd/system") def get_service_command(service, *operations): """ Get the appropriate service command [sequence] for the provided service name and operation(s) """ if is_systemd(): return " && ".join(["systemctl {0} {1}".format(operation, service) for operation in operations]) else: hutil_log_info("The VM doesn't have systemctl. Using the init.d service to start {0}.".format(service)) return '/etc/init.d/{0} {1}'.format(service, operations[0]) def check_kill_process(pstring): for line in os.popen("ps ax | grep " + pstring + " | grep -v grep"): fields = line.split() pid = fields[0] os.kill(int(pid), signal.SIGKILL) def compare_and_copy_bin(src, dest): # Check if previous file exist at the location, compare the two binaries, # If the files are not same, remove the older file, and copy the new one # If they are the same, then we ignore it and don't copy if os.path.isfile(src ): if os.path.isfile(dest): if not filecmp.cmp(src, dest): # Removing the file in case it is already being run in a process, # in which case we can get an error "text file busy" while copying os.remove(dest) copyfile(src, dest) else: # No previous binary exist, simply copy it and make it executable copyfile(src, dest) os.chmod(dest, stat.S_IXGRP | stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXOTH | stat.S_IROTH) def copy_amacoreagent_binaries(): current_arch = platform.machine() amacoreagent_bin_local_path = os.getcwd() + "/amaCoreAgentBin/amacoreagent_" + current_arch amacoreagent_bin = "/opt/microsoft/azuremonitoragent/bin/amacoreagent" compare_and_copy_bin(amacoreagent_bin_local_path, amacoreagent_bin) if current_arch == 'x86_64': libgrpc_bin_local_path = os.getcwd() + "/amaCoreAgentBin/libgrpc_csharp_ext.x86_64.so" libgrpc_bin = "/opt/microsoft/azuremonitoragent/bin/libgrpc_csharp_ext.x86_64.so" compare_and_copy_bin(libgrpc_bin_local_path, libgrpc_bin) liblz4x64_bin_local_path = os.getcwd() + "/amaCoreAgentBin/liblz4x64.so" liblz4x64_bin = "/opt/microsoft/azuremonitoragent/bin/liblz4x64.so" compare_and_copy_bin(liblz4x64_bin_local_path, liblz4x64_bin) elif current_arch == 'aarch64': libgrpc_bin_local_path = os.getcwd() + "/amaCoreAgentBin/libgrpc_csharp_ext.arm64.so" libgrpc_bin = "/opt/microsoft/azuremonitoragent/bin/libgrpc_csharp_ext.arm64.so" compare_and_copy_bin(libgrpc_bin_local_path, libgrpc_bin) agentlauncher_bin_local_path = os.getcwd() + "/agentLauncherBin/agentlauncher_" + current_arch agentlauncher_bin = "/opt/microsoft/azuremonitoragent/bin/agentlauncher" compare_and_copy_bin(agentlauncher_bin_local_path, agentlauncher_bin) def copy_mdsd_fluentbit_binaries(): current_arch = platform.machine() mdsd_bin_local_path = os.getcwd() + "/mdsdBin/mdsd_" + current_arch mdsdmgr_bin_local_path = os.getcwd() + "/mdsdBin/mdsdmgr_" + current_arch fluentbit_bin_local_path = os.getcwd() + "/fluentBitBin/fluent-bit_" + current_arch mdsd_bin = "/opt/microsoft/azuremonitoragent/bin/mdsd" mdsdmgr_bin = "/opt/microsoft/azuremonitoragent/bin/mdsdmgr" fluentbit_bin = "/opt/microsoft/azuremonitoragent/bin/fluent-bit" # copy the required libs to our test directory first copytree("/opt/microsoft/azuremonitoragent/lib", os.getcwd() + "/lib") canUseSharedmdsd, _ = run_command_and_log('ldd ' + mdsd_bin_local_path + ' | grep "not found"') canUseSharedmdsdmgr, _ = run_command_and_log('ldd ' + mdsdmgr_bin_local_path + ' | grep "not found"') if canUseSharedmdsd != 0 and canUseSharedmdsdmgr != 0: compare_and_copy_bin(mdsd_bin_local_path, mdsd_bin) compare_and_copy_bin(mdsdmgr_bin_local_path, mdsdmgr_bin) canUseSharedfluentbit, _ = run_command_and_log('ldd ' + fluentbit_bin_local_path + ' | grep "not found"') if canUseSharedfluentbit != 0: compare_and_copy_bin(fluentbit_bin_local_path, fluentbit_bin) rmtree(os.getcwd() + "/lib") def install(): """ Ensure that this VM distro and version are supported. Install the Azure Monitor Linux Agent package, using retries. Note: install operation times out from WAAgent at 15 minutes, so do not wait longer. """ exit_if_vm_not_supported('Install') find_package_manager("Install") set_os_arch('Install') vm_dist, vm_ver = find_vm_distro('Install') # Check if Debian 12 VMs have rsyslog package (required for AMA 1.31+) if (vm_dist.startswith('debian')) and vm_ver.startswith('12'): check_rsyslog, _ = run_command_and_log("dpkg -s rsyslog") if check_rsyslog != 0: hutil_log_info("'rsyslog' package missing from Debian 12 machine, installing to allow AMA to run.") rsyslog_exit_code, rsyslog_output = run_command_and_log("DEBIAN_FRONTEND=noninteractive apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y rsyslog") if rsyslog_exit_code != 0: return rsyslog_exit_code, rsyslog_output # Check if Amazon 2023 VMs have rsyslog package (required for AMA 1.31+) if (vm_dist.startswith('amzn')) and vm_ver.startswith('2023'): check_rsyslog, _ = run_command_and_log("dnf list installed | grep rsyslog.x86_64") if check_rsyslog != 0: hutil_log_info("'rsyslog' package missing from Amazon Linux 2023 machine, installing to allow AMA to run.") rsyslog_exit_code, rsyslog_output = run_command_and_log("dnf install -y rsyslog") if rsyslog_exit_code != 0: return rsyslog_exit_code, rsyslog_output package_directory = os.path.join(os.getcwd(), PackagesDirectory) bundle_path = os.path.join(package_directory, BundleFileName) os.chmod(bundle_path, 100) print(PackageManager, " and ", BundleFileName) AMAInstallCommand = "{0} {1} -i {2}".format(PackageManager, PackageManagerOptions, bundle_path) hutil_log_info('Running command "{0}"'.format(AMAInstallCommand)) # Retry, since install can fail due to concurrent package operations exit_code, output = run_command_with_retries_output(AMAInstallCommand, retries = 15, retry_check = retry_if_dpkg_or_rpm_locked, final_check = final_check_if_dpkg_or_rpm_locked) # Retry install for aarch64 rhel8 VMs as initial install fails to create symlink to /etc/systemd/system/azuremonitoragent.service # in /etc/systemd/system/multi-user.target.wants/azuremonitoragent.service if vm_dist.replace(' ','').lower().startswith('redhat') and vm_ver == '8.6' and platform.machine() == 'aarch64': exit_code, output = run_command_with_retries_output(AMAInstallCommand, retries = 15, retry_check = retry_if_dpkg_or_rpm_locked, final_check = final_check_if_dpkg_or_rpm_locked) if exit_code != 0: return exit_code, output # System daemon reload is required for systemd to pick up the new service exit_code, output = run_command_and_log("systemctl daemon-reload") if exit_code != 0: return exit_code, output # Copy the AMACoreAgent and agentlauncher binaries copy_amacoreagent_binaries() # Copy KqlExtension binaries # Needs to be revisited for aarch64 copy_kqlextension_binaries() # Copy mdsd and fluent-bit with OpenSSL dynamically linked if is_feature_enabled('useDynamicSSL'): # Check if they have libssl.so.1.1 since AMA is built against this version libssl1_1, _ = run_command_and_log('ldconfig -p | grep libssl.so.1.1') if libssl1_1 == 0: copy_mdsd_fluentbit_binaries() # Set task limits to max of 65K in suse 12 # Based on Task 9764411: AMA broken after 1.7 in sles 12 - https://dev.azure.com/msazure/One/_workitems/edit/9764411 if exit_code == 0: vm_dist, _ = find_vm_distro('Install') if (vm_dist.startswith('suse') or vm_dist.startswith('sles')): try: suse_exit_code, suse_output = run_command_and_log("mkdir -p /etc/systemd/system/azuremonitoragent.service.d") if suse_exit_code != 0: return suse_exit_code, suse_output suse_exit_code, suse_output = run_command_and_log("echo '[Service]' > /etc/systemd/system/azuremonitoragent.service.d/override.conf") if suse_exit_code != 0: return suse_exit_code, suse_output suse_exit_code, suse_output = run_command_and_log("echo 'TasksMax=65535' >> /etc/systemd/system/azuremonitoragent.service.d/override.conf") if suse_exit_code != 0: return suse_exit_code, suse_output suse_exit_code, suse_output = run_command_and_log("systemctl daemon-reload") if suse_exit_code != 0: return suse_exit_code, suse_output except: log_and_exit("install", MissingorInvalidParameterErrorCode, "Failed to update /etc/systemd/system/azuremonitoragent.service.d for suse 12,15" ) return exit_code, output def uninstall(): """ Uninstall the Azure Monitor Linux Agent. This is a somewhat soft uninstall. It is not a purge. Note: uninstall operation times out from WAAgent at 5 minutes """ exit_if_vm_not_supported('Uninstall') find_package_manager("Uninstall") AMAUninstallCommand = "" if PackageManager == "dpkg": AMAUninstallCommand = "dpkg -P azuremonitoragent" elif PackageManager == "rpm": AMAUninstallCommand = "rpm -e azuremonitoragent" else: log_and_exit("Uninstall", UnsupportedOperatingSystem, "The OS has neither rpm nor dpkg" ) hutil_log_info('Running command "{0}"'.format(AMAUninstallCommand)) remove_localsyslog_configs() # remove the logrotate config if os.path.exists(AMAExtensionLogRotateFilePath): try: os.remove(AMAExtensionLogRotateFilePath) except Exception as ex: output = 'Logrotate removal failed with error: {0}\n' \ 'Stacktrace: {1}'.format(ex, traceback.format_exc()) hutil_log_info(output) # Retry, since uninstall can fail due to concurrent package operations try: exit_code, output = run_command_with_retries_output(AMAUninstallCommand, retries = 4, retry_check = retry_if_dpkg_or_rpm_locked, final_check = final_check_if_dpkg_or_rpm_locked) except Exception as ex: exit_code = GenericErrorCode output = 'Uninstall failed with error: {0}\n' \ 'Stacktrace: {1}'.format(ex, traceback.format_exc()) return exit_code, output def enable(): """ Start the Azure Monitor Linux Agent Service This call will return non-zero or throw an exception if the settings provided are incomplete or incorrect. Note: enable operation times out from WAAgent at 5 minutes """ public_settings, protected_settings = get_settings() exit_if_vm_not_supported('Enable') ensure = OrderedDict([ ("azuremonitoragent", False), ("azuremonitoragentmgr", False) ]) # Set traceFlags in publicSettings to enable mdsd tracing. For example, the EventIngest flag can be enabled via "traceFlags": "0x2" flags = "" if public_settings is not None and "traceFlags" in public_settings: flags = "-T {} ".format(public_settings.get("traceFlags")) # Use an Ordered Dictionary to ensure MDSD_OPTIONS (and other dependent variables) are written after their dependencies default_configs = OrderedDict([ ("MDSD_CONFIG_DIR", "/etc/opt/microsoft/azuremonitoragent"), ("MDSD_LOG_DIR", "/var/opt/microsoft/azuremonitoragent/log"), ("MDSD_ROLE_PREFIX", "/run/azuremonitoragent/default"), ("MDSD_SPOOL_DIRECTORY", "/var/opt/microsoft/azuremonitoragent"), ("MDSD_OPTIONS", "\"{}-A -R -c /etc/opt/microsoft/azuremonitoragent/mdsd.xml -d -r $MDSD_ROLE_PREFIX -S $MDSD_SPOOL_DIRECTORY/eh -L $MDSD_SPOOL_DIRECTORY/events\"".format(flags)), ("MDSD_USE_LOCAL_PERSISTENCY", "true"), ("MDSD_TCMALLOC_RELEASE_FREQ_SEC", "1"), ("MONITORING_USE_GENEVA_CONFIG_SERVICE", "false"), ("ENABLE_MCS", "false") ]) ssl_cert_var_name, ssl_cert_var_value = get_ssl_cert_info('Enable') default_configs[ssl_cert_var_name] = ssl_cert_var_value """ Decide the mode and configuration. There are two supported configuration schema, mix-and-match between schemas is disallowed: Legacy: allows one of [MCS, GCS single tenant, or GCS multi tenant ("Auto-Config")] modes Next-Generation: allows MCS, GCS multi tenant, or both """ is_gcs_single_tenant = False GcsEnabled, McsEnabled = get_control_plane_mode() # Next-generation schema if public_settings is not None and (public_settings.get(GenevaConfigKey) or public_settings.get(AzureMonitorConfigKey)): geneva_configuration = public_settings.get(GenevaConfigKey) azure_monitor_configuration = public_settings.get(AzureMonitorConfigKey) # Check for mix-and match of next-generation and legacy schema content if len(public_settings) > 1 and ((geneva_configuration and not azure_monitor_configuration) or (azure_monitor_configuration and not geneva_configuration)): log_and_exit("Enable", MissingorInvalidParameterErrorCode, 'Mixing genevaConfiguration or azureMonitorConfiguration with other configuration schemas is not allowed') if geneva_configuration and geneva_configuration.get("enable") == True: hutil_log_info("Detected Geneva+ mode; azuremonitoragentmgr service will be started to handle Geneva tenants") ensure["azuremonitoragentmgr"] = True if azure_monitor_configuration and azure_monitor_configuration.get("enable") == True: hutil_log_info("Detected Azure Monitor+ mode; azuremonitoragent service will be started to handle Azure Monitor tenant") ensure["azuremonitoragent"] = True azure_monitor_public_settings = azure_monitor_configuration.get("configuration") azure_monitor_protected_settings = protected_settings.get(AzureMonitorConfigKey) if protected_settings is not None else None handle_mcs_config(azure_monitor_public_settings, azure_monitor_protected_settings, default_configs) # Legacy schema elif public_settings is not None and public_settings.get("GCS_AUTO_CONFIG") == True: hutil_log_info("Detected Auto-Config mode; azuremonitoragentmgr service will be started to handle Geneva tenants") ensure["azuremonitoragentmgr"] = True elif (protected_settings is None or len(protected_settings) == 0) or (public_settings is not None and "proxy" in public_settings and "mode" in public_settings.get("proxy") and public_settings.get("proxy").get("mode") == "application"): hutil_log_info("Detected Azure Monitor mode; azuremonitoragent service will be started to handle Azure Monitor configuration") ensure["azuremonitoragent"] = True handle_mcs_config(public_settings, protected_settings, default_configs) else: hutil_log_info("Detected Geneva mode; azuremonitoragent service will be started to handle Geneva configuration") ensure["azuremonitoragent"] = True is_gcs_single_tenant = True handle_gcs_config(public_settings, protected_settings, default_configs) # generate local syslog configuration files as in auto config syslog is not driven from DCR # Note that internally AMCS with geneva config path can be used in which case syslog should be handled same way as default 1P # generate local syslog configuration files as in 1P syslog is not driven from DCR if GcsEnabled: generate_localsyslog_configs(uses_gcs=True, uses_mcs=McsEnabled) config_file = "/etc/default/azuremonitoragent" temp_config_file = "/etc/default/azuremonitoragent_temp" try: if os.path.isfile(config_file): new_config = "\n".join(["export {0}={1}".format(key, value) for key, value in default_configs.items()]) + "\n" with open(temp_config_file, "w") as f: f.write(new_config) if not os.path.isfile(temp_config_file): log_and_exit("Enable", GenericErrorCode, "Error while updating environment variables in {0}".format(config_file)) os.remove(config_file) os.rename(temp_config_file, config_file) else: log_and_exit("Enable", GenericErrorCode, "Could not find the file {0}".format(config_file)) except Exception as e: log_and_exit("Enable", GenericErrorCode, "Failed to add environment variables to {0}: {1}".format(config_file, e)) if "ENABLE_MCS" in default_configs and default_configs["ENABLE_MCS"] == "true": # enable processes for Custom Logs ensure["azuremonitor-agentlauncher"] = True ensure["azuremonitor-coreagent"] = True # start the metrics, agent transform and syslog watchers only in 3P mode start_metrics_process() start_syslogconfig_process() elif ensure.get("azuremonitoragentmgr") or is_gcs_single_tenant: # In GCS scenarios, ensure that AMACoreAgent is running ensure["azuremonitor-coreagent"] = True hutil_log_info('Handler initiating onboarding.') if HUtilObject and HUtilObject.is_seq_smaller(): # Either upgrade has just happened (in which case we need to start), or enable was called with no change to extension config hutil_log_info("Current sequence number, " + HUtilObject._context._seq_no + ", is not greater than the LKG sequence number. Starting service(s) only if it is not yet running.") operations = ["start", "enable"] else: # Either this is a clean install (in which case restart is effectively start), or extension config has changed hutil_log_info("Current sequence number, " + HUtilObject._context._seq_no + ", is greater than the LKG sequence number. Restarting service(s) to pick up the new config.") operations = ["restart", "enable"] output = "" # Ensure non-required services are not running; do not block if this step fails for service in [s for s in ensure.keys() if not ensure[s]]: exit_code, disable_output = run_command_and_log(get_service_command(service, "stop", "disable")) output += disable_output for service in [s for s in ensure.keys() if ensure[s]]: exit_code, enable_output = run_command_and_log(get_service_command(service, *operations)) output += enable_output if exit_code != 0: status_command = get_service_command(service, "status") status_exit_code, status_output = run_command_and_log(status_command) if status_exit_code != 0: output += "Output of '{0}':\n{1}".format(status_command, status_output) return exit_code, output if platform.machine() != 'aarch64': if "ENABLE_MCS" in default_configs and default_configs["ENABLE_MCS"] == "true": # start/enable kql extension only in 3P mode and non aarch64 kql_start_code, kql_output = run_command_and_log(get_service_command("azuremonitor-kqlextension", *operations)) output += kql_output # do not block if kql start fails # start transformation config watcher process start_transformconfig_process() # Service(s) were successfully configured and started; increment sequence number HUtilObject.save_seq() return exit_code, output def handle_gcs_config(public_settings, protected_settings, default_configs): """ Populate the defaults for legacy-path GCS mode """ # look for LA protected settings for var in list(protected_settings.keys()): if "_key" in var or "_id" in var: default_configs[var] = protected_settings.get(var) # check if required GCS params are available MONITORING_GCS_CERT_CERTFILE = None if "certificate" in protected_settings: MONITORING_GCS_CERT_CERTFILE = base64.standard_b64decode(protected_settings.get("certificate")) if "certificatePath" in protected_settings: try: with open(protected_settings.get("certificatePath"), 'r') as f: MONITORING_GCS_CERT_CERTFILE = f.read() except Exception as ex: log_and_exit('Enable', MissingorInvalidParameterErrorCode, 'Failed to read certificate {0}: {1}'.format(protected_settings.get("certificatePath"), ex)) MONITORING_GCS_CERT_KEYFILE = None if "certificateKey" in protected_settings: MONITORING_GCS_CERT_KEYFILE = base64.standard_b64decode(protected_settings.get("certificateKey")) if "certificateKeyPath" in protected_settings: try: with open(protected_settings.get("certificateKeyPath"), 'r') as f: MONITORING_GCS_CERT_KEYFILE = f.read() except Exception as ex: log_and_exit('Enable', MissingorInvalidParameterErrorCode, 'Failed to read certificate key {0}: {1}'.format(protected_settings.get("certificateKeyPath"), ex)) MONITORING_GCS_ENVIRONMENT = "" if "monitoringGCSEnvironment" in protected_settings: MONITORING_GCS_ENVIRONMENT = protected_settings.get("monitoringGCSEnvironment") MONITORING_GCS_NAMESPACE = "" if "namespace" in protected_settings: MONITORING_GCS_NAMESPACE = protected_settings.get("namespace") MONITORING_GCS_ACCOUNT = "" if "monitoringGCSAccount" in protected_settings: MONITORING_GCS_ACCOUNT = protected_settings.get("monitoringGCSAccount") MONITORING_GCS_REGION = "" if "monitoringGCSRegion" in protected_settings: MONITORING_GCS_REGION = protected_settings.get("monitoringGCSRegion") MONITORING_CONFIG_VERSION = "" if "configVersion" in protected_settings: MONITORING_CONFIG_VERSION = protected_settings.get("configVersion") MONITORING_GCS_AUTH_ID_TYPE = "" if "monitoringGCSAuthIdType" in protected_settings: MONITORING_GCS_AUTH_ID_TYPE = protected_settings.get("monitoringGCSAuthIdType") MONITORING_GCS_AUTH_ID = "" if "monitoringGCSAuthId" in protected_settings: MONITORING_GCS_AUTH_ID = protected_settings.get("monitoringGCSAuthId") MONITORING_TENANT = "" if "monitoringTenant" in protected_settings: MONITORING_TENANT = protected_settings.get("monitoringTenant") MONITORING_ROLE = "" if "monitoringRole" in protected_settings: MONITORING_ROLE = protected_settings.get("monitoringRole") MONITORING_ROLE_INSTANCE = "" if "monitoringRoleInstance" in protected_settings: MONITORING_ROLE_INSTANCE = protected_settings.get("monitoringRoleInstance") if ((MONITORING_GCS_CERT_CERTFILE is None or MONITORING_GCS_CERT_KEYFILE is None) and (MONITORING_GCS_AUTH_ID_TYPE == "")) or MONITORING_GCS_ENVIRONMENT == "" or MONITORING_GCS_NAMESPACE == "" or MONITORING_GCS_ACCOUNT == "" or MONITORING_GCS_REGION == "" or MONITORING_CONFIG_VERSION == "": log_and_exit("Enable", MissingorInvalidParameterErrorCode, 'Not all required GCS parameters are provided') else: # set the values for GCS default_configs["MONITORING_USE_GENEVA_CONFIG_SERVICE"] = "true" default_configs["MONITORING_GCS_ENVIRONMENT"] = MONITORING_GCS_ENVIRONMENT default_configs["MONITORING_GCS_NAMESPACE"] = MONITORING_GCS_NAMESPACE default_configs["MONITORING_GCS_ACCOUNT"] = MONITORING_GCS_ACCOUNT default_configs["MONITORING_GCS_REGION"] = MONITORING_GCS_REGION default_configs["MONITORING_CONFIG_VERSION"] = MONITORING_CONFIG_VERSION # write the certificate and key to disk uid = pwd.getpwnam("syslog").pw_uid gid = grp.getgrnam("syslog").gr_gid if MONITORING_GCS_AUTH_ID_TYPE != "": default_configs["MONITORING_GCS_AUTH_ID_TYPE"] = MONITORING_GCS_AUTH_ID_TYPE if MONITORING_GCS_AUTH_ID != "": default_configs["MONITORING_GCS_AUTH_ID"] = MONITORING_GCS_AUTH_ID if MONITORING_GCS_CERT_CERTFILE is not None: default_configs["MONITORING_GCS_CERT_CERTFILE"] = "/etc/opt/microsoft/azuremonitoragent/gcscert.pem" with open("/etc/opt/microsoft/azuremonitoragent/gcscert.pem", "wb") as f: f.write(MONITORING_GCS_CERT_CERTFILE) os.chown("/etc/opt/microsoft/azuremonitoragent/gcscert.pem", uid, gid) os.system('chmod {1} {0}'.format("/etc/opt/microsoft/azuremonitoragent/gcscert.pem", 400)) if MONITORING_GCS_CERT_KEYFILE is not None: default_configs["MONITORING_GCS_CERT_KEYFILE"] = "/etc/opt/microsoft/azuremonitoragent/gcskey.pem" with open("/etc/opt/microsoft/azuremonitoragent/gcskey.pem", "wb") as f: f.write(MONITORING_GCS_CERT_KEYFILE) os.chown("/etc/opt/microsoft/azuremonitoragent/gcskey.pem", uid, gid) os.system('chmod {1} {0}'.format("/etc/opt/microsoft/azuremonitoragent/gcskey.pem", 400)) if MONITORING_TENANT != "": default_configs["MONITORING_TENANT"] = MONITORING_TENANT if MONITORING_ROLE != "": default_configs["MONITORING_ROLE"] = MONITORING_ROLE if MONITORING_TENANT != "": default_configs["MONITORING_ROLE_INSTANCE"] = MONITORING_ROLE_INSTANCE def handle_mcs_config(public_settings, protected_settings, default_configs): """ Populate the defaults for MCS mode """ default_configs["ENABLE_MCS"] = "true" default_configs["PA_GIG_BRIDGE_MODE"] = "true" # April 2022: PA_FLUENT_SOCKET_PORT setting is being deprecated in place of PA_DATA_PORT. Remove when AMA 1.17 and earlier no longer need servicing. default_configs["PA_FLUENT_SOCKET_PORT"] = "13005" # this port will be dynamic in future default_configs["PA_DATA_PORT"] = "13005" proxySet = False # fetch proxy settings if public_settings is not None and "proxy" in public_settings and "mode" in public_settings.get("proxy") and public_settings.get("proxy").get("mode") == "application": default_configs["MDSD_PROXY_MODE"] = "application" if "address" in public_settings.get("proxy"): default_configs["MDSD_PROXY_ADDRESS"] = public_settings.get("proxy").get("address") else: log_and_exit("Enable", MissingorInvalidParameterErrorCode, 'Parameter "address" is required in proxy public setting') if "auth" in public_settings.get("proxy") and public_settings.get("proxy").get("auth") == True: if protected_settings is not None and "proxy" in protected_settings and "username" in protected_settings.get("proxy") and "password" in protected_settings.get("proxy"): default_configs["MDSD_PROXY_USERNAME"] = protected_settings.get("proxy").get("username") default_configs["MDSD_PROXY_PASSWORD"] = protected_settings.get("proxy").get("password") set_proxy(default_configs["MDSD_PROXY_ADDRESS"], default_configs["MDSD_PROXY_USERNAME"], default_configs["MDSD_PROXY_PASSWORD"]) proxySet = True else: log_and_exit("Enable", MissingorInvalidParameterErrorCode, 'Parameter "username" and "password" not in proxy protected setting') else: set_proxy(default_configs["MDSD_PROXY_ADDRESS"], "", "") proxySet = True # is this Arc? If so, check for proxy if os.path.isfile(ArcSettingsFile): f = open(ArcSettingsFile, "r") data = f.read() if (data != ''): json_data = json.loads(data) BypassProxy = False if json_data is not None and "proxy.bypass" in json_data: bypass = json_data["proxy.bypass"] # proxy.bypass is an array if "AMA" in bypass: BypassProxy = True if not BypassProxy and json_data is not None and "proxy.url" in json_data: url = json_data["proxy.url"] # only non-authenticated proxy config is supported if url != '': default_configs["MDSD_PROXY_ADDRESS"] = url set_proxy(default_configs["MDSD_PROXY_ADDRESS"], "", "") proxySet = True if not proxySet: unset_proxy() # set arc autonomous endpoints az_environment, _ = get_azure_environment_and_region() if az_environment == me_handler.ArcACloudName: try: _, mcs_endpoint = me_handler.get_arca_endpoints_from_himds() except Exception as ex: log_and_exit("Enable", MissingorInvalidParameterErrorCode, 'Failed to get Arc autonomous endpoints. {0}'.format(ex)) default_configs["customRegionalEndpoint"] = mcs_endpoint default_configs["customGlobalEndpoint"] = mcs_endpoint default_configs["customResourceEndpoint"] = "https://monitoring.azs" # add managed identity settings if they were provided identifier_name, identifier_value, error_msg = get_managed_identity() if error_msg: log_and_exit("Enable", MissingorInvalidParameterErrorCode, 'Failed to determine managed identity settings. {0}.'.format(error_msg)) if identifier_name and identifier_value: default_configs["MANAGED_IDENTITY"] = "{0}#{1}".format(identifier_name, identifier_value) def get_control_plane_mode(): """ Identify which control plane is in use """ public_settings, protected_settings = get_settings() GcsEnabled = False McsEnabled = False if public_settings is not None and (public_settings.get(GenevaConfigKey) or public_settings.get(AzureMonitorConfigKey)): geneva_configuration = public_settings.get(GenevaConfigKey) azure_monitor_configuration = public_settings.get(AzureMonitorConfigKey) if geneva_configuration and geneva_configuration.get("enable") == True: GcsEnabled = True if azure_monitor_configuration and azure_monitor_configuration.get("enable") == True: McsEnabled = True # Legacy schema elif public_settings is not None and public_settings.get("GCS_AUTO_CONFIG") == True: GcsEnabled = True elif (protected_settings is None or len(protected_settings) == 0) or (public_settings is not None and "proxy" in public_settings and "mode" in public_settings.get("proxy") and public_settings.get("proxy").get("mode") == "application"): McsEnabled = True else: GcsEnabled = True return GcsEnabled, McsEnabled def disable(): """ Disable Azure Monitor Linux Agent process on the VM. Note: disable operation times out from WAAgent at 15 minutes """ #stop the metrics process stop_metrics_process() #stop syslog config watcher process stop_syslogconfig_process() #stop agent transform config watcher process stop_transformconfig_process() # stop amacoreagent and agent launcher hutil_log_info('Handler initiating Core Agent and agent launcher') if is_systemd(): exit_code, output = run_command_and_log('systemctl stop azuremonitor-coreagent && systemctl disable azuremonitor-coreagent') exit_code, output = run_command_and_log('systemctl stop azuremonitor-agentlauncher && systemctl disable azuremonitor-agentlauncher') # in case AL is not cleaning up properly check_kill_process('/opt/microsoft/azuremonitoragent/bin/fluent-bit') # Stop and disable systemd services so they are not started after system reboot. for service in ["azuremonitoragent", "azuremonitoragentmgr"]: exit_code, output = run_command_and_log(get_service_command(service, "stop", "disable")) if exit_code != 0: status_command = get_service_command(service, "status") status_exit_code, status_output = run_command_and_log(status_command) if status_exit_code != 0: output += "Output of '{0}':\n{1}".format(status_command, status_output) if platform.machine() != 'aarch64': # stop kql extensionso that is not started after system reboot. Do not block if it fails. kql_exit_code, disable_output = run_command_and_log(get_service_command("azuremonitor-kqlextension", "stop", "disable")) if kql_exit_code != 0: status_command = get_service_command("azuremonitor-kqlextension", "status") kql_exit_code, kql_status_output = run_command_and_log(status_command) hutil_log_info(kql_status_output) return exit_code, output def update(): """ Update the current installation of AzureMonitorLinuxAgent No logic to install the agent as agent -> install() will be called with update because upgradeMode = "UpgradeWithInstall" set in HandlerManifest """ return 0, "" def restart_launcher(): # start agent launcher hutil_log_info('Handler initiating agent launcher') if is_systemd(): exit_code, output = run_command_and_log('systemctl restart azuremonitor-agentlauncher && systemctl enable azuremonitor-agentlauncher') def restart_kqlextension(): # start agent transformation extension process hutil_log_info('Handler initiating agent transformation extension (KqlExtension) restart and enable') if is_systemd(): exit_code, output = run_command_and_log('systemctl restart azuremonitor-kqlextension && systemctl enable azuremonitor-kqlextension') def set_proxy(address, username, password): """ # Set proxy http_proxy env var in dependent services """ try: http_proxy = address address = address.replace("http://","") if username: http_proxy = "http://" + username + ":" + password + "@" + address # Update Coreagent run_command_and_log("mkdir -p /etc/systemd/system/azuremonitor-coreagent.service.d") run_command_and_log("echo '[Service]' > /etc/systemd/system/azuremonitor-coreagent.service.d/proxy.conf") run_command_and_log("echo 'Environment=\"http_proxy={0}\"' >> /etc/systemd/system/azuremonitor-coreagent.service.d/proxy.conf".format(http_proxy)) run_command_and_log("echo 'Environment=\"https_proxy={0}\"' >> /etc/systemd/system/azuremonitor-coreagent.service.d/proxy.conf".format(http_proxy)) os.system('chmod {1} {0}'.format("/etc/systemd/system/azuremonitor-coreagent.service.d/proxy.conf", 400)) # Update ME run_command_and_log("mkdir -p /etc/systemd/system/metrics-extension.service.d") run_command_and_log("echo '[Service]' > /etc/systemd/system/metrics-extension.service.d/proxy.conf") run_command_and_log("echo 'Environment=\"http_proxy={0}\"' >> /etc/systemd/system/metrics-extension.service.d/proxy.conf".format(http_proxy)) run_command_and_log("echo 'Environment=\"https_proxy={0}\"' >> /etc/systemd/system/metrics-extension.service.d/proxy.conf".format(http_proxy)) os.system('chmod {1} {0}'.format("/etc/systemd/system/metrics-extension.service.d/proxy.conf", 400)) run_command_and_log("systemctl daemon-reload") run_command_and_log('systemctl restart azuremonitor-coreagent') run_command_and_log('systemctl restart metrics-extension') except: log_and_exit("enable", MissingorInvalidParameterErrorCode, "Failed to update /etc/systemd/system/azuremonitor-coreagent.service.d and /etc/systemd/system/metrics-extension.service.d" ) def unset_proxy(): """ # Unset proxy http_proxy env var in dependent services """ try: hasSettings=False # Update Coreagent if os.path.exists("/etc/systemd/system/azuremonitor-coreagent.service.d/proxy.conf"): os.remove("/etc/systemd/system/azuremonitor-coreagent.service.d/proxy.conf") hasSettings=True # Update ME if os.path.exists("/etc/systemd/system/metrics-extension.service.d/proxy.conf"): os.remove("/etc/systemd/system/metrics-extension.service.d/proxy.conf") hasSettings=True if hasSettings: run_command_and_log("systemctl daemon-reload") run_command_and_log('systemctl restart azuremonitor-coreagent') run_command_and_log('systemctl restart metrics-extension') except: log_and_exit("enable", MissingorInvalidParameterErrorCode, "Failed to remove /etc/systemd/system/azuremonitor-coreagent.service.d and /etc/systemd/system/metrics-extension.service.d" ) def get_managed_identity(): """ # Determine Managed Identity (MI) settings # Nomenclature: Managed System Identity (MSI), System-Assigned Identity (SAI), User-Assigned Identity (UAI) # Unspecified MI scenario: MSI returns SAI token if exists, otherwise returns UAI token if exactly one UAI exists, otherwise failure # Specified MI scenario: MSI returns token for specified MI # Returns identifier_name, identifier_value, and error message (if any) """ identifier_name = identifier_value = "" public_settings, _ = get_settings() if public_settings is not None and public_settings.get(AzureMonitorConfigKey): azure_monitor_configuration = public_settings.get(AzureMonitorConfigKey) if azure_monitor_configuration and azure_monitor_configuration.get("enable") == True: public_settings = azure_monitor_configuration.get("configuration") if public_settings is not None and "authentication" in public_settings and "managedIdentity" in public_settings.get("authentication"): managedIdentity = public_settings.get("authentication").get("managedIdentity") if "identifier-name" not in managedIdentity or "identifier-value" not in managedIdentity: return identifier_name, identifier_value, 'Parameters "identifier-name" and "identifier-value" are both required in authentication.managedIdentity public setting' identifier_name = managedIdentity.get("identifier-name") identifier_value = managedIdentity.get("identifier-value") if identifier_name not in ["client_id", "mi_res_id", "object_id"]: return identifier_name, identifier_value, 'Invalid identifier-name provided; must be "client_id" or "mi_res_id" or "object_id"' if not identifier_value: return identifier_name, identifier_value, 'Invalid identifier-value provided; cannot be empty' if identifier_name in ["object_id", "client_id"]: guid_re = re.compile(r'[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}') if not guid_re.search(identifier_value): return identifier_name, identifier_value, 'Invalid identifier-value provided for {0}; must be a GUID'.format(identifier_name) return identifier_name, identifier_value, "" def stop_metrics_process(): if telhandler.is_running(is_lad=False): #Stop the telegraf and ME services tel_out, tel_msg = telhandler.stop_telegraf_service(is_lad=False) if tel_out: hutil_log_info(tel_msg) else: hutil_log_error(tel_msg) #Delete the telegraf and ME services tel_rm_out, tel_rm_msg = telhandler.remove_telegraf_service(is_lad=False) if tel_rm_out: hutil_log_info(tel_rm_msg) else: hutil_log_error(tel_rm_msg) if me_handler.is_running(is_lad=False): me_out, me_msg = me_handler.stop_metrics_service(is_lad=False) if me_out: hutil_log_info(me_msg) else: hutil_log_error(me_msg) me_rm_out, me_rm_msg = me_handler.remove_metrics_service(is_lad=False) if me_rm_out: hutil_log_info(me_rm_msg) else: hutil_log_error(me_rm_msg) pids_filepath = os.path.join(os.getcwd(),'amametrics.pid') # kill existing metrics watcher if os.path.exists(pids_filepath): with open(pids_filepath, "r") as f: for pid in f.readlines(): # Verify the pid actually belongs to AMA metrics watcher. cmd_file = os.path.join("/proc", str(pid.strip("\n")), "cmdline") if os.path.exists(cmd_file): with open(cmd_file, "r") as pidf: cmdline = pidf.readlines() if len(cmdline) > 0 and cmdline[0].find("agent.py") >= 0 and cmdline[0].find("-metrics") >= 0: kill_cmd = "kill " + pid run_command_and_log(kill_cmd) run_command_and_log("rm "+pids_filepath) def stop_syslogconfig_process(): pids_filepath = os.path.join(os.getcwd(),'amasyslogconfig.pid') # kill existing syslog config watcher if os.path.exists(pids_filepath): with open(pids_filepath, "r") as f: for pid in f.readlines(): # Verify the pid actually belongs to AMA syslog watcher. cmd_file = os.path.join("/proc", str(pid.strip("\n")), "cmdline") if os.path.exists(cmd_file): with open(cmd_file, "r") as pidf: cmdline = pidf.readlines() if len(cmdline) > 0 and cmdline[0].find("agent.py") >= 0 and cmdline[0].find("-syslogconfig") >= 0: kill_cmd = "kill " + pid run_command_and_log(kill_cmd) run_command_and_log("rm "+ pids_filepath) def is_metrics_process_running(): pids_filepath = os.path.join(os.getcwd(),'amametrics.pid') if os.path.exists(pids_filepath): with open(pids_filepath, "r") as f: for pid in f.readlines(): # Verify the pid actually belongs to AMA metrics watcher. cmd_file = os.path.join("/proc", str(pid.strip("\n")), "cmdline") if os.path.exists(cmd_file): with open(cmd_file, "r") as pidf: cmdline = pidf.readlines() if len(cmdline) > 0 and cmdline[0].find("agent.py") >= 0 and cmdline[0].find("-metrics") >= 0: return True return False def is_syslogconfig_process_running(): pids_filepath = os.path.join(os.getcwd(),'amasyslogconfig.pid') if os.path.exists(pids_filepath): with open(pids_filepath, "r") as f: for pid in f.readlines(): # Verify the pid actually belongs to AMA syslog watcher. cmd_file = os.path.join("/proc", str(pid.strip("\n")), "cmdline") if os.path.exists(cmd_file): with open(cmd_file, "r") as pidf: cmdline = pidf.readlines() if len(cmdline) > 0 and cmdline[0].find("agent.py") >= 0 and cmdline[0].find("-syslogconfig") >= 0: return True return False def is_transformconfig_process_running(): pids_filepath = os.path.join(os.getcwd(),'amatransformconfig.pid') if os.path.exists(pids_filepath): with open(pids_filepath, "r") as f: for pid in f.readlines(): # Verify the pid actually belongs to AMA transform config watcher. cmd_file = os.path.join("/proc", str(pid.strip("\n")), "cmdline") if os.path.exists(cmd_file): with open(cmd_file, "r") as pidf: cmdline = pidf.readlines() if len(cmdline) > 0 and cmdline[0].find("agent.py") >= 0 and cmdline[0].find("-transformconfig") >= 0: return True return False def start_metrics_process(): """ Start metrics process that performs periodic monitoring activities :return: None """ # if metrics process is already running, it should manage lifecycle of telegraf, ME, # process to refresh ME MSI token and look for new config changes if counters change, etc, so this is no-op if not is_metrics_process_running(): stop_metrics_process() # Start metrics watcher ama_path = os.path.join(os.getcwd(), 'agent.py') args = [sys.executable, ama_path, '-metrics'] log = open(os.path.join(os.getcwd(), 'daemon.log'), 'w') hutil_log_info('start watcher process '+str(args)) subprocess.Popen(args, stdout=log, stderr=log) def start_syslogconfig_process(): """ Start syslog check process that performs periodic DCR monitoring activities and looks for syslog config changes :return: None """ # test if not is_syslogconfig_process_running(): stop_syslogconfig_process() # Start syslog config watcher ama_path = os.path.join(os.getcwd(), 'agent.py') args = [sys.executable, ama_path, '-syslogconfig'] log = open(os.path.join(os.getcwd(), 'daemon.log'), 'w') hutil_log_info('start syslog watcher process '+str(args)) subprocess.Popen(args, stdout=log, stderr=log) def start_transformconfig_process(): """ Start agent transform check process that performs periodic DCR monitoring activities and looks for agent transformation config changes :return: None """ # test if not is_transformconfig_process_running(): stop_transformconfig_process() # Start agent transform config watcher ama_path = os.path.join(os.getcwd(), 'agent.py') args = [sys.executable, ama_path, '-transformconfig'] log = open(os.path.join(os.getcwd(), 'daemon.log'), 'w') hutil_log_info('start agent transform config watcher process '+str(args)) subprocess.Popen(args, stdout=log, stderr=log) def stop_transformconfig_process(): pids_filepath = os.path.join(os.getcwd(),'amatransformconfig.pid') # kill existing agent transform config watcher if os.path.exists(pids_filepath): with open(pids_filepath, "r") as f: for pid in f.readlines(): # Verify the pid actually belongs to AMA tranform config watcher. cmd_file = os.path.join("/proc", str(pid.strip("\n")), "cmdline") if os.path.exists(cmd_file): with open(cmd_file, "r") as pidf: cmdline = pidf.readlines() if len(cmdline) > 0 and cmdline[0].find("agent.py") >= 0 and cmdline[0].find("-transformconfig") >= 0: kill_cmd = "kill " + pid run_command_and_log(kill_cmd) run_command_and_log("rm "+ pids_filepath) def metrics_watcher(hutil_error, hutil_log): """ Watcher thread to monitor metric configuration changes and to take action on them """ # Check every 30 seconds sleepTime = 30 # Retrieve managed identity info that may be needed for token retrieval identifier_name, identifier_value, error_msg = get_managed_identity() if error_msg: hutil_error('Failed to determine managed identity settings; MSI token retreival will rely on default identity, if any. {0}.'.format(error_msg)) if identifier_name and identifier_value: managed_identity_str = "uai#{0}#{1}".format(identifier_name, identifier_value) else: managed_identity_str = "sai" # Sleep before starting the monitoring time.sleep(sleepTime) last_crc = None last_crc_fluent = None me_msi_token_expiry_epoch = None while True: try: # update fluent config for fluent port if needed fluent_port = '' if os.path.isfile(AMAFluentPortFilePath): f = open(AMAFluentPortFilePath, "r") fluent_port = f.read() f.close() if fluent_port != '' and os.path.isfile(FluentCfgPath): portSetting = " Port " + fluent_port + "\n" defaultPortSetting = 'Port' portUpdated = True with open(FluentCfgPath, 'r') as f: for line in f: found = re.search(r'^\s{0,}Port\s{1,}' + fluent_port + '$', line) if found: portUpdated = False if portUpdated == True: with contextlib.closing(fileinput.FileInput(FluentCfgPath, inplace=True, backup='.bak')) as file: for line in file: if defaultPortSetting in line: print(portSetting, end='') else: print(line, end='') os.chmod(FluentCfgPath, stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH) # add SELinux rules if needed if os.path.exists('/etc/selinux/config') and fluent_port != '': sedisabled, _ = run_command_and_log('getenforce | grep -i "Disabled"',log_cmd=False, log_output=False) if sedisabled != 0: check_semanage, _ = run_command_and_log("which semanage",log_cmd=False, log_output=False) if check_semanage == 0: fluentPortEnabled, _ = run_command_and_log('grep -Rnw /var/lib/selinux -e http_port_t | grep ' + fluent_port,log_cmd=False, log_output=False) if fluentPortEnabled != 0: # also check SELinux config paths for Oracle/RH fluentPortEnabled, _ = run_command_and_log('grep -Rnw /etc/selinux -e http_port_t | grep ' + fluent_port,log_cmd=False, log_output=False) if fluentPortEnabled != 0: # allow the fluent port in SELinux run_command_and_log('semanage port -a -t http_port_t -p tcp ' + fluent_port,log_cmd=False, log_output=False) if os.path.isfile(FluentCfgPath): f = open(FluentCfgPath, "r") data = f.read() if (data != ''): crc_fluent = hashlib.sha256(data.encode('utf-8')).hexdigest() if (crc_fluent != last_crc_fluent): restart_launcher() last_crc_fluent = crc_fluent if os.path.isfile(MdsdCounterJsonPath): f = open(MdsdCounterJsonPath, "r") data = f.read() if (data != ''): json_data = json.loads(data) if len(json_data) == 0: last_crc = hashlib.sha256(data.encode('utf-8')).hexdigest() if telhandler.is_running(is_lad=False): # Stop the telegraf and ME services tel_out, tel_msg = telhandler.stop_telegraf_service(is_lad=False) if tel_out: hutil_log(tel_msg) else: hutil_error(tel_msg) # Delete the telegraf and ME services tel_rm_out, tel_rm_msg = telhandler.remove_telegraf_service(is_lad=False) if tel_rm_out: hutil_log(tel_rm_msg) else: hutil_error(tel_rm_msg) if me_handler.is_running(is_lad=False): me_out, me_msg = me_handler.stop_metrics_service(is_lad=False) if me_out: hutil_log(me_msg) else: hutil_error(me_msg) me_rm_out, me_rm_msg = me_handler.remove_metrics_service(is_lad=False) if me_rm_out: hutil_log(me_rm_msg) else: hutil_error(me_rm_msg) else: crc = hashlib.sha256(data.encode('utf-8')).hexdigest() if(crc != last_crc): # Resetting the me_msi_token_expiry_epoch variable if we set up ME again. me_msi_token_expiry_epoch = None hutil_log("Start processing metric configuration") hutil_log(data) telegraf_config, telegraf_namespaces = telhandler.handle_config( json_data, "unix:///run/azuremonitoragent/mdm_influxdb.socket", "unix:///run/azuremonitoragent/default_influx.socket", is_lad=False) me_service_template_path = os.getcwd() + "/services/metrics-extension.service" if os.path.exists(me_service_template_path): os.remove(me_service_template_path) if is_feature_enabled("enableCMV2"): copyfile(os.getcwd() + "/services/metrics-extension-otlp.service", me_service_template_path) else: copyfile(os.getcwd() + "/services/metrics-extension-cmv1.service", me_service_template_path) me_handler.setup_me(is_lad=False, managed_identity=managed_identity_str, HUtilObj=HUtilObject) start_telegraf_res, log_messages = telhandler.start_telegraf(is_lad=False) if start_telegraf_res: hutil_log("Successfully started metrics-sourcer.") else: hutil_error(log_messages) start_metrics_out, log_messages = me_handler.start_metrics(is_lad=False, managed_identity=managed_identity_str) if start_metrics_out: hutil_log("Successfully started metrics-extension.") else: hutil_error(log_messages) last_crc = crc generate_token = False me_token_path = os.path.join(os.getcwd(), "/config/metrics_configs/AuthToken-MSI.json") if me_msi_token_expiry_epoch is None or me_msi_token_expiry_epoch == "": if os.path.isfile(me_token_path): with open(me_token_path, "r") as f: authtoken_content = f.read() if authtoken_content and "expires_on" in authtoken_content: me_msi_token_expiry_epoch = authtoken_content["expires_on"] else: generate_token = True else: generate_token = True if me_msi_token_expiry_epoch: currentTime = datetime.datetime.now() token_expiry_time = datetime.datetime.fromtimestamp(int(me_msi_token_expiry_epoch)) if token_expiry_time - currentTime < datetime.timedelta(minutes=30): # The MSI Token will expire within 30 minutes. We need to refresh the token generate_token = True if generate_token: generate_token = False msi_token_generated, me_msi_token_expiry_epoch, log_messages = me_handler.generate_MSI_token(identifier_name, identifier_value, is_lad=False) if msi_token_generated: hutil_log("Successfully refreshed metrics-extension MSI Auth token.") else: hutil_error(log_messages) telegraf_restart_retries = 0 me_restart_retries = 0 max_restart_retries = 10 # Check if telegraf is running, if not, then restart if not telhandler.is_running(is_lad=False): if telegraf_restart_retries < max_restart_retries: telegraf_restart_retries += 1 hutil_log("Telegraf binary process is not running. Restarting telegraf now. Retry count - {0}".format(telegraf_restart_retries)) tel_out, tel_msg = telhandler.stop_telegraf_service(is_lad=False) if tel_out: hutil_log(tel_msg) else: hutil_error(tel_msg) start_telegraf_res, log_messages = telhandler.start_telegraf(is_lad=False) if start_telegraf_res: hutil_log("Successfully started metrics-sourcer.") else: hutil_error(log_messages) else: hutil_error("Telegraf binary process is not running. Failed to restart after {0} retries. Please check telegraf.log".format(max_restart_retries)) else: telegraf_restart_retries = 0 # Check if ME is running, if not, then restart if not me_handler.is_running(is_lad=False): if me_restart_retries < max_restart_retries: me_restart_retries += 1 hutil_log("MetricsExtension binary process is not running. Restarting MetricsExtension now. Retry count - {0}".format(me_restart_retries)) me_out, me_msg = me_handler.stop_metrics_service(is_lad=False) if me_out: hutil_log(me_msg) else: hutil_error(me_msg) start_metrics_out, log_messages = me_handler.start_metrics(is_lad=False, managed_identity=managed_identity_str) if start_metrics_out: hutil_log("Successfully started metrics-extension.") else: hutil_error(log_messages) else: hutil_error("MetricsExtension binary process is not running. Failed to restart after {0} retries. Please check /var/log/syslog for ME logs".format(max_restart_retries)) else: me_restart_retries = 0 except IOError as e: hutil_error('I/O error in setting up or monitoring metrics. Exception={0}'.format(e)) except Exception as e: hutil_error('Error in setting up or monitoring metrics. Exception={0}'.format(e)) finally: time.sleep(sleepTime) def syslogconfig_watcher(hutil_error, hutil_log): """ Watcher thread to monitor syslog configuration changes and to take action on them """ syslog_enabled = False # Check for config changes every 30 seconds sleepTime = 30 # Sleep before starting the monitoring time.sleep(sleepTime) GcsEnabled, McsEnabled = get_control_plane_mode() while True: try: if os.path.isfile(AMASyslogConfigMarkerPath): f = open(AMASyslogConfigMarkerPath, "r") data = f.read() if (data != ''): if "true" in data: syslog_enabled = True f.close() elif GcsEnabled: # 1P Syslog is always enabled as each tenant could be having different mdsd.xml configuration syslog_enabled = True if syslog_enabled: # place syslog local configs syslog_enabled = False generate_localsyslog_configs(uses_gcs=GcsEnabled, uses_mcs=McsEnabled) else: # remove syslog local configs remove_localsyslog_configs() except IOError as e: hutil_error('I/O error in setting up syslog config watcher. Exception={0}'.format(e)) except Exception as e: hutil_error('Error in setting up syslog config watcher. Exception={0}'.format(e)) finally: time.sleep(sleepTime) def transformconfig_watcher(hutil_error, hutil_log): """ Watcher thread to monitor agent transformation configuration changes and to take action on them """ # Check for config changes every 30 seconds sleepTime = 30 # Sleep before starting the monitoring time.sleep(sleepTime) last_crc = None while True: try: if os.path.isfile(AMAAstTransformConfigMarkerPath): f = open(AMAAstTransformConfigMarkerPath, "r") data = f.read() if (data != ''): crc = hashlib.sha256(data.encode('utf-8')).hexdigest() if (crc != last_crc): restart_kqlextension() last_crc = crc f.close() except IOError as e: hutil_error('I/O error in setting up agent transform config watcher. Exception={0}'.format(e)) except Exception as e: hutil_error('Error in setting up agent transform config watcher. Exception={0}'.format(e)) finally: time.sleep(sleepTime) def generate_localsyslog_configs(uses_gcs = False, uses_mcs = False): """ Install local syslog configuration files if not present and restart syslog """ # don't deploy any configuration if no control plane is configured if not uses_gcs and not uses_mcs: return public_settings, _ = get_settings() syslog_port = '' if os.path.isfile(AMASyslogPortFilePath): f = open(AMASyslogPortFilePath, "r") syslog_port = f.read() f.close() useSyslogTcp = False # always use syslog tcp port, unless # - the distro is Red Hat based and doesn't have semanage # these distros seem to have SELinux on by default and we shouldn't be installing semanage ourselves if not os.path.exists('/etc/selinux/config'): useSyslogTcp = True else: sedisabled, _ = run_command_and_log('getenforce | grep -i "Disabled"',log_cmd=False, log_output=False) if sedisabled == 0: useSyslogTcp = True else: check_semanage, _ = run_command_and_log("which semanage",log_cmd=False, log_output=False) if check_semanage == 0 and syslog_port != '': syslogPortEnabled, _ = run_command_and_log('grep -Rnw /var/lib/selinux -e syslogd_port_t | grep ' + syslog_port,log_cmd=False, log_output=False) if syslogPortEnabled != 0: # also check SELinux config paths for Oracle/RH syslogPortEnabled, _ = run_command_and_log('grep -Rnw /etc/selinux -e syslogd_port_t | grep ' + syslog_port,log_cmd=False, log_output=False) if syslogPortEnabled != 0: # allow the syslog port in SELinux run_command_and_log('semanage port -a -t syslogd_port_t -p tcp ' + syslog_port,log_cmd=False, log_output=False) useSyslogTcp = True # 1P tenants use omuxsock, so keep using that for customers using 1P if useSyslogTcp == True and syslog_port != '': if os.path.exists('/etc/rsyslog.d/'): restartRequired = False if uses_gcs and not os.path.exists('/etc/rsyslog.d/05-azuremonitoragent-loadomuxsock.conf'): copyfile("/etc/opt/microsoft/azuremonitoragent/syslog/rsyslogconf/05-azuremonitoragent-loadomuxsock.conf","/etc/rsyslog.d/05-azuremonitoragent-loadomuxsock.conf") restartRequired = True if not os.path.exists('/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf'): if os.path.exists('/etc/rsyslog.d/05-azuremonitoragent-loadomuxsock.conf'): os.remove("/etc/rsyslog.d/05-azuremonitoragent-loadomuxsock.conf") if os.path.exists('/etc/rsyslog.d/10-azuremonitoragent.conf'): os.remove("/etc/rsyslog.d/10-azuremonitoragent.conf") copyfile("/etc/opt/microsoft/azuremonitoragent/syslog/rsyslogconf/10-azuremonitoragent-omfwd.conf","/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf") os.chmod('/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf', stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH) restartRequired = True portSetting = 'Port="' + syslog_port + '"' defaultPortSetting = 'Port="28330"' portUpdated = False with open('/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf') as f: if portSetting not in f.read(): portUpdated = True if portUpdated == True: copyfile("/etc/opt/microsoft/azuremonitoragent/syslog/rsyslogconf/10-azuremonitoragent-omfwd.conf","/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf") with contextlib.closing(fileinput.FileInput('/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf', inplace=True, backup='.bak')) as file: for line in file: print(line.replace(defaultPortSetting, portSetting), end='') os.chmod('/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf', stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH) restartRequired = True if restartRequired == True: run_command_and_log(get_service_command("rsyslog", "restart")) hutil_log_info("Installed local syslog configuration files and restarted syslog") if os.path.exists('/etc/syslog-ng/syslog-ng.conf'): restartRequired = False if not os.path.exists('/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf'): if os.path.exists('/etc/syslog-ng/conf.d/azuremonitoragent.conf'): os.remove("/etc/syslog-ng/conf.d/azuremonitoragent.conf") syslog_ng_confpath = os.path.join('/etc/syslog-ng/', 'conf.d') if not os.path.exists(syslog_ng_confpath): os.makedirs(syslog_ng_confpath) copyfile("/etc/opt/microsoft/azuremonitoragent/syslog/syslog-ngconf/azuremonitoragent-tcp.conf","/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf") os.chmod('/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf', stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH) restartRequired = True portSetting = "port(" + syslog_port + ")" defaultPortSetting = "port(28330)" portUpdated = False with open('/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf') as f: if portSetting not in f.read(): portUpdated = True if portUpdated == True: copyfile("/etc/opt/microsoft/azuremonitoragent/syslog/syslog-ngconf/azuremonitoragent-tcp.conf","/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf") with contextlib.closing(fileinput.FileInput('/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf', inplace=True, backup='.bak')) as file: for line in file: print(line.replace(defaultPortSetting, portSetting), end='') os.chmod('/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf', stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH) restartRequired = True if restartRequired == True: run_command_and_log(get_service_command("syslog-ng", "restart")) hutil_log_info("Installed local syslog configuration files and restarted syslog") else: if os.path.exists('/etc/rsyslog.d/') and not os.path.exists('/etc/rsyslog.d/10-azuremonitoragent.conf'): if os.path.exists('/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf'): os.remove("/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf") copyfile("/etc/opt/microsoft/azuremonitoragent/syslog/rsyslogconf/05-azuremonitoragent-loadomuxsock.conf","/etc/rsyslog.d/05-azuremonitoragent-loadomuxsock.conf") copyfile("/etc/opt/microsoft/azuremonitoragent/syslog/rsyslogconf/10-azuremonitoragent.conf","/etc/rsyslog.d/10-azuremonitoragent.conf") os.chmod('/etc/rsyslog.d/05-azuremonitoragent-loadomuxsock.conf', stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH) os.chmod('/etc/rsyslog.d/10-azuremonitoragent.conf', stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH) run_command_and_log(get_service_command("rsyslog", "restart")) hutil_log_info("Installed local syslog configuration files and restarted syslog") if os.path.exists('/etc/syslog-ng/syslog-ng.conf') and not os.path.exists('/etc/syslog-ng/conf.d/azuremonitoragent.conf'): if os.path.exists('/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf'): os.remove("/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf") syslog_ng_confpath = os.path.join('/etc/syslog-ng/', 'conf.d') if not os.path.exists(syslog_ng_confpath): os.makedirs(syslog_ng_confpath) copyfile("/etc/opt/microsoft/azuremonitoragent/syslog/syslog-ngconf/azuremonitoragent.conf","/etc/syslog-ng/conf.d/azuremonitoragent.conf") os.chmod('/etc/syslog-ng/conf.d/azuremonitoragent.conf', stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH) run_command_and_log(get_service_command("syslog-ng", "restart")) hutil_log_info("Installed local syslog configuration files and restarted syslog") def remove_localsyslog_configs(): """ Remove local syslog configuration files if present and restart syslog """ if os.path.exists('/etc/rsyslog.d/10-azuremonitoragent.conf') or os.path.exists('/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf'): if os.path.exists('/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf'): os.remove("/etc/rsyslog.d/10-azuremonitoragent-omfwd.conf") if os.path.exists('/etc/rsyslog.d/05-azuremonitoragent-loadomuxsock.conf'): os.remove("/etc/rsyslog.d/05-azuremonitoragent-loadomuxsock.conf") if os.path.exists('/etc/rsyslog.d/10-azuremonitoragent.conf'): os.remove("/etc/rsyslog.d/10-azuremonitoragent.conf") run_command_and_log(get_service_command("rsyslog", "restart")) hutil_log_info("Removed local syslog configuration files if found and restarted syslog") if os.path.exists('/etc/syslog-ng/conf.d/azuremonitoragent.conf') or os.path.exists('/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf'): if os.path.exists('/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf'): os.remove("/etc/syslog-ng/conf.d/azuremonitoragent-tcp.conf") if os.path.exists('/etc/syslog-ng/conf.d/azuremonitoragent.conf'): os.remove("/etc/syslog-ng/conf.d/azuremonitoragent.conf") run_command_and_log(get_service_command("syslog-ng", "restart")) hutil_log_info("Removed local syslog configuration files if found and restarted syslog") def metrics(): """ Take care of setting up telegraf and ME for metrics if configuration is present """ pids_filepath = os.path.join(os.getcwd(), 'amametrics.pid') py_pid = os.getpid() with open(pids_filepath, 'w') as f: f.write(str(py_pid) + '\n') watcher_thread = Thread(target = metrics_watcher, args = [hutil_log_error, hutil_log_info]) watcher_thread.start() watcher_thread.join() return 0, "" def syslogconfig(): """ Take care of setting up syslog configuration change watcher """ pids_filepath = os.path.join(os.getcwd(), 'amasyslogconfig.pid') py_pid = os.getpid() with open(pids_filepath, 'w') as f: f.write(str(py_pid) + '\n') watcher_thread = Thread(target = syslogconfig_watcher, args = [hutil_log_error, hutil_log_info]) watcher_thread.start() watcher_thread.join() return 0, "" def transformconfig(): """ Take care of setting up agent transformation configuration change watcher """ pids_filepath = os.path.join(os.getcwd(), 'amatransformconfig.pid') py_pid = os.getpid() with open(pids_filepath, 'w') as f: f.write(str(py_pid) + '\n') watcher_thread = Thread(target = transformconfig_watcher, args = [hutil_log_error, hutil_log_info]) watcher_thread.start() watcher_thread.join() return 0, "" # Dictionary of operations strings to methods operations = {'Disable' : disable, 'Uninstall' : uninstall, 'Install' : install, 'Enable' : enable, 'Update' : update, 'Metrics' : metrics, 'Syslogconfig' : syslogconfig, 'Transformconfig' : transformconfig } def parse_context(operation): """ Initialize a HandlerUtil object for this operation. If the required modules have not been imported, this will return None. """ hutil = None if ('Utils.WAAgentUtil' in sys.modules and 'Utils.HandlerUtil' in sys.modules): try: logFileName = 'extension.log' hutil = HUtil.HandlerUtility(waagent.Log, waagent.Error, logFileName=logFileName) hutil.do_parse_context(operation) # As per VM extension team, we have to manage rotation for our extension.log # for now, this is our extension code, but to be moved to HUtil library. if not os.path.exists(AMAExtensionLogRotateFilePath): logrotateFilePath = os.path.join(os.getcwd(), 'azuremonitoragentextension.logrotate') copyfile(logrotateFilePath,AMAExtensionLogRotateFilePath) # parse_context may throw KeyError if necessary JSON key is not # present in settings except KeyError as e: waagent_log_error('Unable to parse context with error: ' \ '{0}'.format(e)) raise ParameterMissingException return hutil def set_os_arch(operation): """ Checks if the current system architecture is present in the SupportedArch set and replaces the package names accordingly """ global BundleFileName, SupportedArch current_arch = platform.machine() if current_arch in SupportedArch: # Replace the AMA package name according to architecture BundleFileName = BundleFileName.replace('x86_64', current_arch) # Rename the Arch appropriate metrics extension binary to MetricsExtension MetricsExtensionDir = os.path.join(os.getcwd(), 'MetricsExtensionBin') SupportedMEPath = os.path.join(MetricsExtensionDir, 'MetricsExtension_'+current_arch) vm_dist, vm_ver = find_vm_distro(operation) if current_arch == 'aarch64' and vm_dist.startswith('centos') and vm_ver.startswith('7'): SupportedMEPath += '_centos7' if os.path.exists(SupportedMEPath): os.rename(SupportedMEPath, os.path.join(MetricsExtensionDir, 'MetricsExtension')) # Cleanup unused ME binaries for f in os.listdir(MetricsExtensionDir): if f != 'MetricsExtension': os.remove(os.path.join(MetricsExtensionDir, f)) def find_package_manager(operation): """ Checks if the dist is debian based or centos based and assigns the package manager accordingly """ global PackageManager, PackageManagerOptions, BundleFileName dist, _ = find_vm_distro(operation) dpkg_set = set(["debian", "ubuntu"]) rpm_set = set(["oracle", "ol", "redhat", "centos", "red hat", "suse", "sles", "opensuse", "cbl-mariner", "mariner", "azurelinux", "rhel", "rocky", "alma", "amzn"]) for dpkg_dist in dpkg_set: if dist.startswith(dpkg_dist): PackageManager = "dpkg" # OK to replace the /etc/default/azuremonitoragent, since the placeholders gets replaced again. # Otherwise, the package manager prompts for action (Y/I/N/O/D/Z) [default=N] PackageManagerOptions = "--force-overwrite --force-confnew" BundleFileName = BundleFileNameDeb break for rpm_dist in rpm_set: if dist.startswith(rpm_dist): PackageManager = "rpm" # Same as above. PackageManagerOptions = "--force" BundleFileName = BundleFileNameRpm break if PackageManager == "": log_and_exit(operation, UnsupportedOperatingSystem, "The OS has neither rpm nor dpkg" ) def find_vm_distro(operation): """ Finds the Linux Distribution this vm is running on. """ vm_dist = vm_id = vm_ver = None parse_manually = False # platform commands used below aren't available after Python 3.6 if sys.version_info < (3,7): try: vm_dist, vm_ver, vm_id = platform.linux_distribution() except AttributeError: try: vm_dist, vm_ver, vm_id = platform.dist() except AttributeError: hutil_log_info("Falling back to /etc/os-release distribution parsing") # Some python versions *IF BUILT LOCALLY* (ex 3.5) give string responses (ex. 'bullseye/sid') to platform.dist() function # This causes exception in the method below. Thus adding a check to switch to manual parsing in this case try: temp_vm_ver = int(vm_ver.split('.')[0]) except: parse_manually = True else: parse_manually = True if (not vm_dist and not vm_ver) or parse_manually: # SLES 15 and others try: with open('/etc/os-release', 'r') as fp: for line in fp: if line.startswith('ID='): vm_dist = line.split('=')[1] vm_dist = vm_dist.split('-')[0] vm_dist = vm_dist.replace('\"', '').replace('\n', '') elif line.startswith('VERSION_ID='): vm_ver = line.split('=')[1] vm_ver = vm_ver.replace('\"', '').replace('\n', '') except: log_and_exit(operation, IndeterminateOperatingSystem, 'Indeterminate operating system') # initialize them to empty string so that .lower() is valid in case we weren't able to retrieve it # downstream callers expect a string and not NoneType if not vm_dist: vm_dist = "" if not vm_ver: vm_ver = "" return vm_dist.lower(), vm_ver.lower() def is_vm_supported_for_extension(operation): """ Checks if the VM this extension is running on is supported by AzureMonitorAgent Returns for platform.linux_distribution() vary widely in format, such as '7.3.1611' returned for a VM with CentOS 7, so the first provided digits must match The supported distros of the AzureMonitorLinuxAgent are allowed to utilize this VM extension. All other distros will get error code 51 """ if platform.machine() == 'aarch64': supported_dists = supported_distros.supported_dists_aarch64 else: supported_dists = supported_distros.supported_dists_x86_64 vm_supported = False vm_dist, vm_ver = find_vm_distro(operation) # Find this VM distribution in the supported list for supported_dist in list(supported_dists.keys()): if not vm_dist.startswith(supported_dist): continue # Check if this VM distribution version is supported vm_ver_split = vm_ver.split('.') for supported_ver in supported_dists[supported_dist]: supported_ver_split = supported_ver.split('.') # If vm_ver is at least as precise (at least as many digits) as # supported_ver and matches all the supported_ver digits, then # this VM is guaranteed to be supported vm_ver_match = True for idx, supported_ver_num in enumerate(supported_ver_split): try: supported_ver_num = int(supported_ver_num) vm_ver_num = int(vm_ver_split[idx]) except IndexError: vm_ver_match = False break if vm_ver_num != supported_ver_num: vm_ver_match = False break if vm_ver_match: vm_supported = True break if vm_supported: break return vm_supported, vm_dist, vm_ver def exit_if_vm_not_supported(operation): """ Check if this VM distro and version are supported by the AzureMonitorLinuxAgent. If VM is supported, find the package manager present in this distro If this VM is not supported, log the proper error code and exit. """ vm_supported, vm_dist, vm_ver = is_vm_supported_for_extension(operation) if not vm_supported: log_and_exit(operation, UnsupportedOperatingSystem, 'Unsupported operating system: ' \ '{0} {1}'.format(vm_dist, vm_ver)) return 0 def is_feature_enabled(feature): """ Checks if the feature is enabled in the current region """ feature_support_matrix = { 'useDynamicSSL' : ['all'], 'enableCMV2' : ['eastus2euap', 'northcentralus'] } featurePreviewFlagPath = PreviewFeaturesDirectory + feature if os.path.exists(featurePreviewFlagPath): return True featurePreviewDisabledFlagPath = PreviewFeaturesDirectory + feature + 'Disabled' if os.path.exists(featurePreviewDisabledFlagPath): return False _, region = get_azure_environment_and_region() if feature in feature_support_matrix.keys(): if region in feature_support_matrix[feature] or "all" in feature_support_matrix[feature]: return True return False def get_ssl_cert_info(operation): """ Get the appropriate SSL_CERT_DIR / SSL_CERT_FILE based on the Linux distro """ name = value = None distro, version = find_vm_distro(operation) for name in ['ubuntu', 'debian']: if distro.startswith(name): return 'SSL_CERT_DIR', '/etc/ssl/certs' for name in ['centos', 'redhat', 'red hat', 'oracle', 'ol', 'cbl-mariner', 'mariner', 'azurelinux', 'rhel', 'rocky', 'alma', 'amzn']: if distro.startswith(name): return 'SSL_CERT_FILE', '/etc/pki/tls/certs/ca-bundle.crt' for name in ['suse', 'sles', 'opensuse']: if distro.startswith(name): if version.startswith('12'): return 'SSL_CERT_DIR', '/var/lib/ca-certificates/openssl' elif version.startswith('15'): return 'SSL_CERT_DIR', '/etc/ssl/certs' log_and_exit(operation, GenericErrorCode, 'Unable to determine values for SSL_CERT_DIR or SSL_CERT_FILE') def copy_kqlextension_binaries(): kqlextension_bin_local_path = os.getcwd() + "/KqlExtensionBin/" kqlextension_bin = "/opt/microsoft/azuremonitoragent/bin/kqlextension/" kqlextension_runtimesbin = "/opt/microsoft/azuremonitoragent/bin/kqlextension/runtimes/" if os.path.exists(kqlextension_runtimesbin): # only for versions of AMA with .NET runtimes rmtree(kqlextension_runtimesbin) # only for versions with Kql .net cleanup .NET files as it is causing issues with AOT runtime for f in os.listdir(kqlextension_bin): if f != 'KqlExtension' and f != 'appsettings.json': os.remove(os.path.join(kqlextension_bin, f)) for f in os.listdir(kqlextension_bin_local_path): compare_and_copy_bin(kqlextension_bin_local_path + f, kqlextension_bin + f) def is_arc_installed(): """ Check if this is an Arc machine """ # Using systemctl to check this since Arc only supports VMs that have systemd check_arc = os.system('systemctl status himdsd 1>/dev/null 2>&1') return check_arc == 0 def get_arc_endpoint(): """ Find the endpoint for Arc IMDS """ endpoint_filepath = '/lib/systemd/system.conf.d/azcmagent.conf' endpoint = '' try: with open(endpoint_filepath, 'r') as f: data = f.read() endpoint = data.split("\"IMDS_ENDPOINT=")[1].split("\"\n")[0] except: hutil_log_error('Unable to load Arc IMDS endpoint from {0}'.format(endpoint_filepath)) return endpoint def get_imds_endpoint(): """ Find the appropriate endpoint (Azure or Arc) for IMDS """ azure_imds_endpoint = 'http://169.254.169.254/metadata/instance?api-version=2018-10-01' if (is_arc_installed()): hutil_log_info('Arc is installed, loading Arc-specific IMDS endpoint') imds_endpoint = get_arc_endpoint() if imds_endpoint: imds_endpoint += '/metadata/instance?api-version=2019-08-15' else: # Fall back to the traditional IMDS endpoint; the cloud domain and VM # resource id detection logic are resilient to failed queries to IMDS imds_endpoint = azure_imds_endpoint hutil_log_info('Falling back to default Azure IMDS endpoint') else: imds_endpoint = azure_imds_endpoint hutil_log_info('Using IMDS endpoint "{0}"'.format(imds_endpoint)) return imds_endpoint def get_azure_environment_and_region(): """ Retreive the Azure environment and region from Azure or Arc IMDS """ imds_endpoint = get_imds_endpoint() req = urllib.Request(imds_endpoint) req.add_header('Metadata', 'True') environment = region = None try: response = json.loads(urllib.urlopen(req).read().decode('utf-8', 'ignore')) if ('compute' in response): if ('azEnvironment' in response['compute']): environment = response['compute']['azEnvironment'].lower() if ('location' in response['compute']): region = response['compute']['location'].lower() except urlerror.HTTPError as e: hutil_log_error('Request to Metadata service URL failed with an HTTPError: {0}'.format(e)) hutil_log_error('Response from Metadata service: {0}'.format(e.read())) except Exception as e: hutil_log_error('Unexpected error from Metadata service: {0}'.format(e)) hutil_log_info('Detected environment: {0}, region: {1}'.format(environment, region)) return environment, region def run_command_and_log(cmd, check_error = True, log_cmd = True, log_output = True): """ Run the provided shell command and log its output, including stdout and stderr. The output should not contain any PII, but the command might. In this case, log_cmd should be set to False. """ exit_code, output = run_get_output(cmd, check_error, log_cmd) if log_cmd: hutil_log_info('Output of command "{0}": \n{1}'.format(cmd.rstrip(), output)) elif log_output: hutil_log_info('Output: \n{0}'.format(output)) if "cannot open Packages database" in output: # Install failures # External issue. Package manager db is either corrupt or needs cleanup # https://github.com/Azure/azure-marketplace/wiki/Extension-Build-Notes-Best-Practices#error-codes-and-messages-output-to-stderr exit_code = MissingDependency output += "Package manager database is in a bad state. Please recover package manager, db cache and try install again later." elif "Permission denied" in output: # Enable failures # https://github.com/Azure/azure-marketplace/wiki/Extension-Build-Notes-Best-Practices#error-codes-and-messages-output-to-stderr exit_code = MissingDependency return exit_code, output def run_command_with_retries_output(cmd, retries, retry_check, final_check = None, check_error = True, log_cmd = True, initial_sleep_time = InitialRetrySleepSeconds, sleep_increase_factor = 1): """ Caller provides a method, retry_check, to use to determine if a retry should be performed. This must be a function with two parameters: exit_code and output The final_check can be provided as a method to perform a final check after retries have been exhausted Logic used: will retry up to retries times with initial_sleep_time in between tries If the retry_check retuns True for retry_verbosely, we will try cmd with the standard -v verbose flag added """ try_count = 0 sleep_time = initial_sleep_time run_cmd = cmd run_verbosely = False while try_count <= retries: if run_verbosely: run_cmd = cmd + ' -v' exit_code, output = run_command_and_log(run_cmd, check_error, log_cmd) should_retry, retry_message, run_verbosely = retry_check(exit_code, output) if not should_retry: break try_count += 1 hutil_log_info(retry_message) time.sleep(sleep_time) sleep_time *= sleep_increase_factor if final_check is not None: exit_code = final_check(exit_code, output) return exit_code, output def is_dpkg_or_rpm_locked(exit_code, output): """ If dpkg is locked, the output will contain a message similar to 'dpkg status database is locked by another process' """ if exit_code != 0: dpkg_locked_search = r'^.*dpkg.+lock.*$' dpkg_locked_re = re.compile(dpkg_locked_search, re.M) if dpkg_locked_re.search(output): return True rpm_locked_search = r'^.*rpm.+lock.*$' rpm_locked_re = re.compile(rpm_locked_search, re.M) if rpm_locked_re.search(output): return True return False def retry_if_dpkg_or_rpm_locked(exit_code, output): """ Some commands fail because the package manager is locked (apt-get/dpkg only); this will allow retries on failing commands. """ retry_verbosely = False dpkg_or_rpm_locked = is_dpkg_or_rpm_locked(exit_code, output) if dpkg_or_rpm_locked: return True, 'Retrying command because package manager is locked.', \ retry_verbosely else: return False, '', False def final_check_if_dpkg_or_rpm_locked(exit_code, output): """ If dpkg or rpm is still locked after the retries, we want to return a specific error code """ dpkg_or_rpm_locked = is_dpkg_or_rpm_locked(exit_code, output) if dpkg_or_rpm_locked: exit_code = DPKGOrRPMLockedErrorCode return exit_code def get_settings(): """ Retrieve the configuration for this extension operation """ global SettingsDict public_settings = None protected_settings = None if HUtilObject is not None: public_settings = HUtilObject.get_public_settings() protected_settings = HUtilObject.get_protected_settings() elif SettingsDict is not None: public_settings = SettingsDict['public_settings'] protected_settings = SettingsDict['protected_settings'] else: SettingsDict = {} handler_env = get_handler_env() try: config_dir = str(handler_env['handlerEnvironment']['configFolder']) except: config_dir = os.path.join(os.getcwd(), 'config') seq_no = get_latest_seq_no() settings_path = os.path.join(config_dir, '{0}.settings'.format(seq_no)) try: with open(settings_path, 'r') as settings_file: settings_txt = settings_file.read() settings = json.loads(settings_txt) h_settings = settings['runtimeSettings'][0]['handlerSettings'] public_settings = h_settings['publicSettings'] SettingsDict['public_settings'] = public_settings except: hutil_log_error('Unable to load handler settings from ' \ '{0}'.format(settings_path)) if ('protectedSettings' in h_settings and 'protectedSettingsCertThumbprint' in h_settings and h_settings['protectedSettings'] is not None and h_settings['protectedSettingsCertThumbprint'] is not None): encoded_settings = h_settings['protectedSettings'] settings_thumbprint = h_settings['protectedSettingsCertThumbprint'] encoded_cert_path = os.path.join('/var/lib/waagent', '{0}.crt'.format( settings_thumbprint)) encoded_key_path = os.path.join('/var/lib/waagent', '{0}.prv'.format( settings_thumbprint)) decoded_settings = base64.standard_b64decode(encoded_settings) decrypt_cmd = 'openssl smime -inform DER -decrypt -recip {0} ' \ '-inkey {1}'.format(encoded_cert_path, encoded_key_path) try: session = subprocess.Popen([decrypt_cmd], shell = True, stdin = subprocess.PIPE, stderr = subprocess.STDOUT, stdout = subprocess.PIPE) output = session.communicate(decoded_settings) except OSError: pass protected_settings_str = output[0] if protected_settings_str is None: log_and_exit('Enable', GenericErrorCode, 'Failed decrypting protectedSettings') protected_settings = '' try: protected_settings = json.loads(protected_settings_str) except: hutil_log_error('JSON exception decoding protected settings') SettingsDict['protected_settings'] = protected_settings return public_settings, protected_settings def update_status_file(operation, exit_code, exit_status, message): """ Mimic HandlerUtil method do_status_report in case hutil method is not available Write status to status file """ handler_env = get_handler_env() try: extension_version = str(handler_env['version']) status_dir = str(handler_env['handlerEnvironment']['statusFolder']) except: extension_version = "1.0" status_dir = os.path.join(os.getcwd(), 'status') status_txt = [{ "version" : extension_version, "timestampUTC" : time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), "status" : { "name" : "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent", "operation" : operation, "status" : exit_status, "code" : exit_code, "formattedMessage" : { "lang" : "en-US", "message" : message } } }] status_json = json.dumps(status_txt) # Find the most recently changed config file and then use the # corresponding status file latest_seq_no = get_latest_seq_no() status_path = os.path.join(status_dir, '{0}.status'.format(latest_seq_no)) status_tmp = '{0}.tmp'.format(status_path) with open(status_tmp, 'w+') as tmp_file: tmp_file.write(status_json) os.rename(status_tmp, status_path) def get_handler_env(): """ Set and retrieve the contents of HandlerEnvironment.json as JSON """ global HandlerEnvironment if HandlerEnvironment is None: handler_env_path = os.path.join(os.getcwd(), 'HandlerEnvironment.json') try: with open(handler_env_path, 'r') as handler_env_file: handler_env_txt = handler_env_file.read() handler_env = json.loads(handler_env_txt) if type(handler_env) == list: handler_env = handler_env[0] HandlerEnvironment = handler_env except Exception as e: waagent_log_error(str(e)) return HandlerEnvironment def get_latest_seq_no(): """ Determine the latest operation settings number to use """ global SettingsSequenceNumber if SettingsSequenceNumber is None: handler_env = get_handler_env() try: config_dir = str(handler_env['handlerEnvironment']['configFolder']) except: config_dir = os.path.join(os.getcwd(), 'config') latest_seq_no = -1 cur_seq_no = -1 latest_time = None try: for dir_name, sub_dirs, file_names in os.walk(config_dir): for file_name in file_names: file_basename = os.path.basename(file_name) match = re.match(r'[0-9]{1,10}\.settings', file_basename) if match is None: continue cur_seq_no = int(file_basename.split('.')[0]) file_path = os.path.join(config_dir, file_name) cur_time = os.path.getmtime(file_path) if latest_time is None or cur_time > latest_time: latest_time = cur_time latest_seq_no = cur_seq_no except: pass if latest_seq_no < 0: latest_seq_no = 0 SettingsSequenceNumber = latest_seq_no return SettingsSequenceNumber def run_get_output(cmd, chk_err = False, log_cmd = True): """ Mimic waagent mothod RunGetOutput in case waagent is not available Run shell command and return exit code and output """ if 'Utils.WAAgentUtil' in sys.modules: # WALinuxAgent-2.0.14 allows only 2 parameters for RunGetOutput # If checking the number of parameters fails, pass 2 try: sig = inspect.signature(waagent.RunGetOutput) params = sig.parameters waagent_params = len(params) except: try: spec = inspect.getargspec(waagent.RunGetOutput) params = spec.args waagent_params = len(params) except: waagent_params = 2 if waagent_params >= 3: exit_code, output = waagent.RunGetOutput(cmd, chk_err, log_cmd) else: exit_code, output = waagent.RunGetOutput(cmd, chk_err) else: try: output = subprocess.check_output(cmd, stderr = subprocess.STDOUT, shell = True) exit_code = 0 except subprocess.CalledProcessError as e: exit_code = e.returncode output = e.output try: unicode_type = unicode # Python 2 except NameError: unicode_type = str # Python 3 if all(ord(c) < 128 for c in output) or isinstance(output, unicode_type): output = output.encode('utf-8') # On python 3, encode returns a byte object, so we must decode back to a string if sys.version_info >= (3,) and type(output) == bytes: output = output.decode('utf-8', 'ignore') return exit_code, output.strip() def init_waagent_logger(): """ Initialize waagent logger If waagent has not been imported, catch the exception """ try: waagent.LoggerInit('/var/log/waagent.log', '/dev/stdout', True) except Exception as e: print('Unable to initialize waagent log because of exception ' \ '{0}'.format(e)) def waagent_log_info(message): """ Log informational message, being cautious of possibility that waagent may not be imported """ if 'Utils.WAAgentUtil' in sys.modules: waagent.Log(message) else: print('Info: {0}'.format(message)) def waagent_log_error(message): """ Log error message, being cautious of possibility that waagent may not be imported """ if 'Utils.WAAgentUtil' in sys.modules: waagent.Error(message) else: print('Error: {0}'.format(message)) def hutil_log_info(message): """ Log informational message, being cautious of possibility that hutil may not be imported and configured """ if HUtilObject is not None: HUtilObject.log(message) else: print('Info: {0}'.format(message)) def hutil_log_error(message): """ Log error message, being cautious of possibility that hutil may not be imported and configured """ if HUtilObject is not None: HUtilObject.error(message) else: print('Error: {0}'.format(message)) def log_and_exit(operation, exit_code = GenericErrorCode, message = ''): """ Log the exit message and perform the exit """ if exit_code == 0: waagent_log_info(message) hutil_log_info(message) exit_status = 'success' else: waagent_log_error(message) hutil_log_error(message) exit_status = 'failed' if HUtilObject is not None: HUtilObject.do_exit(exit_code, operation, exit_status, str(exit_code), message) else: update_status_file(operation, str(exit_code), exit_status, message) sys.exit(exit_code) # Exceptions # If these exceptions are expected to be caught by the main method, they # include an error_code field with an integer with which to exit from main class AzureMonitorAgentForLinuxException(Exception): """ Base exception class for all exceptions; as such, its error code is the basic error code traditionally returned in Linux: 1 """ error_code = GenericErrorCode def get_error_message(self, operation): """ Return a descriptive error message based on this type of exception """ return '{0} failed with exit code {1}'.format(operation, self.error_code) class ParameterMissingException(AzureMonitorAgentForLinuxException): """ There is a missing parameter for the AzureMonitorLinuxAgent Extension """ error_code = MissingorInvalidParameterErrorCode def get_error_message(self, operation): return '{0} failed due to a missing parameter: {1}'.format(operation, self.error_code) if __name__ == '__main__' : main()

AzureMonitorAgent/agent.py (1,678 lines of code) (raw):