azurelinuxagent/ga/memorycontroller.py (111 lines of code) (raw):

# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+

import errno
import os
import re

from azurelinuxagent.common import logger
from azurelinuxagent.common.exception import CGroupsException
from azurelinuxagent.common.future import ustr
from azurelinuxagent.ga.cgroupcontroller import _CgroupController, CounterNotFound, MetricValue, MetricsCategory, \
    MetricsCounter, _REPORT_EVERY_HOUR


class _MemoryController(_CgroupController):
    """
    Common base for the cgroup v1 and v2 memory controllers.

    Subclasses provide get_memory_usage(), try_swap_memory_usage() and get_max_memory_usage(); this class
    provides the shared file-parsing helpers and the list of tracked metrics built from those methods.
    """

    def __init__(self, name, cgroup_path):
        super(_MemoryController, self).__init__(name, cgroup_path)
        # Number of times a missing counter has been logged; used to log that condition only once per instance.
        self._counter_not_found_error_count = 0

    def _get_flat_keyed_counter(self, file_name, counter_name):
        """
        Looks up a "<name> <integer>" entry in a flat-keyed file located in the cgroup directory.

        :param file_name: Name of the file, relative to self.path (e.g. 'memory.stat')
        :param counter_name: Exact name of the counter; it is matched literally at the beginning of a line
        :return: The counter value, or None if no line of the file matches
        :rtype: int or None
        :raises IOError/OSError: With errno ENOENT, re-raised unchanged when the file does not exist
        :raises CGroupsException: On any other failure while reading or parsing the file
        """
        try:
            # Build the pattern once instead of once per line; escape the name so that it is never interpreted
            # as a regular expression.
            counter_pattern = re.compile(r'{0}\s+(\d+)'.format(re.escape(counter_name)))
            with open(os.path.join(self.path, file_name)) as counters_file:
                for line in counters_file:
                    # match() anchors at the start of the line, so "rss" does not match "rss_huge" or "total_rss"
                    match = counter_pattern.match(line)
                    if match is not None:
                        return int(match.group(1))
        except (IOError, OSError) as e:
            if e.errno == errno.ENOENT:
                # Let callers distinguish "file does not exist" from other read errors
                raise
            raise CGroupsException("Failed to read {0}: {1}".format(file_name, ustr(e)))
        except Exception as e:
            raise CGroupsException("Failed to read {0}: {1}".format(file_name, ustr(e)))
        return None

    def _read_memory_parameter_as_int(self, file_name):
        """
        Reads the first line of the given cgroup parameter file (via _get_parameters) and converts it to int.

        :param file_name: Name of the parameter file (e.g. 'memory.peak')
        :return: The value read from the file
        :rtype: int
        :raises IOError/OSError: With errno ENOENT, re-raised unchanged
        :raises CGroupsException: On any other failure, including a value that cannot be converted to int
        """
        try:
            return int(self._get_parameters(file_name, first_line_only=True))
        except Exception as e:
            if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:  # pylint: disable=E1101
                raise
            raise CGroupsException("Exception while attempting to read {0}".format(file_name), e)

    def _get_memory_stat_counter(self, counter_name):
        """
        Gets the value for the provided counter in memory.stat

        :param counter_name: Name of the counter, e.g. 'rss' (v1) or 'anon' (v2)
        :return: Value of the counter
        :rtype: int
        :raises CounterNotFound: If memory.stat has no entry for the counter
        :raises IOError/OSError: With errno ENOENT if memory.stat does not exist
        :raises CGroupsException: On any other failure reading memory.stat
        """
        #
        # Sample file v1:
        #   # cat memory.stat
        #   cache 0
        #   rss 0
        #   rss_huge 0
        #   shmem 0
        #   mapped_file 0
        #   dirty 0
        #   writeback 0
        #   swap 0
        #   ...
        #
        # Sample file v2
        #   # cat memory.stat
        #   anon 0
        #   file 147140608
        #   kernel 1421312
        #   kernel_stack 0
        #   pagetables 0
        #   sec_pagetables 0
        #   percpu 130752
        #   sock 0
        #   ...
        #
        value = self._get_flat_keyed_counter('memory.stat', counter_name)
        if value is None:
            raise CounterNotFound("Cannot find counter: {0}".format(counter_name))
        return value

    def get_memory_usage(self):
        """
        Collects anon and cache memory usage for the cgroup and returns them as a tuple -> (anon, cache)

        Must be implemented by the version-specific subclasses.

        :return: Anon and cache memory usage in bytes
        :rtype: tuple[int, int]
        """
        raise NotImplementedError()

    def try_swap_memory_usage(self):
        """
        Collects swap usage for the cgroup

        Must be implemented by the version-specific subclasses.

        :return: Memory usage in bytes
        :rtype: int
        """
        raise NotImplementedError()

    def get_max_memory_usage(self):
        """
        Collect max memory usage for the cgroup.

        Must be implemented by the version-specific subclasses.

        :return: Memory usage in bytes
        :rtype: int
        """
        raise NotImplementedError()

    def get_tracked_metrics(self, **_):
        """
        Returns the memory metrics for this cgroup: total, anon, cache, max and swap usage.

        Keyword arguments are accepted and ignored. Max and swap usage are created with _REPORT_EVERY_HOUR.

        :return: List of MetricValue
        """
        # The log collector monitor tracks anon and cache memory separately.
        anon_mem_usage, cache_mem_usage = self.get_memory_usage()
        total_mem_usage = anon_mem_usage + cache_mem_usage
        return [
            MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name, total_mem_usage),
            MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.ANON_MEM_USAGE, self.name, anon_mem_usage),
            MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.CACHE_MEM_USAGE, self.name, cache_mem_usage),
            MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name,
                        self.get_max_memory_usage(), _REPORT_EVERY_HOUR),
            MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name,
                        self.try_swap_memory_usage(), _REPORT_EVERY_HOUR)
        ]

    def get_unit_properties(self):
        """
        Returns the names of the unit properties associated with this controller.
        """
        return ["MemoryAccounting"]

    def get_controller_type(self):
        """
        Returns the controller type name, "memory".
        """
        return "memory"


class MemoryControllerV1(_MemoryController):
    """
    Memory controller for cgroup v1.
    """

    def get_memory_usage(self):
        """
        Returns anon and cache memory usage, in that order, as read from memory.stat.

        :rtype: tuple[int, int]
        """
        # In v1, anon memory is reported in the 'rss' counter
        return self._get_memory_stat_counter("rss"), self._get_memory_stat_counter("cache")

    def try_swap_memory_usage(self):
        """
        Returns the 'swap' counter of memory.stat, or 0 if memory.stat has no such counter.

        A missing counter is logged only the first time it is seen on this instance. Any other error raised by
        _get_memory_stat_counter is propagated.

        :rtype: int
        """
        # In v1, swap memory should be collected from memory.stat, because memory.memsw.usage_in_bytes reports total Memory+SWAP.
        try:
            return self._get_memory_stat_counter("swap")
        except CounterNotFound as e:
            if self._counter_not_found_error_count < 1:
                logger.periodic_info(logger.EVERY_HALF_HOUR,
                                     '{0} from "memory.stat" file in the cgroup: {1}---[Note: This log for informational purpose only and can be ignored]'.format(ustr(e), self.path))
                self._counter_not_found_error_count += 1
            return 0

    def get_max_memory_usage(self):
        """
        Returns the max memory usage of the cgroup.

        :rtype: int
        :raises IOError/OSError: With errno ENOENT
        :raises CGroupsException: On any other failure reading the value
        """
        # In v1, max memory usage is reported in memory.max_usage_in_bytes
        return self._read_memory_parameter_as_int('memory.max_usage_in_bytes')


class MemoryControllerV2(_MemoryController):
    """
    Memory controller for cgroup v2.
    """

    def get_memory_usage(self):
        """
        Returns anon and cache memory usage, in that order, as read from memory.stat.

        :rtype: tuple[int, int]
        """
        # In v2, cache memory is reported in the 'file' counter
        return self._get_memory_stat_counter("anon"), self._get_memory_stat_counter("file")

    def get_memory_throttled_events(self):
        """
        Returns the number of times processes of the cgroup are throttled and routed to perform memory reclaim
        because the high memory boundary was exceeded.

        :return: Number of memory throttling events for the cgroup
        :rtype: int
        :raises CounterNotFound: If memory.events has no 'high' entry
        :raises IOError/OSError: With errno ENOENT if memory.events does not exist
        :raises CGroupsException: On any other failure reading memory.events
        """
        #
        # Sample file:
        #   # cat memory.events
        #   low 0
        #   high 0
        #   max 0
        #   oom 0
        #   oom_kill 0
        #   oom_group_kill 0
        #
        throttled_events = self._get_flat_keyed_counter('memory.events', 'high')
        if throttled_events is None:
            raise CounterNotFound("Cannot find memory.events counter: high")
        return throttled_events

    def try_swap_memory_usage(self):
        """
        Returns the swap usage of the cgroup.

        :rtype: int
        :raises IOError/OSError: With errno ENOENT
        :raises CGroupsException: On any other failure reading the value
        """
        # In v2, swap memory is reported in memory.swap.current
        return self._read_memory_parameter_as_int('memory.swap.current')

    def get_max_memory_usage(self):
        """
        Returns the max memory usage of the cgroup.

        :rtype: int
        :raises IOError/OSError: With errno ENOENT
        :raises CGroupsException: On any other failure reading the value
        """
        # In v2, max memory usage is reported in memory.peak
        return self._read_memory_parameter_as_int('memory.peak')

    def get_tracked_metrics(self, **_):
        """
        Returns the metrics tracked by the base class plus the number of memory throttling events.

        Keyword arguments are accepted and ignored.

        :return: List of MetricValue
        """
        metrics = super(MemoryControllerV2, self).get_tracked_metrics()
        throttled_value = MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MEM_THROTTLED, self.name,
                                      self.get_memory_throttled_events())
        metrics.append(throttled_value)
        return metrics