common/recipes-core/fscd3/fscd/fscd.py (744 lines of code) (raw):

#!/usr/bin/env python3
#
# Copyright 2015-present Facebook. All Rights Reserved.
#
# This program file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program in a file named COPYING; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA
#
import ctypes
import datetime
import json
import os.path
import signal
import sys
import threading
import time
import traceback
from contextlib import contextmanager

import fsc_expr
import kv
from fsc_bmcmachine import BMCMachine
from fsc_board import board_callout, board_fan_actions, board_host_actions
from fsc_common_var import fan_mode
from fsc_profile import Sensor, profile_constructor
from fsc_sensor import FscSensorSourceUtil, FscSensorSourceJson
from fsc_util import Logger, clamp
from fsc_zone import Fan, Zone, BoardFanMode

try:
    libwatchdog = ctypes.CDLL("libwatchdog.so")
except OSError:
    # Sometimes libwatchdog is only available as libwatchdog.so.0
    libwatchdog = ctypes.CDLL("libwatchdog.so.0")

RECORD_DIR = "/tmp/cache_store/"
SENSOR_FAIL_RECORD_DIR = "/tmp/sensorfail_record/"
FAN_FAIL_RECORD_DIR = "/tmp/fanfail_record/"
RAMFS_CONFIG = "/etc/fsc-config.json"
CONFIG_DIR = "/etc/fsc"
FAN_DEAD_REARM_KEY = "fan_dead_rearm"
# Enable the following for testing only
# RAMFS_CONFIG = '/tmp/fsc-config.json'
# CONFIG_DIR = '/tmp'
DEFAULT_INIT_BOOST = 100
DEFAULT_INIT_TRANSITIONAL = 70

# Seconds between two watchdog kicks issued by the background thread.
_WATCHDOG_KICK_INTERVAL_S = 5


class LibWatchdogError(Exception):
    """Raised when a libwatchdog call returns a non-zero status."""

    pass


def _epoch_now():
    """Return the current time as integer seconds since the epoch."""
    # time.time() is used rather than strftime("%s"): "%s" is not one of the
    # directives Python documents as portable.
    return int(time.time())


def _touch_record(path):
    """Create (or truncate) an empty marker file at *path*."""
    # The context manager closes the handle even if something fails between
    # open and close.
    with open(path, "w"):
        pass


def open_watchdog():
    """
    Open the watchdog.
    Once watchdog is opened by fscd, other processes won't be able to control
    watchdog until watchdog is closed by fscd.

    Raises LibWatchdogError if libwatchdog reports a failure.
    """
    if libwatchdog.open_watchdog(0, 0) != 0:
        raise LibWatchdogError("Failed to open watchdog")


def release_watchdog():
    """
    Close the watchdog opened by open_watchdog().

    Raises LibWatchdogError if libwatchdog reports a failure.
    """
    if libwatchdog.release_watchdog() != 0:
        raise LibWatchdogError("Failed to release watchdog")


def kick_watchdog():
    """Kick the watchdog device; raises LibWatchdogError on failure."""
    if libwatchdog.kick_watchdog() != 0:
        raise LibWatchdogError("Failed to kick watchdog")


def stop_watchdog():
    """Stop the watchdog timer; raises LibWatchdogError on failure."""
    ret = libwatchdog.stop_watchdog()
    if ret != 0:
        raise LibWatchdogError("stop_watchdog() returned " + str(ret))


def fscd_setup_watchdog():
    """
    Open the watchdog and start a thread to kick the watchdog periodically.
    """
    # Note: we will not release the watchdog until fscd exits, and it
    # also means other processes won't be able to control watchdog unless
    # killing fscd.
    open_watchdog()

    # An extra kicking is not necessary, but it doesn't hurt to do so.
    kick_watchdog()

    #
    # XXX is it a good idea to kick watchdog in a separate thread???
    # If fscd main thread fscd.run() got stuck, then BMC will be running
    # without thermal control, and watchdog cannot help us in this case.
    #
    _WATCHDOG_THREAD.start()


def fscd_release_watchdog(stop_wdt=False):
    """
    Stop the kicker thread (if it is running) and release the watchdog.

    stop_wdt: when true, the watchdog timer itself is stopped before the
    device is released.
    """
    if _WATCHDOG_THREAD.is_alive():
        Logger.info("Stopping watchdog thread")
        _WATCHDOG_STOP.set()
        _WATCHDOG_THREAD.join()
        Logger.info("Watchdog thread stopped")
    if stop_wdt:
        stop_watchdog()
        Logger.info("watchdog stopped")
    release_watchdog()


## Watchdog thread definition, see fscd_{setup,release}_watchdog() above
def _watchdog_thread_f():
    """Kick the watchdog periodically until _WATCHDOG_STOP is set."""
    while not _WATCHDOG_STOP.is_set():
        kick_watchdog()
        # Waiting on the event (instead of an unconditional sleep) lets
        # fscd_release_watchdog() join this thread without waiting out the
        # remainder of the interval.
        _WATCHDOG_STOP.wait(_WATCHDOG_KICK_INTERVAL_S)


_WATCHDOG_THREAD = threading.Thread(target=_watchdog_thread_f, daemon=True)
_WATCHDOG_STOP = threading.Event()


class Fscd(object):
    """
    Fan speed controller daemon.

    Reads the JSON configuration, builds sensors/profiles/fans/zones from it
    and then periodically computes and applies a PWM value per zone.
    """

    DEFAULT_BOOST = 100
    DEFAULT_BOOST_TYPE = "default"
    DEFAULT_TRANSITIONAL = 70
    DEFAULT_RAMP_RATE = 10

    def __init__(
        self, config=RAMFS_CONFIG, zone_config=CONFIG_DIR, log_level="warning"
    ):
        """
        config: path of the JSON configuration file.
        zone_config: directory holding the zone expression files.
        log_level: level name handed to Logger.start().
        """
        Logger.start("fscd", log_level)
        Logger.info("Starting fscd")
        self.zone_config = zone_config
        self.fsc_config = self.get_fsc_config(config)  # json dump from config
        self.boost = self.DEFAULT_BOOST
        self.non_fanfail_limited_boost = None
        self.boost_type = self.DEFAULT_BOOST_TYPE
        self.transitional = self.DEFAULT_TRANSITIONAL
        self.ramp_rate = self.DEFAULT_RAMP_RATE
        self.sensor_fail = None
        self.ssd_progressive_algorithm = None
        self.sensor_valid_check = None
        self.fail_sensor_type = None
        self.fan_dead_boost = None
        self.fan_fail = None
        self.fan_recovery_pending = False
        self.fan_recovery_time = None
        self.fan_limit_upper_pwm = None
        self.fan_limit_lower_pwm = None
        self.sensor_filter_all = False
        self.sensor_fail_ignore = False
        self.pwm_sensor_boost_value = None
        self.output_max_boost_pwm = False
        self.board_fan_mode = BoardFanMode()
        self.need_rearm = False
        # Always defined so the all-fans-failed bookkeeping never depends on
        # which configuration keys happened to be present.
        self.all_fan_fail_counter = 0

    # TODO: Add checks for invalid config file path
    def get_fsc_config(self, fsc_config):
        """
        Load and return the JSON configuration stored at *fsc_config*.

        Returns None (after logging a warning) when the path is not a
        regular file.
        """
        if not os.path.isfile(fsc_config):
            Logger.warn("Configuration file %s not found" % (fsc_config))
            return None
        Logger.info("Started, reading configuration from %s" % (fsc_config))
        with open(fsc_config, "r") as f:
            return json.load(f)

    def get_config_params(self):
        """
        Copy the tunables from the loaded configuration onto the instance.

        "pwm_transition_value", "pwm_boost_value", "watchdog" and
        "sample_interval_ms" are mandatory (KeyError when missing); every
        other key is optional. When "watchdog" is true the watchdog is
        opened and the kicker thread is started.
        """
        cfg = self.fsc_config
        boost_cfg = cfg["boost"] if "boost" in cfg else {}

        self.transitional = cfg["pwm_transition_value"]
        self.boost = cfg["pwm_boost_value"]
        if "fan_limit_upper_pwm" in cfg:
            self.fan_limit_upper_pwm = cfg["fan_limit_upper_pwm"]
        if "fan_limit_lower_pwm" in cfg:
            self.fan_limit_lower_pwm = cfg["fan_limit_lower_pwm"]
        if "non_fanfail_limited_boost_value" in cfg:
            self.non_fanfail_limited_boost = cfg["non_fanfail_limited_boost_value"]
        self.sensor_filter_all = cfg.get("sensor_filter_all", False)
        self.sensor_fail_ignore = cfg.get("sensor_fail_ignore", False)
        if "fan_fail" in boost_cfg:
            self.fan_fail = boost_cfg["fan_fail"]
        if "progressive" in boost_cfg:
            if boost_cfg["progressive"]:
                self.boost_type = "progressive"
        if "fan_dead_boost" in cfg:
            self.fan_dead_boost = cfg["fan_dead_boost"]
            self.all_fan_fail_counter = 0
        if "output_max_boost_pwm" in cfg:
            self.output_max_boost_pwm = cfg["output_max_boost_pwm"]
        if "sensor_fail" in boost_cfg:
            self.sensor_fail = boost_cfg["sensor_fail"]
            if self.sensor_fail:
                if "pwm_sensor_boost_value" in cfg:
                    self.pwm_sensor_boost_value = cfg["pwm_sensor_boost_value"]
                if "fail_sensor_type" in cfg:
                    self.fail_sensor_type = cfg["fail_sensor_type"]
                if "ssd_progressive_algorithm" in cfg:
                    self.ssd_progressive_algorithm = cfg["ssd_progressive_algorithm"]
        if "sensor_valid_check" in cfg:
            self.sensor_valid_check = cfg["sensor_valid_check"]
        self.watchdog = cfg["watchdog"]
        self.fanpower = cfg.get("fanpower", False)
        self.chassis_intrusion = cfg.get("chassis_intrusion", False)
        self.enable_fsc_sensor_check = cfg.get("enable_fsc_sensor_check", False)
        if "ramp_rate" in cfg:
            self.ramp_rate = cfg["ramp_rate"]
        if self.watchdog:
            Logger.info("watchdog pinging enabled")
            fscd_setup_watchdog()
        # The configuration gives the interval in milliseconds.
        self.interval = cfg["sample_interval_ms"] / 1000.0
        if "fan_recovery_time" in cfg:
            self.fan_recovery_time = cfg["fan_recovery_time"]

    def build_profiles(self):
        """Build self.sensors and self.profiles from the "profiles" section."""
        self.sensors = {}
        self.profiles = {}
        for name, pdata in list(self.fsc_config["profiles"].items()):
            sensor = Sensor(name, pdata)
            if isinstance(sensor.source, FscSensorSourceJson):
                self.machine.extra_sensors[name] = sensor
            self.sensors[name] = sensor
            self.profiles[name] = profile_constructor(pdata)

    def build_fans(self):
        """Build self.fans from the "fans" section of the configuration."""
        self.fans = {}
        for name, pdata in list(self.fsc_config["fans"].items()):
            self.fans[name] = Fan(name, pdata)

    def build_zones(self):
        """
        Compile every zone expression file and build self.zones.

        Each external variable of an expression is named "board:sensor" or
        "board:sensor:num"; the boards are registered in self.machine.frus
        and the optional third field is collected in self.machine.nums.
        """
        self.zones = []
        counter = 0
        for _zone_name, data in list(self.fsc_config["zones"].items()):
            filename = data["expr_file"]
            with open(os.path.join(self.zone_config, filename), "r") as exf:
                source = exf.read()
            Logger.info("Compiling FSC expression for zone:")
            Logger.info(source)
            (expr, inf) = fsc_expr.make_eval_tree(source, self.profiles)
            for ext_var in inf["ext_vars"]:
                sdata = ext_var.split(":")
                board = sdata[0]
                # sname never used. so comment out (avoid lint error)
                # sname = sdata[1]
                if board not in self.machine.frus:
                    self.machine.nums[board] = []
                    self.machine.frus.add(board)
                if len(sdata) == 3:
                    self.machine.nums[board].append(sdata[2])
            zone = Zone(
                data["pwm_output"],
                expr,
                inf,
                self.transitional,
                counter,
                self.boost,
                self.sensor_fail,
                self.sensor_valid_check,
                self.fail_sensor_type,
                self.ssd_progressive_algorithm,
                self.sensor_fail_ignore,
            )
            counter += 1
            self.zones.append(zone)

    def build_machine(self):
        """Create the BMCMachine object used to read sensors and set PWMs."""
        self.machine = BMCMachine()

    def fsc_fan_action(self, fan, action):
        """
        Method invokes board actions for a fan.

        "dead" triggers the dead action and the red LED, "recover" triggers
        the recover action and the blue LED.
        """
        if "dead" in action:
            board_fan_actions(fan, action="dead")
            board_fan_actions(fan, action="led_red")
        if "recover" in action:
            board_fan_actions(fan, action="recover")
            board_fan_actions(fan, action="led_blue")

    def fsc_host_action(self, action, cause):
        """
        Invoke the board host action named in *action*.

        Only "host_shutdown" is handled; any other action returns None.
        """
        if "host_shutdown" in action:
            return board_host_actions(action="host_shutdown", cause=cause)
            # board_fan_actions(fan, action='led_blue')

    def fsc_set_all_fan_led(self, color):
        """Apply the board fan action *color* to every configured fan."""
        for fan in list(self.fans.values()):
            board_fan_actions(fan, action=color)

    def fsc_safe_guards(self, sensors_tuples):
        """
        Method defines safe guards for fsc.
        Examples: Triggers board action when sensor temp read reaches limits
        configured in json

        sensors_tuples: mapping fru -> {sensor key -> reading}; each reading
        is used through its name, value, wrong_read_counter and
        read_fail_counter attributes.
        """
        profiles = self.fsc_config["profiles"]
        for fru in self.machine.frus:
            for sensor, reading in list(sensors_tuples[fru].items()):
                if reading.name not in profiles:
                    continue
                if "read_limit" not in profiles[reading.name]:
                    continue
                read_limit = profiles[reading.name]["read_limit"]
                source = self.sensors[reading.name].source
                is_util_source = isinstance(source, FscSensorSourceUtil)

                # If temperature read exceeds acceptable temperature reading
                if "valid" in read_limit:
                    valid_table = read_limit["valid"]
                    valid_read_limit = valid_table["limit"]
                    valid_read_action = valid_table["action"]
                    valid_read_th = valid_table["threshold"]
                    # Use dict.get(key, None) avoid exception when no configuration
                    valid_fault_tolerant = valid_table.get("fault_tolerant", None)
                    if is_util_source:
                        if reading.value is None:
                            source.read_source_fail_counter += 1
                        else:
                            source.read_source_fail_counter = 0
                            if reading.value > valid_read_limit:
                                reason = "%s(v=%s) limit(t=%s) reached" % (
                                    sensor,
                                    reading.value,
                                    valid_read_limit,
                                )
                                if valid_fault_tolerant:
                                    Logger.info(
                                        "%s without action since fault_tolerant is enabled"
                                        % (reason)
                                    )
                                    continue
                                self.fsc_host_action(
                                    action=valid_read_action, cause=reason
                                )
                    elif reading.value is not None:
                        # A failed read (None) cannot be compared against the
                        # limit; it is left to the "invalid" handling below.
                        if reading.value > valid_read_limit:
                            if reading.wrong_read_counter < valid_read_th:
                                source.read_source_wrong_counter += 1
                                Logger.warn(
                                    "inlet_temp v=%d, and counter=%d"
                                    % (reading.value, reading.wrong_read_counter)
                                )
                                continue
                            reason = "%s(v=%s) limit(t=%s) reached" % (
                                sensor,
                                reading.value,
                                valid_read_limit,
                            )
                            self.fsc_host_action(
                                action=valid_read_action, cause=reason
                            )
                        source.read_source_wrong_counter = 0

                # If temperature read fails
                if "invalid" in read_limit:
                    invalid_table = read_limit["invalid"]
                    invalid_read_th = invalid_table["threshold"]
                    invalid_read_action = invalid_table["action"]
                    if is_util_source:
                        read_fail_counter = source.read_source_fail_counter
                    else:
                        read_fail_counter = reading.read_fail_counter
                    if read_fail_counter >= invalid_read_th:
                        reason = "%s(value=%s) failed to read %s times" % (
                            sensor,
                            reading.value,
                            read_fail_counter,
                        )
                        self.fsc_host_action(
                            action=invalid_read_action, cause=reason
                        )

    def fsc_sensor_check(self, sensors_tuples):
        """
        Monitor sensor temperature value.

        Checks every reading against the "alarm_major" / "alarm_minor"
        limits of its profile. For alarm_minor, an optional "soak_time_s"
        keeps the alarm asserted for that long after the last violation and
        an optional "hysteresis" re-arms the soak period while the value
        stays above (limit - hysteresis).

        Returns 0 when everything is normal, 1 when at least one sensor
        violates its limit (the caller then boosts the fans).
        """
        ret = 0
        profiles = self.fsc_config["profiles"]
        for fru in self.machine.frus:
            for sensor, reading in list(sensors_tuples[fru].items()):
                if reading.value is None:
                    # Skip sensor if the reading fail
                    continue
                if reading.name not in profiles:
                    continue
                source = self.sensors[reading.name].source
                last_error_time = source.last_error_time
                last_error_level = source.last_error_level
                if "read_limit" not in profiles[reading.name]:
                    continue
                read_limit = profiles[reading.name]["read_limit"]

                if "alarm_major" in read_limit:
                    # Get value from configuration
                    valid_table = read_limit["alarm_major"]
                    valid_read_limit = valid_table["limit"]
                    # Compare temp after offset with threshold value
                    if reading.value >= valid_read_limit:
                        reason = "%s(alarm_major v=%s) limit(t=%s) reached" % (
                            sensor,
                            reading.value,
                            valid_read_limit,
                        )
                        # change error flag and update last time error for
                        # Soak time feature
                        last_error_level = "alarm_major"
                        source.last_error_time = _epoch_now()
                        # Do action if exist in configuration
                        if "action" in valid_table:
                            self.fsc_host_action(
                                action=valid_table["action"], cause=reason
                            )
                        Logger.warn(reason)
                        ret = 1
                        # Skip alarm minor
                        # NOTE(review): this also skips the store of
                        # last_error_level at the end of the loop body, so
                        # "alarm_major" is never persisted on the source --
                        # confirm whether that is intended.
                        continue

                if "alarm_minor" in read_limit:
                    # Get value from configuration
                    valid_table = read_limit["alarm_minor"]
                    valid_read_limit = valid_table["limit"]
                    # Compare temp after offset with threshold value
                    if reading.value >= valid_read_limit:
                        reason = "%s(alarm_minor v=%s) limit(t=%s) reached" % (
                            sensor,
                            reading.value,
                            valid_read_limit,
                        )
                        # change error flag and update last time error for
                        # Soak time feature
                        if last_error_level is None:
                            last_error_level = "alarm_minor"
                        source.last_error_time = _epoch_now()
                        # Do action if exist in configuration
                        if "action" in valid_table:
                            self.fsc_host_action(
                                action=valid_table["action"], cause=reason
                            )
                        Logger.warn(reason)
                        ret = 1
                    elif "soak_time_s" in valid_table and last_error_level is not None:
                        elapsed_time = _epoch_now() - last_error_time
                        if elapsed_time < valid_table["soak_time_s"]:
                            reason = (
                                "%s(alarm_minor elapsed_time = %s,soak_time = %s) reached"
                                % (sensor, elapsed_time, valid_table["soak_time_s"])
                            )
                            if "action" in valid_table:
                                self.fsc_host_action(
                                    action=valid_table["action"], cause=reason
                                )
                            Logger.warn(reason)
                            ret = 1
                        elif "hysteresis" in valid_table:
                            valid_hysteresis = abs(valid_table["hysteresis"])
                            target = valid_read_limit - valid_hysteresis
                            if reading.value > target:
                                reason = (
                                    "%s(alarm_minor current v=%s) target(t=%s)"
                                    " soak_count (n=%s) repeating"
                                    % (
                                        sensor,
                                        reading.value,
                                        target,
                                        source.soak_repeat_counter + 1,
                                    )
                                )
                                # Restart the soak period.
                                source.last_error_time = _epoch_now()
                                source.soak_repeat_counter += 1
                                Logger.warn(reason)
                                ret = 1
                            else:
                                last_error_level = None
                        else:
                            last_error_level = None

                source.last_error_level = last_error_level
        return ret

    def check_fan_rearm(self):
        """
        Return True when a re-arm was requested through the kv store.

        The request key is reset to "0" once it has been consumed; a missing
        key counts as "no request".
        """
        try:
            status = kv.kv_get(FAN_DEAD_REARM_KEY, 1)
        except kv.KeyNotFoundFailure:
            return False
        if status == "1":
            kv.kv_set(FAN_DEAD_REARM_KEY, "0", 1)
            return True
        return False

    def update_dead_fans(self, dead_fans):
        """
        Check for dead and recovered fans.

        dead_fans: set of fans considered dead after the previous cycle; it
        is updated in place and also returned. A fan is dead when its RPM is
        below the configured "min_rpm". A marker file per dead fan is kept
        under FAN_FAIL_RECORD_DIR and removed on recovery.
        """
        last_dead_fans = dead_fans.copy()
        speeds = self.machine.read_fans(self.fans)
        for fan, rpms in list(speeds.items()):
            Logger.info("%s speed: %d RPM" % (fan.label, rpms))
            if rpms < self.fsc_config["min_rpm"]:
                dead_fans.add(fan)
                self.fsc_fan_action(fan, action="dead")
            else:
                dead_fans.discard(fan)
        recovered_fans = last_dead_fans - dead_fans
        newly_dead_fans = dead_fans - last_dead_fans

        # With fan power control enabled the events are logged as warnings,
        # otherwise as critical.
        log = Logger.warn if self.fanpower else Logger.crit

        if len(newly_dead_fans) > 0 or (self.need_rearm and len(dead_fans) > 0):
            log("%d fans failed" % (len(dead_fans),))
            for dead_fan in dead_fans:
                log("%s dead, %d RPM" % (dead_fan.label, speeds[dead_fan]))
                Logger.usbdbg("%s fail" % (dead_fan.label))
                fan_fail_record_path = FAN_FAIL_RECORD_DIR + "%s" % (dead_fan.label)
                if not os.path.isfile(fan_fail_record_path):
                    try:
                        _touch_record(fan_fail_record_path)
                    except FileNotFoundError:
                        Logger.warn(
                            "Cannot create failure record for %s" % (dead_fan.label)
                        )
        for fan in recovered_fans:
            log("%s has recovered" % (fan.label,))
            Logger.usbdbg("%s recovered" % (fan.label))
            self.fsc_fan_action(fan, action="recover")
            fan_fail_record_path = FAN_FAIL_RECORD_DIR + "%s" % (fan.label)
            if os.path.isfile(fan_fail_record_path):
                os.remove(fan_fail_record_path)
        return dead_fans

    def update_zones(self, dead_fans, time_difference):
        """
        Compute and apply the PWM of every zone for the current cycle.

        dead_fans: set of fans currently considered dead.
        time_difference: seconds elapsed since the previous cycle; handed to
        the zone expression as ctx["dt"].

        Per zone: read sensors and run the safe guards, evaluate the zone
        expression (or go straight to boost on chassis intrusion / sensor
        violation), apply fan-failure boosting, clamp to the configured
        limits, limit the change to ramp_rate and finally write the PWM and
        the fan mode.

        TODO: Need to change logic here.

        # Platforms with chassis_intrusion mode enabled
        if chassis_intrusion:
            set the chassis_intrusion_boost_flag to 0
            and then do necessary checks to set flag to 1
            if chassis_intrusion_boost_flag:
                run boost mode
            else:
                run normal mode
        else
            # Platforms WITHOUT chassis_intrusion mode
            run normal mode

        # Platforms with enable_fsc_sensor_check mode enabled
        if enable_fsc_sensor_check:
            set the sensor_violated_flag to 0
            and then do necessary checks to set flag to 1
            if sensor_violated_flag:
                run boost mode
            else:
                run normal mode
        else
            # Platforms WITHOUT enable_fsc_sensor_check mode
            run normal mode
        """
        self.need_rearm = self.check_fan_rearm()
        ctx = {}
        if not self.sensor_filter_all:
            # Read every sensor once and share the result between all zones.
            sensors_tuples = self.machine.read_sensors(self.sensors, None)
            self.fsc_safe_guards(sensors_tuples)
        for zone in self.zones:
            if self.need_rearm:
                zone.transitional_assert_flag = False
                zone.missing_sensor_assert_flag = [False] * len(
                    zone.expr_meta["ext_vars"]
                )
            if self.sensor_filter_all:
                # Only read the sensors referenced by this zone's expression.
                sensors_tuples = self.machine.read_sensors(self.sensors, zone.expr_meta)
                self.fsc_safe_guards(sensors_tuples)
            Logger.info("PWM: %s" % (json.dumps(zone.pwm_output)))

            mode = 0
            chassis_intrusion_boost_flag = 0
            sensor_violated_flag = 0
            if self.chassis_intrusion:
                self_tray_pull_out = board_callout(callout="chassis_intrusion")
                if self_tray_pull_out == 1:
                    chassis_intrusion_boost_flag = 1
            if self.enable_fsc_sensor_check:
                Logger.info("enable_fsc_sensor_check")
                if self.fsc_sensor_check(sensors_tuples) != 0:
                    sensor_violated_flag = 1

            Logger.debug(" dead_fans(%d) " % len(dead_fans))
            Logger.debug("Calculate")

            if chassis_intrusion_boost_flag == 0 and sensor_violated_flag == 0:
                ctx["dt"] = time_difference
                ctx["dead_fans"] = dead_fans
                ctx["last_pwm"] = zone.last_pwm
                ignore_fan_mode = False
                if self.non_fanfail_limited_boost and dead_fans:
                    ignore_fan_mode = True
                pwmval = zone.run(
                    sensors=sensors_tuples, ctx=ctx, ignore_mode=ignore_fan_mode
                )
                mode = zone.get_set_fan_mode(mode, action="read")
                # if we set pwm_sensor_boost_value option, assign it to pwmval
                if (
                    self.pwm_sensor_boost_value is not None
                    and int(mode) == fan_mode["boost_mode"]
                ):
                    if pwmval == self.boost:
                        pwmval = self.pwm_sensor_boost_value
            else:
                pwmval = self.boost
                mode = fan_mode["boost_mode"]

            if self.fan_fail:
                boost_record_path = RECORD_DIR + "fan_fail_boost"
                if self.boost_type == "progressive" and self.fan_dead_boost:
                    # Cases where we want to progressively bump PWMs
                    dead = len(dead_fans)
                    if dead > 0:
                        Logger.info(
                            "Progressive mode: Failed fans: %s"
                            % (", ".join([str(i.label) for i in dead_fans]))
                        )
                        for fan_count, rate in self.fan_dead_boost["data"]:
                            if dead <= fan_count:
                                pwmval = clamp(pwmval + (dead * rate), 0, 100)
                                mode = fan_mode["normal_mode"]
                                break
                        else:
                            # More dead fans than any table entry covers.
                            pwmval = self.boost
                            mode = fan_mode["boost_mode"]
                        if not os.path.isfile(boost_record_path):
                            _touch_record(boost_record_path)
                    else:
                        if os.path.isfile(boost_record_path):
                            os.remove(boost_record_path)
                else:
                    if dead_fans:
                        # If not progressive ,when there is 1 fan failed, boost all fans
                        Logger.info(
                            "Failed fans: %s"
                            % (", ".join([str(i.label) for i in dead_fans]))
                        )
                        if self.board_fan_mode.is_scenario_supported("one_fan_failure"):
                            # user define
                            (
                                set_fan_mode,
                                set_fan_pwm,
                            ) = self.board_fan_mode.get_board_fan_mode("one_fan_failure")
                            # choose the higher PWM
                            pwmval = max(set_fan_pwm, pwmval)
                            if int(pwmval) == int(set_fan_pwm):
                                mode = set_fan_mode
                            else:
                                pwmval = zone.run(
                                    sensors=sensors_tuples, ctx=ctx, ignore_mode=False
                                )
                                mode = zone.get_set_fan_mode(mode, action="read")
                        else:
                            # choose the higher PWM
                            if self.output_max_boost_pwm:
                                pwmval = self.boost if pwmval < self.boost else pwmval
                            else:
                                pwmval = self.boost
                            mode = fan_mode["boost_mode"]
                        if not os.path.isfile(boost_record_path):
                            _touch_record(boost_record_path)
                    else:
                        if os.path.isfile(boost_record_path):
                            os.remove(boost_record_path)

                if self.fan_dead_boost:
                    # If all the fans failed take action after a few cycles
                    if len(dead_fans) == len(self.fans):
                        self.all_fan_fail_counter = self.all_fan_fail_counter + 1
                        Logger.warn(
                            "Currently all fans failed for {} cycles".format(
                                self.all_fan_fail_counter
                            )
                        )
                        if (
                            self.fan_dead_boost["threshold"]
                            and self.fan_dead_boost["action"]
                        ):
                            if (
                                self.all_fan_fail_counter
                                >= self.fan_dead_boost["threshold"]
                            ):
                                self.fsc_host_action(
                                    action=self.fan_dead_boost["action"],
                                    cause="All fans are bad for more than "
                                    + str(self.fan_dead_boost["threshold"])
                                    + " cycles",
                                )
                    else:
                        # If atleast 1 fan is working reset the counter
                        self.all_fan_fail_counter = 0

            if self.fan_limit_upper_pwm:
                if pwmval > self.fan_limit_upper_pwm:
                    pwmval = self.fan_limit_upper_pwm

            if self.fan_limit_lower_pwm:
                if pwmval < self.fan_limit_lower_pwm:
                    pwmval = self.fan_limit_lower_pwm

            # if no fan fail, the max of pwm is non_fanfail_limited_boost pwm:
            if self.non_fanfail_limited_boost and not dead_fans:
                pwmval = clamp(pwmval, 0, self.non_fanfail_limited_boost)

            # Never move by more than ramp_rate per cycle.
            if abs(zone.last_pwm - pwmval) > self.ramp_rate:
                if pwmval < zone.last_pwm:
                    pwmval = zone.last_pwm - self.ramp_rate
                else:
                    pwmval = zone.last_pwm + self.ramp_rate
            zone.last_pwm = pwmval

            # NOTE(review): a str also has __iter__ in Python 3, so a string
            # pwm_output would be iterated per character -- presumably the
            # configuration always provides a list or a number; confirm.
            if hasattr(zone.pwm_output, "__iter__"):
                for output in zone.pwm_output:
                    self.machine.set_pwm(self.fans.get(str(output)), pwmval)
            else:
                self.machine.set_pwm(self.fans[zone.pwm_output], pwmval)

            zone.get_set_fan_mode(mode, action="write")

    def builder(self):
        """
        Method to extract from json and build all internal data structures
        """
        # Build a bmc machine object - read/write sensors
        self.build_machine()
        # Extract everything from json
        self.get_config_params()
        self.build_fans()
        self.build_profiles()
        Logger.info("Available profiles: " + ", ".join(list(self.profiles.keys())))
        self.build_zones()
        Logger.info("Read %d zones" % (len(self.zones)))
        Logger.info("Including sensors from: " + ", ".join(self.machine.frus))

    def get_fan_power_status(self):
        """
        Method invokes board action to determine fan power status.
        If not applicable returns True.
        """
        if board_callout(callout="read_power"):
            return True
        return False

    def fail_record_dir(self):
        """
        Create directory to store which sensors and fans failed
        """
        for record_dir in (RECORD_DIR, SENSOR_FAIL_RECORD_DIR, FAN_FAIL_RECORD_DIR):
            if not os.path.isdir(record_dir):
                os.mkdir(record_dir)

    def run(self):
        """
        Main FSCD method that builds from the fscd config and runs

        Never returns: after the initial setup it loops forever, sleeping
        self.interval seconds between cycles.
        """
        # Get everything from json and build profiles, fans, zones
        self.builder()
        self.fail_record_dir()

        # Start every fan at the transitional PWM with the blue LED.
        self.machine.set_all_pwm(self.fans, self.transitional)
        self.fsc_set_all_fan_led(color="led_blue")

        mode = fan_mode["trans_mode"]
        self.zones[0].get_set_fan_mode(mode, action="write")

        last = time.time()
        dead_fans = set()

        if self.fanpower:
            time.sleep(30)
        while True:
            time.sleep(self.interval)
            if self.fanpower:
                if not self.get_fan_power_status():
                    # Fans are unpowered: skip the cycle and remember that
                    # they need time to spin up once power is back.
                    self.fan_recovery_pending = True
                    continue
            if self.fan_fail:
                if self.fan_recovery_pending and self.fan_recovery_time is not None:
                    # Accelerating, wait for a while
                    time.sleep(self.fan_recovery_time)
                    self.fan_recovery_pending = False
                # Get dead fans for determining speed
                dead_fans = self.update_dead_fans(dead_fans)

            now = time.time()
            time_difference = now - last
            last = now
            Logger.info("time_difference: %f" % (time_difference))

            # Check sensors and update zones
            self.update_zones(dead_fans, time_difference)


def handle_term(signum, frame):
    """
    Signal handler: put the fans in their initial state and exit.

    SIGQUIT additionally stops the watchdog timer before the device is
    released; the other signals only release it.
    """
    board_callout(callout="init_fans", boost=DEFAULT_INIT_TRANSITIONAL)
    Logger.warn("killed by signal %d" % (signum,))
    if signum == signal.SIGQUIT:
        fscd_release_watchdog(stop_wdt=True)
    else:
        fscd_release_watchdog()
    sys.exit("killed")


if __name__ == "__main__":
    try:
        signal.signal(signal.SIGTERM, handle_term)
        signal.signal(signal.SIGINT, handle_term)
        signal.signal(signal.SIGQUIT, handle_term)
        if len(sys.argv) > 1:
            llevel = sys.argv[1]
        else:
            llevel = "warning"
        fscd = Fscd(log_level=llevel)
        fscd.run()
    except Exception:
        # Top-level boundary: put the fans in their initial state, log the
        # traceback and release the watchdog before the process ends.
        board_callout(callout="init_fans", boost=DEFAULT_INIT_TRANSITIONAL)
        (etype, e) = sys.exc_info()[:2]
        Logger.crit("failed, exception: " + str(etype))
        traceback.print_exc()
        for line in traceback.format_exc().split("\n"):
            Logger.crit(line)
        fscd_release_watchdog()