in meta-facebook/meta-wedge/recipes-wedge/fan-ctrl/fan-ctrl/fand.cpp [1046:1442]
/*
 * fand entry point.
 *
 * Parses command-line overrides, picks and initializes the fan controller,
 * looks up the temperature sensors, daemonizes, and then loops forever:
 * read temperatures, apply the shutdown limits, step the fan speed between
 * low/medium/high, verify fan RPMs, and kick the watchdog.
 *
 * Options (see the getopt string below):
 *   -l <n>  fan_low        -m <n>  fan_medium      -h <n>  fan_high
 *   -b <t>  temp_bottom    -t <t>  temp_top   (converted via INTERNAL_TEMPS)
 *   -r <n>  report_temp: log temperatures once every n loop iterations
 *   -v      verbose logging
 *
 * Returns -1 if temp_sensors_init() fails. The control loop itself contains
 * no break or return, so the statements after it are not reached from the
 * loop as written.
 */
int main(int argc, char **argv) {
  int fan_speed = fan_high;
  int bad_reads = 0;
  int fan_failure = 0;
  int fan_speed_changes = 0;
  int old_speed;
  int fan_bad[MAX_FANS];
  int fan;
  unsigned log_count = 0; // How many times have we logged our temps?
  int opt;
  int prev_fans_bad = 0;
  struct sigaction sa;
  k_version_t k_version;
  int intake_temp;
  int exhaust_temp;
  int switch_temp;
  int userver_temp;
  /* Each entry ties a named sensor to the local variable that receives it. */
  temp_sensor_t temp_sensors[] = {
    {
      .name = "intake_temp",
      .sensor_data = &intake_temp,
      .dev_path = I2C_SYSFS_DEV_DIR(3-0048),
      .sensor_path = NULL,
    },
    {
      .name = "switch_temp",
      .sensor_data = &switch_temp,
      .dev_path = I2C_SYSFS_DEV_DIR(3-0049),
      .sensor_path = NULL,
    },
    {
      .name = "exhaust_temp",
      .sensor_data = &exhaust_temp,
      .dev_path = I2C_SYSFS_DEV_DIR(3-004a),
      .sensor_path = NULL,
    },
    {
      .name = "userver_temp",
      .sensor_data = &userver_temp,
      .dev_path = I2C_SYSFS_DEV_DIR(4-0040),
      .sensor_path = NULL,
    },
    /* make sure it's the last entry */
    {
      .name = NULL,
    },
  };

  /* Route SIGTERM, SIGINT and SIGUSR1 to fand_interrupt. */
  sa.sa_handler = fand_interrupt;
  sa.sa_flags = 0;
  sigemptyset(&sa.sa_mask);
  sigaction(SIGTERM, &sa, NULL);
  sigaction(SIGINT, &sa, NULL);
  sigaction(SIGUSR1, &sa, NULL);

  // Start writing to syslog as early as possible for diag purposes.
  openlog("fand", LOG_CONS, LOG_DAEMON);

  if (is_two_fan_board(false)) {
    /* Alternate, two fan configuration */
    total_fans = 2;
    fan_offset = 2; /* fan 3 is the first */
    fan_low = SIXPACK_FAN_LOW;
    fan_medium = SIXPACK_FAN_MEDIUM;
    fan_high = SIXPACK_FAN_HIGH;
    fan_max = SIXPACK_FAN_MAX;
    fan_speed = fan_high;
  }

  while ((opt = getopt(argc, argv, "l:m:h:b:t:r:v")) != -1) {
    switch (opt) {
      case 'l':
        fan_low = atoi(optarg);
        break;
      case 'm':
        fan_medium = atoi(optarg);
        break;
      case 'h':
        fan_high = atoi(optarg);
        break;
      case 'b':
        temp_bottom = INTERNAL_TEMPS(atoi(optarg));
        break;
      case 't':
        temp_top = INTERNAL_TEMPS(atoi(optarg));
        break;
      case 'r': {
        /*
         * report_temp is used as a modulus in the main loop, so a value of
         * zero (which atoi() also returns for non-numeric input) would be a
         * division by zero. Keep the current setting unless the argument is
         * a positive number.
         */
        int report_arg = atoi(optarg);
        if (report_arg > 0) {
          report_temp = report_arg;
        } else {
          fprintf(stderr,
                  "Ignoring invalid report interval '%s'; must be > 0\n",
                  optarg);
        }
        break;
      }
      case 'v':
        verbose = true;
        break;
      default:
        usage();
        break;
    }
  }

  if (optind > argc) {
    usage();
  }

  /*
   * -h may have changed fan_high after fan_speed was initialized from it.
   * Re-sync so the starting speed is the configured "high" value and the
   * speed state machine below (which compares fan_speed against fan_max,
   * fan_high and fan_medium) recognizes it.
   */
  fan_speed = fan_high;

  /* Suspicious settings are reported but do not prevent startup. */
  if (temp_bottom > temp_top) {
    fprintf(stderr,
            "Should temp-bottom (%d) be higher than "
            "temp-top (%d)? Starting anyway.\n",
            EXTERNAL_TEMPS(temp_bottom),
            EXTERNAL_TEMPS(temp_top));
  }
  LOG_VERBOSE("temperature settings: bottom=%d, top=%d",
              temp_bottom, temp_top);

  if (fan_low > fan_medium || fan_low > fan_high || fan_medium > fan_high) {
    fprintf(stderr,
            "fan RPMs not strictly increasing "
            "-- %d, %d, %d, starting anyway\n",
            fan_low,
            fan_medium,
            fan_high);
  }
  LOG_VERBOSE("fan speed settings: low=%d, medium=%d, high=%d",
              fan_low, fan_medium, fan_high);

  /*
   * Determine which fan controller to be used based on kernel version.
   * Kernels newer than 4.1.51 switch to fan_controller_5x; otherwise
   * active_fan_ctrl keeps the value it already had.
   */
  k_version = get_kernel_version();
  if (k_version > KERNEL_VERSION(4, 1, 51)) {
    active_fan_ctrl = &fan_controller_5x;
  }

  LOG_VERBOSE("Setting up %s", active_fan_ctrl->name);
  if (active_fan_ctrl->init != NULL) {
    if (active_fan_ctrl->init(active_fan_ctrl) != 0) {
      syslog(LOG_CRIT, "unable to initialize %s!", active_fan_ctrl->name);
      /*
       * fall through regardless of init results, because we wish fand
       * can still read temp sensors and shutdown the chassis in case temp
       * is too high.
       */
    }
  }

  /*
   * Look up sysfs path for temp sensors
   */
  if (temp_sensors_init(temp_sensors) < 0) {
    return -1;
  }

  /*
   * Detach from the terminal (keep the working directory, redirect stdio).
   * A failure used to be silently ignored; it is now logged, and the
   * process deliberately keeps running in the foreground so that fan
   * control and the thermal shutdown checks still operate.
   */
  if (daemon(1, 0) != 0) {
    syslog(LOG_WARNING, "daemon() failed: %m; continuing in foreground");
  }

  LOG_VERBOSE("Starting up; system should have %d fans.", total_fans);

  /* Bring every fan to the starting speed and mark it healthy. */
  for (fan = 0; fan < total_fans; fan++) {
    fan_bad[fan] = 0;
    write_fan_speed(fan + fan_offset, fan_speed);
    update_fan_led(fan + fan_offset, FAN_LED_BLUE);
  }

  /* Start watchdog in manual mode */
  open_watchdog(0, 0);

  /* Set watchdog to persistent mode so timer expiry will happen independent
   * of this process's liveliness. */
  watchdog_disable_magic_close();

  sleep(5); /* Give the fans time to come up to speed */

  while (1) {
    int max_temp;
    old_speed = fan_speed;

    LOG_VERBOSE("checking system temperature..");
    read_temp_sensors(temp_sensors);

    /*
     * uServer can be powered down, but all of the rest of the sensors
     * should be readable at any time.
     */
    /* TODO(vineelak) : Add userver_temp too , in case we fail to read temp */
    if ((intake_temp == BAD_TEMP || exhaust_temp == BAD_TEMP ||
         switch_temp == BAD_TEMP)) {
      bad_reads++;
    }

    /* bad_reads is cumulative: it is never reset after a good read. */
    if (bad_reads > BAD_READ_THRESHOLD) {
      server_shutdown("Some sensors couldn't be read");
    }

    /* Periodic report: once every report_temp iterations (always > 0). */
    if (log_count++ % report_temp == 0) {
      syslog(LOG_DEBUG,
             "Temp intake %d, switch %d, "
             " userver %d, exhaust %d, "
             "fan speed %d, speed changes %d",
             intake_temp,
             switch_temp,
             userver_temp,
             exhaust_temp,
             fan_speed,
             fan_speed_changes);
    }

    /* Protection heuristics */
    if (intake_temp > INTAKE_LIMIT) {
      server_shutdown("Intake temp limit reached");
    }

    if (switch_temp > SWITCH_LIMIT) {
      server_shutdown("T2 temp limit reached");
    }

    if (userver_temp + USERVER_TEMP_FUDGE > USERVER_LIMIT) {
      /* Log the full set of readings before shutting down. */
      syslog(LOG_DEBUG,
             "Temp intake %d, switch %d, "
             " userver %d, exhaust %d, "
             "fan speed %d, speed changes %d",
             intake_temp,
             switch_temp,
             userver_temp,
             exhaust_temp,
             fan_speed,
             fan_speed_changes);
      server_shutdown("uServer temp limit reached");
    }

    /*
     * Calculate change needed -- we should eventually
     * do something more sophisticated, like PID.
     *
     * We should use the intake temperature to adjust this
     * as well.
     */

    /* Other systems use a simpler built-in table to determine fan speed. */
    /* max_temp is the larger of switch_temp and the fudged userver_temp. */
    if (switch_temp > userver_temp + USERVER_TEMP_FUDGE) {
      max_temp = switch_temp;
    } else {
      max_temp = userver_temp + USERVER_TEMP_FUDGE;
    }

    LOG_VERBOSE("checking/adjusting fan speed..");

    /*
     * If recovering from a fan problem, spin down fans gradually in case
     * temperatures are still high. Gradual spin down also reduces wear on
     * the fans.
     *
     * The speed moves at most one step per iteration; COOLDOWN_SLOP is
     * added only on the way down, so stepping down needs a larger margin
     * than stepping up.
     */
    if (fan_speed == fan_max) {
      if (fan_failure == 0) {
        fan_speed = fan_high;
      }
    } else if (fan_speed == fan_high) {
      if (max_temp + COOLDOWN_SLOP < temp_top) {
        fan_speed = fan_medium;
      }
    } else if (fan_speed == fan_medium) {
      if (max_temp > temp_top) {
        fan_speed = fan_high;
      } else if (max_temp + COOLDOWN_SLOP < temp_bottom) {
        fan_speed = fan_low;
      }
    } else {/* low */
      if (max_temp > temp_bottom) {
        fan_speed = fan_medium;
      }
    }

    /*
     * Update fans only if there are no failed ones. If any fans failed
     * earlier, all remaining fans should continue to run at max speed.
     */
    if (fan_failure == 0 && fan_speed != old_speed) {
      syslog(LOG_NOTICE,
             "Fan speed changing from %d to %d",
             old_speed,
             fan_speed);
      fan_speed_changes++;
      for (fan = 0; fan < total_fans; fan++) {
        write_fan_speed(fan + fan_offset, fan_speed);
      }
    }

    /*
     * Wait for some change.  Typical I2C temperature sensors
     * only provide a new value every second and a half, so
     * checking again more quickly than that is a waste.
     *
     * We also have to wait for the fan changes to take effect
     * before measuring them.
     */
    sleep(5);

    /* Check fan RPMs */
    for (fan = 0; fan < total_fans; fan++) {
      /*
       * Make sure that we're within some percentage
       * of the requested speed.
       */
      if (fan_speed_okay(fan + fan_offset, fan_speed, FAN_FAILURE_OFFSET)) {
        /* Only announce recovery for a fan that had been declared failed. */
        if (fan_bad[fan] > FAN_FAILURE_THRESHOLD) {
          update_fan_led(fan + fan_offset, FAN_LED_BLUE);
          syslog(LOG_CRIT,
                 "Fan %d has recovered",
                 fan);
        }
        fan_bad[fan] = 0;
      } else {
        fan_bad[fan]++;
      }
    }

    /* Recount failed fans: those with more consecutive bad checks than
     * FAN_FAILURE_THRESHOLD. */
    fan_failure = 0;
    for (fan = 0; fan < total_fans; fan++) {
      if (fan_bad[fan] > FAN_FAILURE_THRESHOLD) {
        fan_failure++;
        update_fan_led(fan + fan_offset, FAN_LED_RED);
      }
    }

    if (fan_failure > 0) {
      if (prev_fans_bad != fan_failure) {
        syslog(LOG_CRIT, "%d fans failed", fan_failure);
      }

      /*
       * If fans are bad, we need to blast all of the
       * fans at 100%;  we don't bother to turn off
       * the bad fans, in case they are all that is left.
       *
       * Note that we have a temporary bug with setting fans to
       * 100% so we only do fan_max = 99%.
       */
      fan_speed = fan_max;
      for (fan = 0; fan < total_fans; fan++) {
        write_fan_speed(fan + fan_offset, fan_speed);
      }

      /*
       * On Wedge, we want to shut down everything if none of the fans
       * are visible, since there isn't automatic protection to shut
       * off the server or switch chip.  On other platforms, the CPUs
       * generating the heat will automatically turn off, so this is
       * unnecessary.
       */
      if (fan_failure == total_fans) {
        int count = 0;
        for (fan = 0; fan < total_fans; fan++) {
          if (fan_bad[fan] > FAN_SHUTDOWN_THRESHOLD) {
            count++;
          }
        }
        if (count == total_fans) {
          server_shutdown("all fans are bad for more than 12 cycles");
        }
      }

      /*
       * Fans can be hot swapped and replaced; in which case the fan daemon
       * will automatically detect the new fan and (assuming the new fan isn't
       * itself faulty), automatically readjust the speeds for all fans down
       * to a more suitable rpm. The fan daemon does not need to be restarted.
       */
    }

    /* Suppress multiple warnings for similar number of fan failures. */
    prev_fans_bad = fan_failure;

    /* if everything is fine, restart the watchdog countdown. If this process
     * is terminated, the persistent watchdog setting will cause the system
     * to reboot after the watchdog timeout. */
    LOG_VERBOSE("kicking watchdog");
    kick_watchdog();
  }

  /* Not reached from the loop above as written (no break); kept as the
   * teardown sequence should the loop ever gain an exit path. */
  temp_sensors_destroy(temp_sensors);
  if (active_fan_ctrl->cleanup != NULL) {
    active_fan_ctrl->cleanup(active_fan_ctrl);
  }

  return 0;
}