in meta-facebook/meta-cmm/recipes-core/fan-ctrl/fan-ctrl/fand.cpp [1422:1757]
/*
 * Fan daemon entry point.
 *
 * Parses the command-line overrides (-l/-m/-h fan levels, -b/-t temperature
 * bounds, -r temperature report interval, -v verbose), daemonizes, arms the
 * watchdog and then runs the control loop. Each pass of the loop, when this
 * CMM is the master:
 *   1. reads the critical/alarm temperatures and shuts the system down if the
 *      critical temperature exceeds GALAXY100_SYSTEM_LIMIT;
 *   2. picks a fan speed from the raising/falling PWM tables depending on
 *      whether the temperature rose or fell since the previous pass;
 *   3. checks every fan tray, tracks consecutive failures, drives the LEDs
 *      and overrides the speed while trays/fans are failed or absent;
 *   4. clears the fan CPLD heartbeat counter and kicks the watchdog.
 * A non-master CMM only kicks the watchdog every 5 seconds.
 *
 * The loop has no exit path of its own.
 */
int main(int argc, char **argv) {
  /* Sensor values */
  int critical_temp;
  int alarm_temp;
  int old_temp = GALAXY100_RAISING_TEMP_HIGH;
  int raising_pwm;
  int falling_pwm;
  struct galaxy100_fantray_info_stu_sysfs *info;
  int fan_speed = fan_medium;
  int failed_speed = 0;
  int bad_reads = 0;
  int fan_failure = 0;
  int fan_speed_changes = 0;
  int old_speed;
  int count = 0;
  /*
   * Zero-initialized here rather than only in the "is master" startup branch:
   * a CMM that becomes master after startup would otherwise read and
   * increment uninitialized counters in the fan check below.
   */
  int fan_bad[FANS] = {0};
  int fan;
  int sysfs_rc = 0;
  int sysfs_value = 0;
  unsigned log_count = 0; // How many times have we logged our temps?
  int opt;
  int report_arg;
  int prev_fans_bad = 0;

  // Initialize path cache
  init_path_cache();

  // Start writing to syslog as early as possible for diag purposes.
  openlog("fand", LOG_CONS, LOG_DAEMON);

  while ((opt = getopt(argc, argv, "l:m:h:b:t:r:v")) != -1) {
    switch (opt) {
      case 'l':
        fan_low = atoi(optarg);
        break;
      case 'm':
        fan_medium = atoi(optarg);
        break;
      case 'h':
        fan_high = atoi(optarg);
        break;
      case 'b':
        temp_bottom = INTERNAL_TEMPS(atoi(optarg));
        break;
      case 't':
        temp_top = INTERNAL_TEMPS(atoi(optarg));
        break;
      case 'r':
        /*
         * report_temp is used as a modulus in the main loop, so a zero
         * (including atoi() failing on non-numeric input) would be a
         * division by zero. Only accept positive intervals.
         */
        report_arg = atoi(optarg);
        if (report_arg > 0) {
          report_temp = report_arg;
        } else {
          fprintf(stderr,
                  "Report interval must be a positive integer; "
                  "ignoring \"%s\"\n",
                  optarg);
        }
        break;
      case 'v':
        verbose = true;
        break;
      default:
        usage();
        break;
    }
  }

  if (optind > argc) {
    usage();
  }

  if (temp_bottom > temp_top) {
    fprintf(stderr,
            "Should temp-bottom (%d) be higher than "
            "temp-top (%d)? Starting anyway.\n",
            EXTERNAL_TEMPS(temp_bottom),
            EXTERNAL_TEMPS(temp_top));
  }

  if (fan_low > fan_medium || fan_low > fan_high || fan_medium > fan_high) {
    fprintf(stderr,
            "fan RPMs not strictly increasing "
            "-- %d, %d, %d, starting anyway\n",
            fan_low,
            fan_medium,
            fan_high);
  }

  /* Best effort: keep running in the foreground if detaching fails. */
  if (daemon(1, 0) != 0) {
    syslog(LOG_WARNING, "daemon() failed: %m");
  }

  if (verbose) {
    syslog(LOG_DEBUG, "Starting up; system should have %d fans.",
           total_fans);
  }

  sysfs_rc = read_sysfs_int("/sys/bus/i2c/drivers/cmmcpld/13-003e/slotid",
                            &sysfs_value);
  if ((sysfs_rc == 0) && (sysfs_value == 0x1)) {
    // on CMM1
    galaxy100_chanel_reinit_sysfs();
  }

  if (galaxy100_cmm_is_master() == 1) {
    for (fan = 0; fan < total_fans; fan++) {
      write_fan_speed(fan + fan_offset, fan_speed);
      write_fan_led(fan + fan_offset, FAN_LED_BLUE);
    }
  }

  /* Start watchdog in manual mode */
  open_watchdog(0, 0);

  /* Set watchdog to persistent mode so timer expiry will happen independent
   * of this process's liveliness. */
  watchdog_disable_magic_close();

  /* We pet the watchdog here once, so that fan cpld will stop
   * forcing the fan speed to be 50% - this needs to be done before
   * we set fan speed to appropriate values for the first time
   * in the very first pass of the while loop below.
   */
  clear_fancpld_watchdog_timer();

  sleep(5); /* Give the fans time to come up to speed */

  while (1) {
    old_speed = fan_speed;

    /* Only the master CMM drives the fans; others just keep the watchdog fed. */
    if (galaxy100_cmm_is_master() != 1) {
      sleep(5);
      kick_watchdog();
      continue;
    }

    usleep(11000);

    /* Read sensors */
    critical_temp = read_critical_max_temp();
    alarm_temp = read_alarm_max_temp();
    if (critical_temp == BAD_TEMP || alarm_temp == BAD_TEMP) {
      bad_reads++;
    }

    if (log_count++ % report_temp == 0) {
      syslog(LOG_DEBUG,
             "critical temp %d, alarm temp %d, fan speed %d, speed changes %d",
             critical_temp, alarm_temp, fan_speed, fan_speed_changes);
    }

    /* Protection heuristics */
    if (critical_temp > GALAXY100_SYSTEM_LIMIT) {
      system_shutdown("Critical temp limit reached");
    }

    /*
     * Calculate change needed -- we should eventually
     * do something more sophisticated, like PID.
     *
     * We should use the intake temperature to adjust this
     * as well.
     */

    /* Other systems use a simpler built-in table to determine fan speed. */
    raising_pwm = calculate_raising_fan_pwm(critical_temp);
    falling_pwm = calculate_falling_fan_pwm(critical_temp);
    if (old_temp <= critical_temp) {
      /* Temperature is rising (or flat): only ever speed up. */
      if (raising_pwm >= fan_speed) {
        fan_speed = raising_pwm;
      }
    } else {
      /* Temperature is falling: only ever slow down. */
      if (falling_pwm <= fan_speed) {
        fan_speed = falling_pwm;
      }
    }
    old_temp = critical_temp;

    /*
     * Update fans only if there are no failed ones. If any fans failed
     * earlier, all remaining fans should continue to run at max speed.
     */
    if (fan_failure == 0 && fan_speed != old_speed) {
      syslog(LOG_NOTICE,
             "critical temp %d, alarm temp %d, fan speed %d, speed changes %d",
             critical_temp, alarm_temp, fan_speed, fan_speed_changes);
      syslog(LOG_NOTICE,
             "Fan speed changing from %d to %d",
             old_speed,
             fan_speed);
      fan_speed_changes++;
      for (fan = 0; fan < total_fans; fan++) {
        write_fan_speed(fan + fan_offset, fan_speed);
      }
    }

    /*
     * Wait for some change. Typical I2C temperature sensors
     * only provide a new value every second and a half, so
     * checking again more quickly than that is a waste.
     *
     * We also have to wait for the fan changes to take effect
     * before measuring them.
     */
    sleep(5);

    galaxy100_lc_present_detect();
    galaxy100_scm_present_detect();

    /* Check fan RPMs */
    for (fan = 0; fan < total_fans; fan++) {
      /*
       * Make sure that we're within some percentage
       * of the requested speed.
       */
      if (fan_speed_okay(fan + fan_offset, fan_speed, FAN_FAILURE_OFFSET)) {
        if (fan_bad[fan] >= FAN_FAILURE_THRESHOLD) {
          write_fan_led(fan + fan_offset, FAN_LED_BLUE);
          syslog(LOG_CRIT,
                 "Fan %d has recovered",
                 fan);
        }
        fan_bad[fan] = 0;
      } else {
        /*
         * Only count the bad reading when the tray or one of its fans is
         * actually flagged as failed or absent.
         */
        info = &galaxy100_fantray_info[fan];
        if (info->present == 0 ||
            info->fan1.failed == 1 || info->fan1.present == 0 ||
            info->fan2.failed == 1 || info->fan2.present == 0 ||
            info->fan3.failed == 1 || info->fan3.present == 0) {
          fan_bad[fan]++;
        }
      }
    }

    fan_failure = 0;
    for (fan = 0; fan < total_fans; fan++) {
      if (fan_bad[fan] >= FAN_FAILURE_THRESHOLD) {
        fan_failure++;
        write_fan_led(fan + fan_offset, FAN_LED_RED);
      }
    }

    if (fan_failure > 0) {
      int not_present = 0;
      int fan_failed = 0;

      if (prev_fans_bad != fan_failure) {
        syslog(LOG_CRIT, "%d fans failed", fan_failure);
      }

      /*
       * If fans are bad, we need to blast all of the
       * fans at 100%; we don't bother to turn off
       * the bad fans, in case they are all that is left.
       *
       * Note that we have a temporary bug with setting fans to
       * 100% so we only do fan_max = 99%.
       */
      for (fan = 0; fan < total_fans; fan++) {
        info = &galaxy100_fantray_info[fan];
        if (info->present == 0) {
          not_present++;
          // Make sure that all fans on a FAB is marked as absent if that
          // fab itself is absent. Thus, one FAB down is equal to
          // three FANs down
          info->fan1.present = 0;
          info->fan2.present = 0;
          info->fan3.present = 0;
        }
        if (info->fan1.failed == 1 || info->fan1.present == 0) {
          fan_failed++;
        }
        if (info->fan2.failed == 1 || info->fan2.present == 0) {
          fan_failed++;
        }
        if (info->fan3.failed == 1 || info->fan3.present == 0) {
          fan_failed++;
        }
      }

      /* Count consecutive passes with six or more failed/absent fans. */
      if (fan_failed >= 6) {
        count++;
      } else {
        count = 0;
      }
      if (count >= FAN_SHUTDOWN_THRESHOLD) {
        system_shutdown("two FCB failed more than 5 mins");
      }

      fan_failed += not_present * 3;
      if (fan_failed > 0 && fan_failed <= 3) {
        if (fan_speed == GALAXY100_FAN_LOW) {
          failed_speed = galaxy100_fan_failed_control[fan_failed - 1].low_level;
        } else if (fan_speed == GALAXY100_FAN_MEDIUM) {
          failed_speed = galaxy100_fan_failed_control[fan_failed - 1].mid_level;
        } else if (fan_speed == GALAXY100_FAN_HIGH) {
          failed_speed = galaxy100_fan_failed_control[fan_failed - 1].high_level;
        } else if (fan_speed == GALAXY100_FAN_MAX) {
          failed_speed = galaxy100_fan_failed_control[fan_failed - 1].alarm_level;
        } else {
          /*
           * fan_speed is not one of the levels the failure table knows about
           * (e.g. fan_medium overridden with -m). Never reuse a stale or
           * still-zero failed_speed while fans are failing; go to maximum.
           */
          failed_speed = fan_max;
        }
      } else {
        failed_speed = fan_max;
      }

      for (fan = 0; fan < total_fans; fan++) {
        write_fan_speed(fan + fan_offset, failed_speed);
      }

      /*
       * Fans can be hot swapped and replaced; in which case the fan daemon
       * will automatically detect the new fan and (assuming the new fan isn't
       * itself faulty), automatically readjust the speeds for all fans down
       * to a more suitable rpm. The fan daemon does not need to be restarted.
       */
    } else if (prev_fans_bad != 0 && fan_failure == 0) {
      /* All fans recovered: restart the speed selection from medium. */
      old_temp = GALAXY100_RAISING_TEMP_HIGH;
      fan_speed = fan_medium;
      for (fan = 0; fan < total_fans; fan++) {
        write_fan_speed(fan + fan_offset, fan_speed);
      }
    }

    /* Suppress multiple warnings for similar number of fan failures. */
    prev_fans_bad = fan_failure;

    /*
     * Make a best effort to clear the CMM_BMC_HEARTBEAT counter in every
     * fan CPLD. The watchdog timer expires in 500 seconds if we don't clear
     * the counter.
     */
    clear_fancpld_watchdog_timer();

    /* if everything is fine, restart the watchdog countdown. If this process
     * is terminated, the persistent watchdog setting will cause the system
     * to reboot after the watchdog timeout. */
    kick_watchdog();
  }
}