int main()

in meta-facebook/meta-cmm/recipes-core/fan-ctrl/fan-ctrl/fand.cpp [1422:1757]


int main(int argc, char **argv) {
  /* Sensor values */
  int critical_temp;
  int alarm_temp;
  int old_temp = GALAXY100_RAISING_TEMP_HIGH;
  int raising_pwm;
  int falling_pwm;
  struct galaxy100_fantray_info_stu_sysfs *info;
  int fan_speed = fan_medium;
  int failed_speed = 0;

  int bad_reads = 0;
  int fan_failure = 0;
  int fan_speed_changes = 0;
  int old_speed;
  int count = 0;

  int fan_bad[FANS];
  int fan;

  int sysfs_rc = 0;
  int sysfs_value = 0;

  unsigned log_count = 0; // How many times have we logged our temps?
  int opt;
  int prev_fans_bad = 0;

  struct sigaction sa;

  // Initialize path cache
  init_path_cache();

  // Start writing to syslog as early as possible for diag purposes.
  openlog("fand", LOG_CONS, LOG_DAEMON);

  while ((opt = getopt(argc, argv, "l:m:h:b:t:r:v")) != -1) {
    switch (opt) {
    case 'l':
      fan_low = atoi(optarg);
      break;
    case 'm':
      fan_medium = atoi(optarg);
      break;
    case 'h':
      fan_high = atoi(optarg);
      break;
    case 'b':
      temp_bottom = INTERNAL_TEMPS(atoi(optarg));
      break;
    case 't':
      temp_top = INTERNAL_TEMPS(atoi(optarg));
      break;
    case 'r':
      report_temp = atoi(optarg);
      break;
    case 'v':
      verbose = true;
      break;
    default:
      usage();
      break;
    }
  }

  if (optind > argc) {
    usage();
  }

  if (temp_bottom > temp_top) {
    fprintf(stderr,
            "Should temp-bottom (%d) be higher than "
            "temp-top (%d)?  Starting anyway.\n",
            EXTERNAL_TEMPS(temp_bottom),
            EXTERNAL_TEMPS(temp_top));
  }

  if (fan_low > fan_medium || fan_low > fan_high || fan_medium > fan_high) {
    fprintf(stderr,
            "fan RPMs not strictly increasing "
            "-- %d, %d, %d, starting anyway\n",
            fan_low,
            fan_medium,
            fan_high);
  }

  daemon(1, 0);

  if (verbose) {
    syslog(LOG_DEBUG, "Starting up;  system should have %d fans.",
           total_fans);
  }

  sysfs_rc = read_sysfs_int("/sys/bus/i2c/drivers/cmmcpld/13-003e/slotid",
                             &sysfs_value);
  if ((sysfs_rc == 0) &&(sysfs_value == 0x1)) {
    // on CMM1
    galaxy100_chanel_reinit_sysfs();
  }
  if(galaxy100_cmm_is_master() == 1) {
    for (fan = 0; fan < total_fans; fan++) {
      fan_bad[fan] = 0;
      write_fan_speed(fan + fan_offset, fan_speed);
      write_fan_led(fan + fan_offset, FAN_LED_BLUE);
    }
  }

  /* Start watchdog in manual mode */
  open_watchdog(0, 0);

  /* Set watchdog to persistent mode so timer expiry will happen independent
   * of this process's liveliness. */
  watchdog_disable_magic_close();

  /* We pet the watchdog here once, so that fan cpld will stop
   * forcing the fan speed to be 50% - this needs to be done before
   * we set fan speed to appropriate values for the first time
   * in the very first pass of the while loop below.
   */
  clear_fancpld_watchdog_timer();

  sleep(5);  /* Give the fans time to come up to speed */

  while (1) {
    int max_temp;
    old_speed = fan_speed;

    /*if it is master, then run next*/
    if(galaxy100_cmm_is_master() != 1) {
      sleep(5);
      kick_watchdog();
      continue;
    }

    usleep(11000);
    /* Read sensors */
    critical_temp = read_critical_max_temp();
    alarm_temp = read_alarm_max_temp();

    if ((critical_temp == BAD_TEMP || alarm_temp == BAD_TEMP)) {
      bad_reads++;
    }

    if (log_count++ % report_temp == 0) {
      syslog(LOG_DEBUG,
             "critical temp %d, alarm temp %d, fan speed %d, speed changes %d",
             critical_temp, alarm_temp, fan_speed, fan_speed_changes);
    }

    /* Protection heuristics */
    if(critical_temp > GALAXY100_SYSTEM_LIMIT) {
      system_shutdown("Critical temp limit reached");
    }

    /*
     * Calculate change needed -- we should eventually
     * do something more sophisticated, like PID.
     *
     * We should use the intake temperature to adjust this
     * as well.
     */

    /* Other systems use a simpler built-in table to determine fan speed. */
    raising_pwm = calculate_raising_fan_pwm(critical_temp);
    falling_pwm = calculate_falling_fan_pwm(critical_temp);
    if(old_temp <= critical_temp) {
      /*raising*/
      if(raising_pwm >= fan_speed) {
        fan_speed = raising_pwm;
      }
    } else {
      /*falling*/
      if(falling_pwm <= fan_speed ) {
        fan_speed = falling_pwm;
      }
    }
    old_temp = critical_temp;
    /*
     * Update fans only if there are no failed ones. If any fans failed
     * earlier, all remaining fans should continue to run at max speed.
     */
    if (fan_failure == 0 && fan_speed != old_speed) {
      syslog(LOG_NOTICE,
             "critical temp %d, alarm temp %d, fan speed %d, speed changes %d",
             critical_temp, alarm_temp, fan_speed, fan_speed_changes);
      syslog(LOG_NOTICE,
             "Fan speed changing from %d to %d",
             old_speed,
             fan_speed);
      fan_speed_changes++;
      for (fan = 0; fan < total_fans; fan++) {
        write_fan_speed(fan + fan_offset, fan_speed);
      }
    }
    /*
     * Wait for some change.  Typical I2C temperature sensors
     * only provide a new value every second and a half, so
     * checking again more quickly than that is a waste.
     *
     * We also have to wait for the fan changes to take effect
     * before measuring them.
     */

    sleep(5);
    galaxy100_lc_present_detect();
    galaxy100_scm_present_detect();

    /* Check fan RPMs */

    for (fan = 0; fan < total_fans; fan++) {
      /*
       * Make sure that we're within some percentage
       * of the requested speed.
       */
      if (fan_speed_okay(fan + fan_offset, fan_speed, FAN_FAILURE_OFFSET)) {
        if (fan_bad[fan] >= FAN_FAILURE_THRESHOLD) {
          write_fan_led(fan + fan_offset, FAN_LED_BLUE);
          syslog(LOG_CRIT,
                 "Fan %d has recovered",
                 fan);
        }
        fan_bad[fan] = 0;
      } else {
        info = &galaxy100_fantray_info[fan];
        if(info->present == 0 | info->fan1.failed == 1 | info->fan1.present == 0 |
           info->fan2.failed == 1 | info->fan2.present == 0 |
           info->fan3.failed == 1 | info->fan3.present == 0)
          fan_bad[fan]++;

      }
    }

    fan_failure = 0;
    for (fan = 0; fan < total_fans; fan++) {
      if (fan_bad[fan] >= FAN_FAILURE_THRESHOLD) {
        fan_failure++;
        write_fan_led(fan + fan_offset, FAN_LED_RED);
      }
    }
    if (fan_failure > 0) {
      if (prev_fans_bad != fan_failure) {
        syslog(LOG_CRIT, "%d fans failed", fan_failure);
      }

      /*
       * If fans are bad, we need to blast all of the
       * fans at 100%;  we don't bother to turn off
       * the bad fans, in case they are all that is left.
       *
       * Note that we have a temporary bug with setting fans to
       * 100% so we only do fan_max = 99%.
       */
      if (fan_failure > 0) {
        int not_present = 0;
        int fan_failed = 0;

        for (fan = 0; fan < total_fans; fan++) {
          info = &galaxy100_fantray_info[fan];
          if(info->present == 0) {
            not_present++;
            // Make sure that all fans on a FAB is marked as absent if that
            // fab itself is absent. Thus, one FAB down is equal to
            // three FANs down
            info->fan1.present = 0;
            info->fan2.present = 0;
            info->fan3.present = 0;
          }
          if(info->fan1.failed == 1 | info->fan1.present == 0)
            fan_failed++;
          if(info->fan2.failed == 1 | info->fan2.present == 0)
            fan_failed++;
          if(info->fan3.failed == 1 | info->fan3.present == 0)
            fan_failed++;
        }
        if(fan_failed >= 6)
          count++;
        else
          count = 0;
        if (count >= FAN_SHUTDOWN_THRESHOLD) {
          system_shutdown("two FCB failed more than 5 mins");
        }
#if 0
        if(fan_failed == 1) {
          fan_speed += GALAXY100_FAN_ONEFAILED_RAISE_PEC;
          if(fan_speed > fan_max)
            fan_speed = fan_max;
        } else
          fan_speed = fan_max;
#endif
        fan_failed += not_present * 3;
        if(fan_failed > 0 && fan_failed <= 3) {
          if(fan_speed == GALAXY100_FAN_LOW) {
            failed_speed = galaxy100_fan_failed_control[fan_failed - 1].low_level;
          } else if(fan_speed == GALAXY100_FAN_MEDIUM) {
            failed_speed = galaxy100_fan_failed_control[fan_failed - 1].mid_level;
          } else if(fan_speed == GALAXY100_FAN_HIGH) {
            failed_speed = galaxy100_fan_failed_control[fan_failed - 1].high_level;
          } else if(fan_speed == GALAXY100_FAN_MAX) {
            failed_speed = galaxy100_fan_failed_control[fan_failed - 1].alarm_level;
          }
        } else {
          failed_speed = fan_max;
        }
        for (fan = 0; fan < total_fans; fan++) {
          write_fan_speed(fan + fan_offset, failed_speed);
        }
      }

      /*
       * Fans can be hot swapped and replaced; in which case the fan daemon
       * will automatically detect the new fan and (assuming the new fan isn't
       * itself faulty), automatically readjust the speeds for all fans down
       * to a more suitable rpm. The fan daemon does not need to be restarted.
       */
    } else if(prev_fans_bad != 0 && fan_failure == 0){
      old_temp = GALAXY100_RAISING_TEMP_HIGH;
      fan_speed = fan_medium;
      for (fan = 0; fan < total_fans; fan++) {
        write_fan_speed(fan + fan_offset, fan_speed);
      }
    }
    /* Suppress multiple warnings for similar number of fan failures. */
    prev_fans_bad = fan_failure;

    /*
     * Do the best effort to CMM_BMC_HEARTBEAT counter in every fancplds.
     * The watchdog timer expires in 500 secondds, if we don't clear the
     * counter
     */
    clear_fancpld_watchdog_timer();

    /* if everything is fine, restart the watchdog countdown. If this process
     * is terminated, the persistent watchdog setting will cause the system
     * to reboot after the watchdog timeout. */
    kick_watchdog();
  }
}