def _on_workers_recorded()

in horovod/runner/elastic/registration.py [0:0]


    def _on_workers_recorded(self):
        logging.info('all {} workers recorded'.format(self.size()))

        # Check for success state, if any process succeeded, shutdown all other processes
        if self.count(SUCCESS) > 0:
            logging.info('success count == {} -> stop running'.format(self.count(SUCCESS)))
            self._driver.stop()
            return

        # Check that all processes failed, indicating that processing should stop
        if self.count(FAILURE) == self._size:
            logging.error('failure count == {} -> stop running'.format(self._size))
            self._driver.stop()
            return

        # Check for failures, and add them to the blacklisted hosts list
        failures = self.get(FAILURE)
        for host, slot in failures:
            self._host_manager.blacklist(host)

        # If every active host is blacklisted, then treat this as job failure
        if all([self._host_manager.is_blacklisted(host) for host, slot in self.get_recorded_slots()]):
            logging.error('blacklisted slots count == {} -> stop running'.format(self._size))
            self._driver.stop()
            return

        # Check that we have already reset the maximum number of allowed times
        if self._reset_limit is not None and self._reset_count >= self._reset_limit:
            logging.error('reset count {} has exceeded limit {} -> stop running'
                          .format(self._reset_count, self._reset_limit))
            self._driver.stop(error_message=constants.RESET_LIMIT_EXCEEDED_MESSAGE.format(self._reset_limit))
            return

        try:
            self._reset_count += 1
            self._driver.resume()
        except Exception:
            logging.exception('failed to activate new hosts -> stop running')
            self._driver.stop()