in horovod/runner/elastic/registration.py [0:0]
def _on_workers_recorded(self):
    """Decide whether the driver should stop or resume after workers are recorded.

    The checks run in order and the first one that matches ends the method:

    1. At least one worker is in the SUCCESS state -> stop the driver.
    2. The FAILURE count equals ``self._size`` -> stop the driver.
    3. Every host returned by ``self.get(FAILURE)`` is blacklisted through
       the host manager; if every host in ``self.get_recorded_slots()`` is
       then blacklisted -> stop the driver.
    4. A reset limit is configured and ``self._reset_count`` has reached it
       -> stop the driver with the reset-limit error message.

    If none of the above applies, the reset counter is incremented and the
    driver is asked to resume; any exception raised while doing so is
    logged and the driver is stopped instead.
    """
    logging.info('all {} workers recorded'.format(self.size()))

    # One successful process is enough to wind everything else down.
    if self.count(SUCCESS) > 0:
        success_message = 'success count == {} -> stop running'.format(self.count(SUCCESS))
        logging.info(success_message)
        self._driver.stop()
        return

    # Every process failed: there is nothing left to continue with.
    if self.count(FAILURE) == self._size:
        failure_message = 'failure count == {} -> stop running'.format(self._size)
        logging.error(failure_message)
        self._driver.stop()
        return

    # Blacklist the host of each failed (host, slot) pair.
    for failed_host, _failed_slot in self.get(FAILURE):
        self._host_manager.blacklist(failed_host)

    # Query the blacklist state of every recorded host. The list is built in
    # full on purpose so that each recorded host is checked exactly once.
    blacklist_flags = [
        self._host_manager.is_blacklisted(recorded_host)
        for recorded_host, _recorded_slot in self.get_recorded_slots()
    ]
    if all(blacklist_flags):
        # No usable host remains, so the job as a whole has failed.
        blacklisted_message = 'blacklisted slots count == {} -> stop running'.format(self._size)
        logging.error(blacklisted_message)
        self._driver.stop()
        return

    # Refuse to reset again once the configured number of resets is used up.
    limit = self._reset_limit
    if limit is not None and self._reset_count >= limit:
        limit_message = 'reset count {} has exceeded limit {} -> stop running'.format(
            self._reset_count, limit)
        logging.error(limit_message)
        self._driver.stop(
            error_message=constants.RESET_LIMIT_EXCEEDED_MESSAGE.format(limit))
        return

    # The counter is bumped before resuming, so a failed resume still counts.
    try:
        self._reset_count += 1
        self._driver.resume()
    except Exception:
        logging.exception('failed to activate new hosts -> stop running')
        self._driver.stop()