in horovod/runner/driver/driver_service.py [0:0]
def get_common_interfaces(settings, all_host_names, remote_host_names=None, fn_cache=None):
    '''
    Determine which network interfaces the job should use across its hosts.

    :param settings: the object that contains the setting for running horovod
    :type settings: horovod.runner.common.util.settings.Settings
    :param all_host_names: names of every host taking part in the job
    :type all_host_names: list(string)
    :param remote_host_names: names of the remote hosts; when None it is
        computed from all_host_names via network.filter_local_addresses
    :type remote_host_names: list(string)
    :param fn_cache: Cache storing the results of checks performed by horovod
    :type fn_cache: horovod.runner.util.cache.Cache
    :return: one of
        - None when running under LSF (discovery is skipped),
        - settings.nics when remote hosts exist and nics were configured,
        - the result of the driver probe when remote hosts exist and no nics
          were configured,
        - the set of local interface names bound to 127.0.0.1 when every
          host is local.
    :raises ValueError: if all hosts are local and no interface carries
        the address 127.0.0.1
    '''
    # Interface discovery is skipped on LSF clusters because it considerably
    # slows down the job start.
    if lsf.LSFUtils.using_lsf():
        return None

    if remote_host_names is None:
        remote_host_names = network.filter_local_addresses(all_host_names)

    if len(remote_host_names) == 0:
        # Every host is local: pick the interfaces whose IPv4 address is
        # 127.0.0.1, restricted to settings.nics when that is set.
        if settings.verbose >= 2:
            print('All hosts are local, finding the interfaces '
                  'with address 127.0.0.1')

        loopback_ifaces = {
            name
            for name, addresses in net_if_addrs().items()
            if (not settings.nics or name in settings.nics)
            and any(entry.family == AF_INET and entry.address == '127.0.0.1'
                    for entry in addresses)
        }

        if not loopback_ifaces:
            raise ValueError('No interface is found for address 127.0.0.1.')

        if settings.verbose >= 2:
            print('Local interface found ' + ' '.join(loopback_ifaces))
        return loopback_ifaces

    if settings.nics:
        # Explicitly configured interfaces win. All the workers must have at
        # least one of those interfaces available.
        return settings.nics

    # Probe all hosts (remote and local) for the common, routed interfaces so
    # they can be handed to NCCL. The probe is expected to find at least one
    # interface; otherwise it raises an exception.
    if settings.verbose >= 2:
        print('Testing interfaces on all the hosts.')

    hosts_on_this_machine = set(all_host_names) - set(remote_host_names)
    shared_ifaces = _driver_fn(all_host_names, hosts_on_this_machine, settings,
                               fn_cache=fn_cache)

    if settings.verbose >= 2:
        print('Interfaces on all the hosts were successfully checked.')
        print('Common interface found: ' + ' '.join(shared_ifaces))
    return shared_ifaces