in horovod/runner/js_run.py [0:0]
def js_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs Horovod with jsrun.

    Assembles one shell command line for jsrun and then either runs it and
    waits for it (when settings.run_func_mode is truthy) or replaces the
    current process with it.

    Args:
        settings: Settings for running jsrun.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by jsrun.
        env: Environment dictionary to use for running jsrun.
             It is modified in place: NCCL_SOCKET_IFNAME is set from `nics`
             when `nics` is non-empty and the key is not already present.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: If no MPI implementation flags could be determined, or if
                   the jsrun command is not installed.
        RuntimeError: If settings.run_func_mode is set and jsrun exits with a
                      non-zero exit code.
    """
    mpi_impl_flags, _ = _get_mpi_implementation_flags(settings.tcp_flag, env=env)
    if mpi_impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    if not is_jsrun_installed():
        raise Exception(
            'horovod does not find the jsrun command.\n\n'
            'Please, make sure you are running on a cluster with jsrun installed or '
            'use one of the other launchers.')

    # Respect an NCCL_SOCKET_IFNAME the caller already put into env.
    if nics and 'NCCL_SOCKET_IFNAME' not in env:
        env['NCCL_SOCKET_IFNAME'] = ','.join(nics)

    smpiargs = ' '.join(mpi_impl_flags)
    if settings.extra_mpi_args:
        smpiargs += ' ' + settings.extra_mpi_args

    if settings.binding_args:
        # User-supplied binding arguments may hold several jsrun options, so
        # they are passed through verbatim instead of being quoted as one word.
        binding_args = settings.binding_args
    else:
        # The rankfile path ends up on a shell command line; quote it so that
        # whitespace or shell metacharacters in the path cannot split or alter
        # the command.
        rf = quote(str(generate_jsrun_rankfile(settings)))
        if settings.verbose >= 2:
            safe_shell_exec.execute('cat {rf}'.format(rf=rf))
        binding_args = '--erf_input {rf}'.format(rf=rf)

    if settings.output_filename:
        # Quoted for the same reason as the rankfile path: unquoted, a name
        # such as "my log" or ">out" would be reinterpreted by the shell.
        output_filename_arg = '--stdio_stderr {file} --stdio_stdout {file}'.format(
            file=quote(str(settings.output_filename)))
    else:
        output_filename_arg = ''

    smpiargs_arg = '--smpiargs {args}'.format(args=quote(smpiargs)) if smpiargs else ''

    jsrun_command = (
        'jsrun {binding_args} '
        '{output_filename_arg} '
        '{smpiargs} '
        '{command}'
        .format(binding_args=binding_args,
                output_filename_arg=output_filename_arg,
                smpiargs=smpiargs_arg,
                command=' '.join(quote(par) for par in command))
    )

    if settings.verbose >= 2:
        print(jsrun_command)

    # Execute the jsrun command.
    if settings.run_func_mode:
        exit_code = safe_shell_exec.execute(jsrun_command, env=env, stdout=stdout, stderr=stderr)
        if exit_code != 0:
            raise RuntimeError("jsrun failed with exit code {exit_code}".format(exit_code=exit_code))
    else:
        # Replaces the current process image; on success this call does not return.
        os.execve('/bin/sh', ['/bin/sh', '-c', jsrun_command], env)