in News Vendor/common/sagemaker_rl/mpi_launcher.py [0:0]
def _build_mpi_command(self):
    """Build the ``mpirun`` command line that launches the MPI script.

    Reads ``self.env.hosts``, ``self.process_per_host``,
    ``self.instance_type``, ``self.env.network_interface_name`` and
    ``self.env.to_env_vars()``. As a side effect, prints the resolved host
    layout and the selected network interface.

    Returns:
        str: The complete command as one string, ending with ``_MPI_SCRIPT``.
    """
    # Function-scope import so this method does not depend on the file's
    # top-level import block already providing ``shlex``.
    import shlex

    hosts = self.env.hosts
    num_processes = self.process_per_host * len(hosts)

    # By default, use one process per GPU, or one process per node (if
    # training with CPU). With more than one process per host, each entry
    # becomes "host:<process_per_host>".
    if self.process_per_host == 1:
        host_list = hosts
    else:
        slots_suffix = ':{}'.format(self.process_per_host)
        host_list = [host + slots_suffix for host in hosts]

    print("Env Hosts: {} Hosts: {} process_per_hosts: {} num_processes: {}".format(
        hosts, host_list, self.process_per_host, num_processes))

    # Forwarded by name only (``-x NAME``), so the secret values themselves
    # are never embedded in the command string.
    credential_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN']

    # The "local" instance type always uses eth0, regardless of what the
    # environment reports.
    if self.instance_type == "local":
        interface_name = "eth0"
    else:
        interface_name = self.env.network_interface_name

    print('network interface name:' + interface_name + " " + str(self.instance_type))

    # Every fragment carries its own leading space; they are concatenated
    # without a separator so the spacing matches the historical output.
    parts = [
        'mpirun --host {}'.format(",".join(host_list)),
        " -np {} ".format(num_processes),
        " --allow-run-as-root",
        " --display-map",
        " --tag-output",
        " -mca btl_tcp_if_include {}".format(interface_name),
        " -mca oob_tcp_if_include {}".format(interface_name),
        " -x NCCL_SOCKET_IFNAME={}".format(interface_name),
        " --mca plm_rsh_no_tree_spawn 1",
        " -mca orte_abort_on_non_zero_status 1",
        " -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO",
        " -x LD_LIBRARY_PATH -x PATH",
        " -x LD_PRELOAD={}".format(_CHANGE_HOSTNAME_LIBRARY),
    ]

    # Only forward credentials that are actually set in this process.
    parts.extend(" -x {}".format(var) for var in credential_vars if var in os.environ)

    # Quote each value so embedded double quotes, backslashes, ``$`` or
    # backticks survive tokenization intact. Previously the value was wrapped
    # in bare double quotes with no escaping.
    # NOTE(review): assumes the caller tokenizes this string with POSIX shell
    # rules (e.g. ``shlex.split`` or a shell) -- confirm against the caller.
    parts.extend(
        ' -x {}={}'.format(name, shlex.quote(str(value)))
        for name, value in self.env.to_env_vars().items()
    )

    parts.append(" {}".format(_MPI_SCRIPT))
    return "".join(parts)