in common/sagemaker_rl/mpi_launcher.py [0:0]
def _build_mpi_command(self):
    """Build the ``mpirun`` command line that launches the MPI script.

    Reads ``self.env.hosts``, ``self.process_per_host``, ``self.instance_type``,
    ``self.env.network_interface_name`` and ``self.env.to_env_vars()``, and
    prints the resolved host/process layout and network interface name.

    Returns:
        str: One command-line string that starts with ``mpirun`` and ends
        with ``_MPI_SCRIPT``.
    """
    # Function-scope import so this method does not depend on the module's
    # import block already providing ``shlex``.
    import shlex

    num_hosts = len(self.env.hosts)
    num_processes = self.process_per_host * num_hosts
    # By default, use one process per GPU, or one process per node (if training with CPU).
    # With more than one process per host, each entry becomes "host:count".
    host_list = (
        self.env.hosts
        if self.process_per_host == 1
        else [host + ":{}".format(self.process_per_host) for host in self.env.hosts]
    )
    print(
        "Env Hosts: {} Hosts: {} process_per_hosts: {} num_processes: {}".format(
            self.env.hosts, host_list, self.process_per_host, num_processes
        )
    )

    credential_vars = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"]

    interface_name = self.env.network_interface_name
    if self.instance_type == "local":
        # Local mode overrides the environment-reported interface.
        interface_name = "eth0"
    print("network interface name:" + interface_name + " " + str(self.instance_type))

    # Each fragment carries its own leading space; the pieces are joined
    # once at the end instead of growing a string with repeated ``+=``.
    fragments = [
        "mpirun --host {}".format(",".join(host_list)),
        " -np {} ".format(num_processes),
        " --allow-run-as-root",
        " --display-map",
        " --tag-output",
        " -mca btl_tcp_if_include {}".format(interface_name),
        " -mca oob_tcp_if_include {}".format(interface_name),
        " -x NCCL_SOCKET_IFNAME={}".format(interface_name),
        " --mca plm_rsh_no_tree_spawn 1",
        " -mca orte_abort_on_non_zero_status 1",
        " -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO",
        " -x LD_LIBRARY_PATH -x PATH",
        " -x LD_PRELOAD={}".format(_CHANGE_HOSTNAME_LIBRARY),
    ]

    # Credentials are passed by name only (no "=value"), so their values
    # never appear in the generated command string.
    fragments.extend(" -x {}".format(v) for v in credential_vars if v in os.environ)

    # Quote every value so that quotes, spaces, ``$`` and similar characters
    # survive tokenization unchanged. Wrapping the value in bare double
    # quotes would corrupt any value that itself contains a double quote.
    # NOTE(review): assumes the caller tokenizes this string with a POSIX
    # shell or ``shlex.split`` -- confirm against the call site.
    for name, value in self.env.to_env_vars().items():
        fragments.append(" -x {}={}".format(name, shlex.quote(str(value))))

    fragments.append(" {}".format(_MPI_SCRIPT))
    return "".join(fragments)