in src/sagemaker_training/mpi.py [0:0]
def _create_command(self):
    """Assemble the full ``mpirun`` invocation as a list of arguments.

    The list is built in this order: launcher and host/process layout,
    fixed ``-mca`` tuning and binding flags, exported environment
    variables (``-x``), the additional options parsed from the custom MPI
    options, EFA-specific exports selected by instance type, AWS
    credentials present in the current environment, the names in
    ``self._env_vars``, the arguments produced by
    ``_modelparallel_environment_command``, and finally whatever the
    parent class's ``_create_command`` returns.

    Returns:
        list: The command-line arguments, starting with ``"mpirun"``.
    """
    hosts = self._hosts
    host_count = len(hosts)
    per_host = self._processes_per_host
    # An explicit process count takes precedence over per-host * hosts.
    total_processes = self._num_processes or per_host * host_count

    # A host running a single process is listed by name only; otherwise
    # the per-host process count is attached as "host:count".
    if per_host != 1:
        host_specs = ["%s:%s" % (host, per_host) for host in hosts]
    else:
        host_specs = hosts

    logger.info(
        "Env Hosts: %s Hosts: %s process_per_hosts: %s num_processes: %s",
        hosts,
        host_specs,
        per_host,
        total_processes,
    )

    parsed = _parse_custom_mpi_options(self._custom_mpi_options)
    overridden_known_options, additional_options = parsed

    iface = self._network_interface_name
    logger.info("Network interface name: %s" % iface)

    # Launcher, host layout and output formatting.
    command = [
        "mpirun",
        "--host",
        ",".join(host_specs),
        "-np",
        str(total_processes),
        "--allow-run-as-root",
        "--display-map",
        "--tag-output",
    ]

    # MCA parameters that precede the binding/mapping flags.
    leading_mca = (
        ("btl_tcp_if_include", iface),
        ("oob_tcp_if_include", iface),
        ("plm_rsh_no_tree_spawn", "1"),
    )
    for key, value in leading_mca:
        command += ["-mca", key, value]

    command += ["-bind-to", "none", "-map-by", "slot"]

    # MCA parameters that follow the binding/mapping flags.
    trailing_mca = (
        ("pml", "ob1"),
        ("btl", "^openib"),
        ("orte_abort_on_non_zero_status", "1"),
        ("btl_vader_single_copy_mechanism", "none"),
    )
    for key, value in trailing_mca:
        command += ["-mca", key, value]

    # Environment exported to every rank. Entries without "=" name a
    # variable only, with no value given here.
    exported = (
        "NCCL_MIN_NRINGS=4",
        "NCCL_SOCKET_IFNAME=%s" % iface,
        "NCCL_DEBUG=%s" % overridden_known_options.NCCL_DEBUG,
        "LD_LIBRARY_PATH",
        "PATH",
        # NOTE(review): presumably the path of the gethostname shim to
        # preload in each rank -- confirm against the gethostname module.
        "LD_PRELOAD=%s" % getfile(gethostname),
    )
    for variable in exported:
        command += ["-x", variable]

    command.extend(additional_options)

    # EFA settings
    instance_type = self._instance_type
    if instance_type in SM_EFA_NCCL_INSTANCES:
        # Turn EFA on, and select the simple NCCL protocol to cope with
        # EFA's out-of-order data delivery.
        command += ["-x", "FI_PROVIDER=efa", "-x", "NCCL_PROTO=simple"]
    if instance_type in SM_EFA_RDMA_INSTANCES:
        # Use EFA's RDMA functionality for one-sided and two-sided transfer.
        command += ["-x", "FI_EFA_USE_DEVICE_RDMA=1"]

    # Forward only those AWS credentials that are actually set.
    credential_names = (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
    )
    for credential in credential_names:
        if credential in os.environ:
            command += ["-x", credential]

    for env_name in self._env_vars:
        command += ["-x", env_name]

    command.extend(_modelparallel_environment_command(instance_type))
    command.extend(super(MasterRunner, self)._create_command())
    return command