def _create_command()

in src/sagemaker_training/mpi.py [0:0]


    def _create_command(self):
        """Assemble the ``mpirun`` invocation for this runner.

        The argument list is built in this order: host/process selection,
        fixed launcher flags and MCA parameters, exported environment
        variables, user-supplied custom MPI options, EFA settings selected by
        instance type, AWS credentials that are present in ``os.environ``,
        the names in ``self._env_vars``, the model-parallel environment
        arguments, and finally whatever the parent class's
        ``_create_command`` returns.

        Returns:
            list: The complete command, starting with ``"mpirun"``.
        """
        slots = self._processes_per_host
        host_count = len(self._hosts)
        num_processes = self._num_processes or slots * host_count

        # By default, use one process per GPU, or one process per node (if training with CPU).
        # With a single process per host the bare host names are used as-is;
        # otherwise each entry becomes "<host>:<processes_per_host>".
        host_list = (
            self._hosts
            if slots == 1
            else ["%s:%s" % (host, slots) for host in self._hosts]
        )

        logger.info(
            "Env Hosts: %s Hosts: %s process_per_hosts: %s num_processes: %s",
            self._hosts,
            host_list,
            slots,
            num_processes,
        )

        overridden_known_options, additional_options = _parse_custom_mpi_options(
            self._custom_mpi_options
        )

        iface = self._network_interface_name
        logger.info("Network interface name: %s" % iface)

        # Host selection and process count.
        command = ["mpirun", "--host", ",".join(host_list), "-np", str(num_processes)]

        # Fixed launcher flags.
        command += ["--allow-run-as-root", "--display-map", "--tag-output"]

        # MCA parameters tied to the configured network interface.
        command += ["-mca", "btl_tcp_if_include", iface]
        command += ["-mca", "oob_tcp_if_include", iface]
        command += ["-mca", "plm_rsh_no_tree_spawn", "1"]

        # Process binding / mapping policy.
        command += ["-bind-to", "none", "-map-by", "slot"]

        # Remaining fixed MCA parameters.
        command += ["-mca", "pml", "ob1"]
        command += ["-mca", "btl", "^openib"]
        command += ["-mca", "orte_abort_on_non_zero_status", "1"]
        command += ["-mca", "btl_vader_single_copy_mechanism", "none"]

        def export(variables):
            # Append one "-x <variable>" pair per entry, preserving order.
            for variable in variables:
                command.extend(("-x", variable))

        export(
            [
                "NCCL_MIN_NRINGS=4",
                "NCCL_SOCKET_IFNAME=%s" % iface,
                "NCCL_DEBUG=%s" % overridden_known_options.NCCL_DEBUG,
                "LD_LIBRARY_PATH",
                "PATH",
                "LD_PRELOAD=%s" % getfile(gethostname),
            ]
        )

        # Custom MPI options that were not recognised as known overrides.
        command += additional_options

        # EFA settings
        if self._instance_type in SM_EFA_NCCL_INSTANCES:
            # Enable EFA use, and use the simple protocol to handle the
            # out-of-order data delivery from EFA.
            export(["FI_PROVIDER=efa", "NCCL_PROTO=simple"])

        if self._instance_type in SM_EFA_RDMA_INSTANCES:
            # Use EFA's RDMA functionality for one-sided and two-sided transfer
            export(["FI_EFA_USE_DEVICE_RDMA=1"])

        # Forward only the AWS credential variables that are actually set.
        credential_names = (
            "AWS_ACCESS_KEY_ID",
            "AWS_SECRET_ACCESS_KEY",
            "AWS_SESSION_TOKEN",
        )
        export([name for name in credential_names if name in os.environ])

        # Forward every variable name configured on this runner.
        export(self._env_vars)

        command += _modelparallel_environment_command(self._instance_type)

        # The parent class contributes the trailing part of the command.
        command += super(MasterRunner, self)._create_command()
        return command