def _make_launch_docker_container_text()

in launcher/nemo/stages.py [0:0]


    def _make_launch_docker_container_text(self):
        """
        Build the bash script that launches the container on all nodes.

        This will be called only when running docker container on Slurm cluster.

        The generated script performs, in order:
          1. ECR login when the image name contains "amazonaws.com".
          2. Collection of EFA ``--device`` flags when the instance type
             reports at least one EFA device.
          3. Removal of any existing container named ``CONTAINER_NAME``.
          4. ``docker pull`` followed by a detached ``docker run`` that keeps
             the container alive with ``sleep infinity``.
          5. SSH setup on port 2022 plus any post-launch commands, executed
             through ``docker exec``.

        Returns:
            str: The script text, one command per line, joined with newlines.
        """
        image = self.cfg.container
        script = ["#!/bin/bash", "set -ex"]

        # Login ECR
        script.append(f'echo "image is {image}"')
        if "amazonaws.com" in image:
            # format will be account.dkr.ecr.region.amazonaws.com/repo:tag
            registry = image.split("/")[0]
            region = registry.split(".")[3]
            script.extend(
                [
                    "# Login ECR",
                    f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {registry}",
                    "",
                ]
            )
        else:
            script.append('echo "Not an ECR image, skipping ECR login"')

        # Handle EFA devices. Evaluated once and reused below when the
        # "${device[@]}" array is spliced into the docker run command.
        has_efa = get_num_efa_devices(self.instance_type) > 0
        if has_efa:
            script.append("# Getting EFA devices")
            if allow_rdma(self.instance_type):
                script.append('device=("--device=/dev/gdrdrv")')
            else:
                script.append("device=()")
            script.extend(
                [
                    "while IFS= read -r -d '' d; do",
                    '  device+=("--device=${d}")',
                    'done < <(find "/dev/infiniband" -name "uverbs*" -print0)',
                    "",
                ]
            )

        # Clean old containers. Kept as plain (non f-) strings because
        # '{{.ID}}' is a docker Go-template and '{}' is the xargs placeholder.
        list_old_containers = "docker ps -a --filter 'name=" + CONTAINER_NAME + "' --format '{{.ID}}' | xargs -I{} "
        script.extend(
            [
                "# Clean old containers",
                list_old_containers + "docker rm -f {} > /dev/null 2>&1 || true",
                list_old_containers + "docker wait {} || true",
                "",
            ]
        )

        # Pull new container
        script.append(f'docker pull "{image}"')

        # Docker run command
        script.extend(
            [
                f"docker run --gpus {get_ntasks_per_node(self.stage_cfg)} \\",
                f'  --privileged --rm -d --name "{CONTAINER_NAME}" \\',
                "  --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \\",
                "  --security-opt seccomp=unconfined  \\",
            ]
        )
        if has_efa:
            script.append('  "${device[@]}" \\')

        # Handle volume mounting
        mount_str = self._make_container_mounts_string()
        for mount in (mount_str or "").split(","):
            # An empty entry (empty mount string, doubled or trailing comma)
            # would emit a bare "-v \" and make docker consume the next
            # argument as the volume spec, so skip it.
            if not mount.strip():
                continue
            script.append(f"  -v {mount} \\")

        # Handle user run args and post run commands
        post_launch_commands = []
        docker_cfg = OmegaConf.select(self.cfg, "cluster.slurm_docker_cfg", default=None)
        if docker_cfg is not None:
            docker_args = docker_cfg.get("docker_args", None)
            if docker_args is not None:
                docker_args = list(docker_args)
                if docker_args:
                    joined_args = " ".join(docker_args)
                    script.append(f"  {joined_args} \\")

            user_commands = docker_cfg.get("post_launch_commands", None)
            if user_commands is not None:
                post_launch_commands.extend(user_commands)

            # NOTE(review): these recipe-driven transformers pins are only
            # applied when cluster.slurm_docker_cfg is set, which matches the
            # existing behaviour - confirm whether they should also apply
            # when that section is absent.
            if OmegaConf.select(self.cfg, "recipes.model.multi_modal", default=False):
                post_launch_commands.append("pip install transformers==4.45.2")
            model_type = OmegaConf.select(self.cfg, "recipes.model.model_type", default=None)
            for pinned_type, version in (("deepseek_r1", "4.48.2"), ("llama_v4", "4.51.3")):
                if model_type == pinned_type:
                    post_launch_commands.append(f"pip install transformers=={version}")

        # The image and its keep-alive command terminate the docker run line.
        script.append(f'  "{image}" sleep infinity')
        script.append("")

        # Allow containers to talk to each other
        script.append("# Running post launching commands")
        script.extend(
            [
                f'docker exec -itd "{CONTAINER_NAME}" bash -c "printf \\"Port 2022\\n\\" >> /etc/ssh/sshd_config"',
                f'docker exec -itd "{CONTAINER_NAME}" bash -c "printf \\"  Port 2022\\n\\" >> /root/.ssh/config"',
                f'docker exec -itd "{CONTAINER_NAME}" bash -c "service ssh start"',
            ]
        )
        script.extend(f'docker exec "{CONTAINER_NAME}" bash -c "{cmd}"' for cmd in post_launch_commands)
        script.append("")

        # Exit
        script.append("exit 0")

        return "\n".join(script)