in launcher/nemo/stages.py [0:0]
def _make_launch_docker_container_text(self):
    """
    Build the bash script that launches the docker container on a node.

    This is only called when running docker containers on a Slurm cluster.
    The generated script, in order:

      1. logs in to ECR when the image reference contains "amazonaws.com",
      2. collects EFA device flags when the instance type reports EFA devices,
      3. force-removes containers matching ``CONTAINER_NAME``,
      4. pulls the image and starts it detached running ``sleep infinity``,
      5. appends "Port 2022" to the ssh configs in the container and starts ssh,
      6. runs user-configured and model-specific post-launch commands.

    Returns:
        str: The script text, one entry per line, joined with newlines.

    Raises:
        IndexError: If the image looks like an ECR image but its registry host
            has fewer than four dot-separated components.
    """
    lines = ["#!/bin/bash", "set -ex"]
    image = self.cfg.container

    # Login ECR
    lines.append(f'echo "image is {image}"')
    if "amazonaws.com" in image:
        # format will be account.dkr.ecr.region.amazonaws.com/repo:tag
        registry = image.split("/")[0]
        region = registry.split(".")[3]
        lines.append("# Login ECR")
        lines.append(
            f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {registry}"
        )
    else:
        lines.append('echo "Not an ECR image, skipping ECR login"')
    lines.append("")

    # Handle EFA devices. Evaluated once; the result also decides whether the
    # device array is passed to `docker run` below.
    has_efa = get_num_efa_devices(self.instance_type) > 0
    if has_efa:
        lines.append("# Getting EFA devices")
        if allow_rdma(self.instance_type):
            lines.append('device=("--device=/dev/gdrdrv")')
        else:
            lines.append("device=()")
        lines.extend(
            [
                "while IFS= read -r -d '' d; do",
                ' device+=("--device=${d}")',
                'done < <(find "/dev/infiniband" -name "uverbs*" -print0)',
            ]
        )
        lines.append("")

    # Clean old containers. Plain concatenation (not an f-string) keeps the
    # literal `{{.ID}}` and `{}` braces intact for docker/xargs.
    lines.append("# Clean old containers")
    lines.append(
        "docker ps -a --filter 'name="
        + CONTAINER_NAME
        + "' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true"
    )
    lines.append(
        "docker ps -a --filter 'name=" + CONTAINER_NAME + "' --format '{{.ID}}' | xargs -I{} docker wait {} || true"
    )
    lines.append("")

    # Pull new container
    lines.append(f'docker pull "{image}"')

    # Docker run command
    lines.extend(
        [
            f"docker run --gpus {get_ntasks_per_node(self.stage_cfg)} \\",
            f' --privileged --rm -d --name "{CONTAINER_NAME}" \\',
            " --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \\",
            " --security-opt seccomp=unconfined \\",
        ]
    )
    if has_efa:
        lines.append(' "${device[@]}" \\')

    # Handle volume mounting
    mount_str = self._make_container_mounts_string()
    for mount in (mount_str or "").split(","):
        # An empty entry (empty mount string, stray or trailing comma) would
        # emit a bare "-v" that swallows the next token of the command line.
        if mount.strip():
            lines.append(f" -v {mount} \\")

    # Handle user run args and post run commands
    post_launch_commands = []
    if OmegaConf.select(self.cfg, "cluster.slurm_docker_cfg", default=None) is not None:
        docker_cfg = self.cfg.cluster.slurm_docker_cfg
        docker_args = docker_cfg.get("docker_args", None)
        if docker_args is not None:
            # Stringify so non-string config values do not break the join.
            user_args = [str(arg) for arg in docker_args]
            if user_args:
                lines.append(f" {' '.join(user_args)} \\")
        user_commands = docker_cfg.get("post_launch_commands", None)
        if user_commands is not None:
            post_launch_commands.extend(user_commands)

    # Model-specific transformers pins, appended after the user's commands.
    # The checks are independent, so more than one pin may be appended.
    if OmegaConf.select(self.cfg, "recipes.model.multi_modal", default=False):
        post_launch_commands.append("pip install transformers==4.45.2")
    model_type = OmegaConf.select(self.cfg, "recipes.model.model_type", default=None)
    if model_type == "deepseek_r1":
        post_launch_commands.append("pip install transformers==4.48.2")
    if model_type == "llama_v4":
        post_launch_commands.append("pip install transformers==4.51.3")

    # Final argument of `docker run`: the image and a command that keeps the
    # container alive so later `docker exec` calls have something to attach to.
    lines.append(f' "{image}" sleep infinity')
    lines.append("")

    # Allow containers to talk to each other
    # NOTE(review): these three execs are detached (-d), so `service ssh start`
    # may not be ordered after the config appends - confirm this is intended.
    lines.append("# Running post launching commands")
    lines.extend(
        [
            f'docker exec -itd "{CONTAINER_NAME}" bash -c "printf \\"Port 2022\\n\\" >> /etc/ssh/sshd_config"',
            f'docker exec -itd "{CONTAINER_NAME}" bash -c "printf \\" Port 2022\\n\\" >> /root/.ssh/config"',
            f'docker exec -itd "{CONTAINER_NAME}" bash -c "service ssh start"',
        ]
    )
    # Each command is wrapped in double quotes verbatim; commands containing
    # double quotes must already be escaped by whoever configured them.
    for cmd in post_launch_commands:
        lines.append(f'docker exec "{CONTAINER_NAME}" bash -c "{cmd}"')
    lines.append("")

    # Exit
    lines.append("exit 0")
    return "\n".join(lines)