scripts/run_in_cluster.py (31 lines of code) (raw):

#!/usr/bin/python3
"""
Usage [{filename}]:
    1. Change IMAGE_NAME from DUMMY to your real image name
    2. Change IPLIST from DUMMY to your real /path/to/iplist (absolute path)
    3. Change AICB_DIR from DUMMY to your real /path/to/aicb (absolute path)
    4. Change the settings in run_suites.py to select the workload you want
    5. Copy iplist and aicb to all participating servers at /path/to/iplist and /path/to/aicb, e.g., using `pscp` command like `pscp.pssh -h iplist iplist /path/to/iplist` and `pscp.pssh -h iplist -r aicb /path/to/aicb`
    6. Run simulation on all participating servers, e.g., using `pssh` command like `pssh -i -h /path/to/iplist -o out -e err -t 0 "cd /path/to/aicb && python scripts/run_in_cluster.py"`
"""
import subprocess
import os
import re
import sys

filename = os.path.basename(__file__)
# __doc__ is None when docstrings are stripped (python -OO), so only fill in
# the placeholder when there is a docstring to format.
if __doc__:
    __doc__ = __doc__.format(filename=filename)

# Port handed to every rank as MASTER_PORT.
DEFAULT_MASTER_PORT = 12345

# Matches IPv4 addresses in both ifconfig output styles:
#   "inet 10.0.0.5  netmask ..."        (current net-tools)
#   "inet addr:10.0.0.5  Bcast:..."     (older net-tools)
# The dots are escaped so that only dotted quads are accepted, and the
# lookahead lets an address at the very end of the output match too.
_INET_PATTERN = re.compile(r"inet (?:addr:)?(\d+\.\d+\.\d+\.\d+)(?=\s|$)")


def get_local_ip(ifconfig_output=None):
    """Return the IPv4 addresses found in ``ifconfig`` output.

    Args:
        ifconfig_output: Text to parse. When None (the default), the
            ``ifconfig`` command is run and its standard output is parsed.

    Returns:
        A list of address strings in the order they appear; empty when no
        address is found (including when ``ifconfig`` produced no output).
    """
    if ifconfig_output is None:
        # The context manager closes the pipe instead of leaking it.
        with os.popen("ifconfig") as pipe:
            ifconfig_output = pipe.read()
    return _INET_PATTERN.findall(ifconfig_output)


def get_world_id_list(filename):
    """Read the IP list file and return its entries, one per non-blank line.

    Surrounding whitespace is stripped from every entry and blank lines are
    skipped, so stray spaces in the file do not prevent a match against the
    local addresses.

    Args:
        filename: Path to the IP list file.

    Returns:
        A list of entry strings in file order; empty for an empty file.
    """
    with open(filename, "r") as f:
        stripped_lines = [line.strip() for line in f]
    return [entry for entry in stripped_lines if entry]


def get_docker_env_rank(filename, local_ips=None):
    """Work out this host's distributed-run settings from the IP list.

    Args:
        filename: Path to the IP list file.
        local_ips: Addresses of this host. When None (the default), they
            are discovered with ``get_local_ip()``.

    Returns:
        A tuple ``(world_size, rank, master_addr, master_port)`` where
        ``world_size`` is the number of listed entries, ``rank`` is the
        index of the first local address found in the list, ``master_addr``
        is the first entry of the list and ``master_port`` is
        ``DEFAULT_MASTER_PORT``. Returns ``(-1, -1, -1, -1)`` when none of
        the local addresses is listed.
    """
    ip_list = get_world_id_list(filename)
    if local_ips is None:
        local_ips = get_local_ip()
    for ip in local_ips:
        if ip in ip_list:
            return len(ip_list), ip_list.index(ip), ip_list[0], DEFAULT_MASTER_PORT
    return -1, -1, -1, -1


def build_docker_command(world_size, rank, master_addr, master_port, aicb_dir, image_name):
    """Build the ``docker run`` shell command that starts ``run_suites.py``.

    Args:
        world_size: Value exported as WORLD_SIZE inside the container.
        rank: Value exported as RANK inside the container.
        master_addr: Value exported as MASTER_ADDR inside the container.
        master_port: Value exported as MASTER_PORT inside the container.
        aicb_dir: Host directory mounted at ``/workspace/<basename>``.
        image_name: Docker image to run.

    Returns:
        The command as a single shell string.
    """
    # normpath drops a trailing slash, which would otherwise make basename
    # return an empty string and mount the directory over /workspace itself.
    aicb_dir_base = os.path.basename(os.path.normpath(aicb_dir))
    return " ".join(
        [
            "docker run --name aicb_test --gpus all --privileged",
            "--ulimit memlock=-1 --ulimit stack=67108864",
            "--init -i --shm-size=4g --network=host --rm",
            f"-e WORLD_SIZE={world_size}",
            f"-e RANK={rank}",
            f"-e MASTER_ADDR={master_addr}",
            f"-e MASTER_PORT={master_port}",
            f"-v {aicb_dir}:/workspace/{aicb_dir_base}",
            f"{image_name} /bin/sh -c 'cd /workspace/{aicb_dir_base} && pwd && python run_suites.py'",
        ]
    )


IPLIST = "DUMMY_IPLIST"  # Change it to /path/to/iplist, e.g., /root/iplist
AICB_DIR = "DUMMY_AICB_DIR"  # Change it to /path/to/aicb, e.g., /root/aicb
IMAGE_NAME = "DUMMY_IMAGE_NAME"  # Change it to your docker image name, e.g., nvcr.io/nvidia/pytorch:xx.xx-py3


def main():
    """Validate the configuration, launch the container and exit with its status."""
    if IPLIST == "DUMMY_IPLIST" or AICB_DIR == "DUMMY_AICB_DIR" or IMAGE_NAME == "DUMMY_IMAGE_NAME":
        sys.stderr.write(
            __doc__
            or f"Set IPLIST, AICB_DIR and IMAGE_NAME in {filename} before running it.\n"
        )
        sys.exit(1)

    world_size, rank, master_addr, master_port = get_docker_env_rank(IPLIST)
    if rank < 0:
        # Launching with RANK=-1 / MASTER_ADDR=-1 can never form a valid
        # group, so stop here with an explanation instead.
        sys.stderr.write(
            f"None of this host's IPv4 addresses is listed in {IPLIST}; "
            "not starting the container.\n"
        )
        sys.exit(1)

    # Change the settings in run_suites.py to select the workload you want
    command = build_docker_command(
        world_size, rank, master_addr, master_port, AICB_DIR, IMAGE_NAME
    )
    ret = subprocess.run(command, shell=True)
    print(ret)
    # Propagate the container's status so callers such as pssh see failures.
    sys.exit(ret.returncode)


if __name__ == "__main__":
    main()