in perfkitbenchmarker/linux_benchmarks/mlperf_multiworkers_benchmark.py [0:0]
def _UpdateScripts(benchmark_spec, node_rank):
  """Rewrites the MLPerf config and launch scripts on one worker VM.

  Collects sed substitution pairs for the config file, run_and_time.sh and
  run_with_docker.sh, lets the model-specific helper adjust them, and then
  writes the patched copies (CONFIG, run_and_time1.sh, run_with_docker1.sh)
  into the implementation directory. The Dockerfile in that directory is
  patched as well when FLAGS.nccl_net_plugin or FLAGS.aws_efa is set.

  Args:
    benchmark_spec: The benchmark specification.
    node_rank: int, The rank of the node for multi-node distributed training;
      also used as the index into benchmark_spec.vms.
  """
  vm = benchmark_spec.vms[node_rank]
  benchmark = benchmark_spec.benchmark
  num_vms = benchmark_spec.num_vms
  gpus_per_vm = benchmark_spec.gpus_per_vm
  visible_gpus = ','.join(str(gpu) for gpu in range(gpus_per_vm))

  # TODO(tohaowu) Change config and script using a patch file.
  # Each list below holds (pattern, replacement) pairs for sed: anything
  # matching the pattern is rewritten to the replacement in the file that the
  # list belongs to. The order of the pairs is the order sed applies them.
  dgx_system_pair = (r'DGXSYSTEM=.*', rf'DGXSYSTEM=\"{DGXSYSTEM}\"')

  # Substitutions for the config file.
  config_sed = [
      dgx_system_pair,
      (r'DGXNNODES=.*', r'DGXNNODES={num_vms}'.format(num_vms=num_vms)),
      (
          r'DGXNGPU=.*',
          rf'DGXNGPU={gpus_per_vm}\nexport'
          rf' CUDA_VISIBLE_DEVICES={visible_gpus}',
      ),
      (
          r'DGXNSOCKET=.*',
          r'DGXNSOCKET={nsockets}'.format(
              nsockets=vm.CheckLsCpu().socket_count
          ),
      ),
      (
          r'DGXSOCKETCORES=.*',
          r'DGXSOCKETCORES={ncores}'.format(
              ncores=vm.CheckLsCpu().cores_per_socket
          ),
      ),
  ]

  # Substitutions for run_and_time.sh.
  run_and_time_sed = [
      (r'run_training.sh', r'run_training1.sh'),
      dgx_system_pair,
  ]
  if FLAGS.mlperf_keep_nccl_log:
    # Add NCCL debug exports right after the shebang line.
    run_and_time_sed.append((
        r'#\!\/bin\/bash',
        (
            r'#\!\/bin\/bash\n'
            r'export NCCL_DEBUG=INFO\n'
            r'export NCCL_DEBUG_SUBSYS=ALL\n'
            r'export NCCL_DEBUG_FILE=\/results\/%h.%p.nccl'
        ),
    ))
  if FLAGS.nccl_extra_params:
    nccl_exports = _GetNcclParams()
  else:
    nccl_exports = r''
  run_and_time_sed.append((
      r'#!\/bin\/bash',
      r'#!\/bin\/bash\n' rf'{nccl_exports}',
  ))

  # Substitutions for run_with_docker.sh.
  run_sed = [
      (r'SYSLOGGING=1', r'SYSLOGGING=0'),
      (
          r'env [|] grep SLURM',
          r'export SLURM_NNODES={num_vms}'.format(num_vms=num_vms),
      ),
      (
          r'data -v \$LOGDIR',
          r'data -v \$(pwd):\/workspace\/{model}1 -v \$LOGDIR'.format(
              model=benchmark
          ),
      ),
      (
          r'scontrol show hostname',
          r'mpirun -hostfile \$HOME\/{hostfile} -N 1 hostname -I '
          r'\| awk \'{{print \$1}}\' '.format(hostfile=HOSTFILE),
      ),
      (
          r'srun --mem=0 -N 1 -n 1 -w \$hostn',
          r'mpirun -N 1 -n 1 -H \$hostn',
      ),
      (r'sleep 30', r'sleep 60'),
      (r'docker exec -it', r'docker exec -t'),
      (r'run_and_time.sh', r'run_and_time1.sh'),
  ]
  if FLAGS.aws_efa or FLAGS.azure_infiniband:
    # Pass every entry under /dev/infiniband to the container. Slashes are
    # escaped because the paths end up inside a sed replacement.
    listing, _ = vm.RemoteCommand('ls -d /dev/infiniband/*')
    escaped_paths = (path.replace('/', '\\/') for path in listing.split())
    device_args = ' '.join(f'--device={path}' for path in escaped_paths)
    run_sed.append(
        (r'nvidia-docker run', rf'nvidia-docker run {device_args}')
    )
  if FLAGS.azure_infiniband:
    run_sed.extend([
        (
            r'_cont_mounts=(',
            r'_cont_mounts=(\"--volume=\/opt\/microsoft:\/opt\/microsoft\" ',
        ),
        (
            r'^CONT_MOUNTS=\(.*\)$',
            r'CONT_MOUNTS=\"\1 --volume=\/opt\/microsoft:\/opt\/microsoft\"',
        ),
    ])

  nvprof_flags = r'-f -o \/results\/%h.%p.nvprof --profile-child-processes'

  # Directory on the VM that holds the scripts for this benchmark.
  if mlperf_benchmark.MASK in benchmark:
    model_dir = 'maskrcnn'
  else:
    model_dir = benchmark
  if mlperf_benchmark.RESNET in benchmark:
    framework_dir = 'mxnet'
  else:
    framework_dir = 'pytorch'
  script_path = (
      r'$HOME/training_results_{version}/NVIDIA/benchmarks/{model}'
      r'/implementations/{framework}'.format(
          version=FLAGS.mlperf_training_version,
          model=model_dir,
          framework=framework_dir,
      )
  )

  # Let the model-specific helper extend or replace the three sed lists. Some
  # models also read their config from different source files.
  sed_lists = (config_sed, run_sed, run_and_time_sed)
  config_files = [CONFIG]
  if mlperf_benchmark.TRANSFORMER in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForTransformer(
        benchmark_spec, vm, script_path, nvprof_flags, *sed_lists
    )
  elif mlperf_benchmark.SSD in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForSSD(
        benchmark_spec, nvprof_flags, *sed_lists
    )
  elif mlperf_benchmark.GNMT in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForGNMT(
        benchmark_spec, nvprof_flags, *sed_lists
    )
  elif mlperf_benchmark.MASK in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForMask(
        benchmark_spec, node_rank, script_path, nvprof_flags, *sed_lists
    )
    config_files = ['config_DGXA100_multi_8x8x2.sh']
  elif mlperf_benchmark.RESNET in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForResnet(
        benchmark_spec, node_rank, nvprof_flags, *sed_lists
    )
    config_files = ['config_DGXA100_common.sh', 'config_DGXA100_multi_8x8x*.sh']
    mlperf_benchmark.UpdateScriptForSmallGpuMem(vm)
  elif mlperf_benchmark.BERT in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForBert(
        benchmark_spec, node_rank, nvprof_flags, *sed_lists
    )
    config_files = ['config_DGXA100_common.sh', 'config_DGXA100_8x8x48x1.sh']

  # Write the patched config file.
  config_script = mlperf_benchmark.SedPairsToString(config_sed)
  config_inputs = ' '.join(config_files)
  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{config_script}" '
      f'{config_inputs} > {CONFIG} && '
      f'chmod 755 {CONFIG} '
  )

  # Write run_and_time1.sh; the second sed inserts a "source CONFIG" line at
  # line 2 of the generated script.
  run_and_time_script = mlperf_benchmark.SedPairsToString(run_and_time_sed)
  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{run_and_time_script}" '
      f'run_and_time.sh | sed "2 i source {CONFIG}" > run_and_time1.sh && '
      'chmod 755 run_and_time1.sh '
  )

  # Write run_with_docker1.sh the same way.
  run_script = mlperf_benchmark.SedPairsToString(run_sed)
  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{run_script}" run_with_docker.sh '
      f'| sed "2 i source {CONFIG}" > run_with_docker1.sh && '
      'chmod 755 run_with_docker1.sh'
  )

  docker_file = posixpath.join(script_path, 'Dockerfile')
  if FLAGS.nccl_net_plugin:
    # Register the google-fast-socket apt source ahead of the first
    # "apt-get update", then add the package to the install line.
    fast_socket_setup = (
        r'RUN echo \"deb https:\/\/packages.cloud.google.com\/apt '
        r'google-fast-socket main\" | '
        r'tee \/etc\/apt\/sources.list.d\/google-fast-socket.list\n'
        r'RUN curl -s -L '
        r'https:\/\/packages.cloud.google.com\/apt\/doc\/apt-key.gpg | '
        r'apt-key add -\n'
        r'RUN rm -f \/opt\/hpcx\/nccl_rdma_sharp_plugin\/lib\/libnccl-net.so\n'
        r'RUN apt-get update'
    )
    vm_util.ReplaceText(
        vm, 'RUN apt-get update', fast_socket_setup, docker_file
    )
    vm_util.ReplaceText(
        vm,
        'apt-get install -y --no-install-recommends',
        'apt-get install -y --no-install-recommends google-fast-socket',
        docker_file,
    )

  if FLAGS.aws_efa:
    # Append the EFA Dockerfile (at a pinned commit) to the benchmark's
    # Dockerfile, then edit the combined file.
    efa_commands = (
        f'git clone {AWS_EFA_NCCL_BASEAMI_PIPELINE_URL}',
        (
            'cd aws-efa-nccl-baseami-pipeline && git checkout'
            f' {NVIDIA_EFA_DOCKERFILE_COMMIT}'
        ),
        f'cat {NVIDIA_EFA_DOCKERFILE} >> {docker_file}',
        (
            'echo "RUN rm -rf /opt/hpcx/ompi && ln -s /opt/amazon/openmpi'
            f' /opt/hpcx/ompi" >> {docker_file}'
        ),
    )
    for command in efa_commands:
      vm.RemoteCommand(command)

    dockerfile_cleanups = (
        ('FROM nvcr.*', ''),
        ('yum-utils.*', ''),
        ('python3-distutils.*', 'python3-distutils'),
        ('cmake', ''),
    )
    for pattern, replacement in dockerfile_cleanups:
      vm_util.ReplaceText(vm, pattern, replacement, docker_file)