in perfkitbenchmarker/linux_benchmarks/mlperf_multiworkers_benchmark.py
def Run(benchmark_spec):
"""Run MLPerf on the cluster.
Args:
benchmark_spec: The benchmark specification. Contains all data that is
required to run the benchmark.
Returns:
A list of sample.Sample objects.
"""
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vms = benchmark_spec.vms
  master_vm = vms[0]
  benchmark = benchmark_spec.benchmark
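
  # Environment variables consumed by NVIDIA's MLPerf run scripts:
  # SLURM_JOB_ID tags the run with the PKB run URI, PULL=0 skips re-pulling
  # the container image, DGXSYSTEM selects the per-system config, NEXP runs a
  # single experiment, and LOGDIR is where result logs land on each VM.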
  env_params = {}
  env_params['SLURM_JOB_ID'] = FLAGS.run_uri
  env_params['PULL'] = 0
  env_params['DGXSYSTEM'] = DGXSYSTEM
  env_params['NEXP'] = 1
  env_params['LOGDIR'] = posixpath.join(vm_util.VM_TMP_DIR, benchmark)
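
  # Path into the cloned training_results_<version> repo that holds NVIDIA's
  # reference implementation: Mask R-CNN lives under 'maskrcnn', ResNet uses
  # the MXNet implementation, and the remaining models use PyTorch.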
  script_path = (
      '$HOME/training_results_{version}/NVIDIA/benchmarks/{model}'
      '/implementations/{framework}'.format(
          version=FLAGS.mlperf_training_version,
          model='maskrcnn' if mlperf_benchmark.MASK in benchmark else benchmark,
          framework='mxnet'
          if mlperf_benchmark.RESNET in benchmark
          else 'pytorch',
      )
  )
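
  # Per-benchmark overrides: the Docker image (CONT) to run and the host data
  # directory (DATADIR) it mounts. maskrcnn, resnet, and bert take the
  # defaults baked into their run scripts, so their override dicts are empty.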
  benchmark_env_params = {
      mlperf_benchmark.TRANSFORMER: {
          'CONT': '"mlperf-nvidia:translation"',
          'DATADIR': '/data/wmt/utf8',
      },
      mlperf_benchmark.SSD: {
          'CONT': '"mlperf-nvidia:single_stage_detector"',
          'DATADIR': '/data',
      },
      mlperf_benchmark.GNMT: {
          'CONT': '"mlperf-nvidia:rnn_translator"',
          'DATADIR': '/data/gnmt',
      },
      mlperf_benchmark.MASK: {},
      mlperf_benchmark.RESNET: {},
      mlperf_benchmark.BERT: {},
  }
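
  # Fold the per-benchmark settings into the shared environment. The ResNet
  # scripts additionally read SLURM_JOB_NUM_NODES to size the multi-node run.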
  env_params.update(benchmark_env_params.get(benchmark, {}))
  if mlperf_benchmark.RESNET in benchmark:
    env_params['SLURM_JOB_NUM_NODES'] = benchmark_spec.num_vms
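
  # On GPU VMs, prefix the command with the CUDA-related environment exported
  # by the tensorflow package helper; non-GPU VMs need no extra environment.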
  env = ''
  if nvidia_driver.CheckNvidiaGpuExists(master_vm):
    env = tensorflow.GetEnvironmentVars(master_vm)
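
  # Launch one copy of the Docker wrapper script per host with mpirun (-N 1),
  # with core binding disabled and TCP traffic excluded from the docker0 and
  # loopback interfaces so workers talk over the cluster network.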
  cmd = (
      f'cd {script_path} && '
      f'{env} {_DictToString(env_params)} '
      f'{FLAGS.nccl_mpi} '
      '--allow-run-as-root '
      f'-hostfile $HOME/{HOSTFILE} '
      '--mca pml ^cm '
      '--mca btl tcp,self '
      '--mca btl_tcp_if_exclude docker0,lo '
      '--bind-to none '
      '-N 1 '
      './run_with_docker1.sh'
  )
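
  # When profiling with nvprof or preserving NCCL logs, also copy the
  # generated cmd* files into the benchmark's log directory for collection.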
  if (
      mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler
      or FLAGS.mlperf_keep_nccl_log
  ):
    cmd += ' && cp /tmp/pkb/cmd* {logdir}'.format(
        logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark)
    )
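
  # Run the whole pipeline from the master VM. MLPerf result lines in stdout
  # are parsed into samples only for plain (non-profiler) runs.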
  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  stdout, _ = master_vm.RobustRemoteCommand(cmd)
  if mlperf_benchmark.NONE in FLAGS.mlperf_profiler:
    samples.extend(MakeSamplesFromOutput(metadata, stdout, model=benchmark))
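
  # Aggregate per-worker logs: every host scps its log directory back to the
  # master's /data/aggregated/<benchmark> directory over the internal network.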
  if (
      mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler
      or FLAGS.mlperf_keep_nccl_log
  ):
    master_vm.RemoteCommand(
        'mkdir -p /data/aggregated/{model}'.format(model=benchmark)
    )
    master_vm.RemoteCommand(
        'mpirun -hostfile $HOME/{hostfile} -N 1 scp -r {logdir} '
        '{master_ip}:/data/aggregated/'.format(
            hostfile=HOSTFILE,
            logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark),
            master_ip=master_vm.internal_ip,
        )
    )

  return samples
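

# _DictToString is defined elsewhere in this module. A minimal sketch of the
# behavior the command line above relies on -- rendering env_params as
# space-separated KEY=value shell assignments -- assuming no values need shell
# quoting (hypothetical reconstruction, not the module's actual code):
def _DictToString(dictionary):
  """Formats a dict as space-separated 'key=value' shell assignments."""
  return ' '.join(
      f'{key}={value}' for key, value in sorted(dictionary.items())
  )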