def Run()

in perfkitbenchmarker/linux_benchmarks/mlperf_multiworkers_benchmark.py


def Run(benchmark_spec):
  """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vms = benchmark_spec.vms
  master_vm = vms[0]
  benchmark = benchmark_spec.benchmark

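  # Environment expected by NVIDIA's MLPerf launch scripts: reuse the PKB run
  # URI as the job id, skip re-pulling the container image (PULL=0), run a
  # single experiment (NEXP=1), and write logs under the VM temp directory.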
  env_params = {}
  env_params['SLURM_JOB_ID'] = FLAGS.run_uri
  env_params['PULL'] = 0
  env_params['DGXSYSTEM'] = DGXSYSTEM
  env_params['NEXP'] = 1
  env_params['LOGDIR'] = posixpath.join(vm_util.VM_TMP_DIR, benchmark)

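  # Locate the benchmark implementation inside NVIDIA's training_results
  # repository: Mask R-CNN lives in a 'maskrcnn' directory, and ResNet is
  # implemented in MXNet while the other benchmarks use PyTorch.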
  script_path = (
      '$HOME/training_results_{version}/NVIDIA/benchmarks/{model}'
      r'/implementations/{framework}'.format(
          version=FLAGS.mlperf_training_version,
          model='maskrcnn' if mlperf_benchmark.MASK in benchmark else benchmark,
          framework='mxnet'
          if mlperf_benchmark.RESNET in benchmark
          else 'pytorch',
      )
  )

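  # Per-benchmark container image and host data directory; Mask R-CNN,
  # ResNet, and BERT need no settings beyond the common ones above.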
  benchmark_env_params = {
      mlperf_benchmark.TRANSFORMER: {
          'CONT': r'"mlperf-nvidia:translation"',
          'DATADIR': r'/data/wmt/utf8',
      },
      mlperf_benchmark.SSD: {
          'CONT': r'"mlperf-nvidia:single_stage_detector"',
          'DATADIR': '/data',
      },
      mlperf_benchmark.GNMT: {
          'CONT': r'"mlperf-nvidia:rnn_translator"',
          'DATADIR': r'/data/gnmt',
      },
      mlperf_benchmark.MASK: {},
      mlperf_benchmark.RESNET: {},
      mlperf_benchmark.BERT: {},
  }
  env_params.update(benchmark_env_params.get(benchmark, {}))
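  # The ResNet scripts additionally read the node count from SLURM.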
  if mlperf_benchmark.RESNET in benchmark:
    env_params['SLURM_JOB_NUM_NODES'] = benchmark_spec.num_vms

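  # Pick up GPU-related environment variables (CUDA library paths, etc.)
  # when the cluster has NVIDIA GPUs.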
  env = ''
  if nvidia_driver.CheckNvidiaGpuExists(master_vm):
    env = tensorflow.GetEnvironmentVars(master_vm)

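  # Launch the docker wrapper once per node (-N 1) over MPI. Point-to-point
  # transport is restricted to TCP, with docker0 and loopback excluded so
  # ranks communicate over the cluster network; --bind-to none leaves CPU
  # affinity to the container. Illustrative expansion (actual values depend
  # on flags):
  #   cd $HOME/training_results_<version>/NVIDIA/benchmarks/<model>/... &&
  #   <env> SLURM_JOB_ID=<run_uri> PULL=0 ... mpirun --allow-run-as-root \
  #     -hostfile $HOME/HOSTFILE ... -N 1 ./run_with_docker1.sh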
  cmd = (
      f'cd {script_path} && '
      f'{env} {_DictToString(env_params)} '
      f'{FLAGS.nccl_mpi} '
      '--allow-run-as-root '
      '-hostfile $HOME/HOSTFILE '
      '--mca pml ^cm '
      '--mca btl tcp,self '
      '--mca btl_tcp_if_exclude docker0,lo '
      '--bind-to none '
      '-N 1 '
      './run_with_docker1.sh'
  )
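  # When profiling with nvprof or keeping NCCL debug logs, also copy the
  # cmd* files from the VM temp directory into the log directory so they
  # are collected during aggregation below.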
  if (
      mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler
      or FLAGS.mlperf_keep_nccl_log
  ):
    cmd += r' && cp {tmpdir}/cmd* {logdir}'.format(
        tmpdir=vm_util.VM_TMP_DIR,
        logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark),
    )

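  # Execute the benchmark. Result samples are only parsed for plain runs;
  # profiled runs are used for traces rather than performance results.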
  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  stdout, _ = master_vm.RobustRemoteCommand(cmd)
  if mlperf_benchmark.NONE in FLAGS.mlperf_profiler:
    samples.extend(MakeSamplesFromOutput(metadata, stdout, model=benchmark))

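  # Aggregate logs: every node scp's its log directory back to the master
  # VM under /data/aggregated/<benchmark>.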
  if (
      mlperf_benchmark.NVPROF in FLAGS.mlperf_profiler
      or FLAGS.mlperf_keep_nccl_log
  ):
    master_vm.RemoteCommand(
        r'mkdir -p /data/aggregated/{model}'.format(model=benchmark)
    )
    master_vm.RemoteCommand(
        r'mpirun -hostfile $HOME/{hostfile} -N 1 scp -r {logdir} '
        r'{master_ip}:/data/aggregated/'.format(
            hostfile=HOSTFILE,
            logdir=posixpath.join(vm_util.VM_TMP_DIR, benchmark),
            master_ip=master_vm.internal_ip,
        )
    )

  return samples
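
The command assembled above depends on the module's _DictToString helper to
flatten env_params into shell-style KEY=value assignments. A minimal sketch of
that helper, assuming space-separated pairs and sorted-key ordering (the real
implementation lives elsewhere in mlperf_multiworkers_benchmark.py):


def _DictToString(dictionary):
  """Formats a dict as space-separated 'KEY=value' pairs for the shell.

  Sketch only; the ordering via sorted() is an assumption.
  """
  return ' '.join(
      f'{key}={value}' for key, value in sorted(dictionary.items())
  )


For example, _DictToString({'PULL': 0, 'NEXP': 1}) returns 'NEXP=1 PULL=0',
which the mpirun command line above prepends as environment assignments.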