def Run()

in perfkitbenchmarker/linux_benchmarks/mlperf_benchmark.py


def Run(benchmark_spec):
  """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
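    # On TPUs, run Google's reference implementation checked out under
    # $HOME/training_results_<version>.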
    # For MLPerf 1.0, the benchmark code differs across hardware types.
    if benchmark_spec.tpu_groups['train'].GetAcceleratorType() in (
        'v3-32',
        'v3-128',
        'v3-256',
        'v3-512',
        'v3-1024',
        'v3-2048',
    ):
      run_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'.format(
          version=VERSION.value,
          model=benchmark_spec.benchmark,
          tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(),
      )
      code_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'.format(
          version=VERSION.value,
          model=benchmark_spec.benchmark,
          tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(),
      )

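      # Map the PKB benchmark name to the model directory name used in the
      # reference code.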
      if MASK in benchmark_spec.benchmark:
        model = 'mask_rcnn'
      elif GNMT in benchmark_spec.benchmark:
        model = 'nmt'
      else:
        model = benchmark_spec.benchmark

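      # Build the command: put the implementation on PYTHONPATH and invoke the
      # reference run_and_time.sh script.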
      mlperf_benchmark_cmd = (
          'cd {code_path} && '
          'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
          'cd {model} && '
          '{run_path}/run_and_time.sh'.format(
              code_path=code_path, model=model, run_path=run_path
          )
      )

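      # SSD training starts from a pretrained ResNet checkpoint passed in via
      # the mlperf_gcs_resnet_checkpoint flag.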
      if SSD in benchmark_spec.benchmark:
        mlperf_benchmark_cmd = (
            'export MLP_GCS_RESNET_CHECKPOINT={checkpoint} && {cmd}'.format(
                checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint,
                cmd=mlperf_benchmark_cmd,
            )
        )
    else:
      raise ValueError(
          'No MLPerf configuration exists for this TPU type in PKB. PKB may '
          'need to be updated if this is a new TPU type.'
      )

  else:
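    # On GPU machines, run NVIDIA's reference implementations via their
    # run_with_docker.sh scripts.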
    run_sub_paths = {
        RESNET: 'resnet/implementations/mxnet',
        TRANSFORMER: 'transformer/implementations/pytorch',
        MINIGO: 'minigo/implementations/tensorflow',
        MASK: 'maskrcnn/implementations/pytorch',
        GNMT: 'gnmt/implementations/pytorch',
        SSD: 'ssd/implementations/pytorch',
        BERT: 'bert/implementations/pytorch',
    }
    benchmark_path = f'$HOME/training_results_{VERSION.value}/NVIDIA/benchmarks'
    run_path = posixpath.join(
        benchmark_path, run_sub_paths[benchmark_spec.benchmark]
    )
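    # Base environment for the NVIDIA run script: one experiment run, skip the
    # container image pull, and write logs under /tmp.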
    env = {
        'DGXSYSTEM': DGXSYSTEM,
        'NEXP': 1,
        'PULL': 0,
        'LOGDIR': f'/tmp/{benchmark_spec.benchmark}',
    }
    envs = {
        RESNET: {},
        TRANSFORMER: {'DATADIR': '/data/wmt/utf8'},
        MINIGO: {'CONT': 'mlperf-nvidia:minigo'},
        MASK: {},
        GNMT: {'DATADIR': '/data/gnmt'},
        SSD: {'DATADIR': '/data'},
        BERT: {},
    }
    env.update(envs[benchmark_spec.benchmark])

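    # Patch the vendor run script: disable syslog output and drop Docker's
    # interactive flag so the script can run unattended.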
    run_script = posixpath.join(run_path, 'run_with_docker.sh')
    vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script)
    vm_util.ReplaceText(vm, 'docker exec -it', 'docker exec -t', run_script)
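    # ResNet: call run_and_time.sh directly rather than through mpirun.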
    if benchmark_spec.benchmark == RESNET:
      vm_util.ReplaceText(
          vm, r'mpirun.*run_and_time\.sh', r'.\/run_and_time.sh', run_script
      )

    env = ' '.join(f'{key}={value}' for key, value in env.items())
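    # Prepend TensorFlow environment settings when NVIDIA GPUs are present.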
    if nvidia_driver.CheckNvidiaGpuExists(vm):
      env = f'{tensorflow.GetEnvironmentVars(vm)} {env}'

    mlperf_benchmark_cmd = (
        f'chmod 755 {run_script} && cd {run_path} && {env} {run_script}'
    )

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
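  # Run the benchmark on the VM; samples are parsed from stdout only when no
  # profiler is requested.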
  stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd)
  if NONE in FLAGS.mlperf_profiler:
    samples.extend(
        MakeSamplesFromOutput(
            metadata,
            stdout,
            use_tpu=bool(benchmark_spec.tpus),
            model=benchmark_spec.benchmark,
        )
    )
  return samples