def Run()

in perfkitbenchmarker/linux_benchmarks/resnet_benchmark.py [0:0]


def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
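  # With a TPU, run the Cloud TPU reference model under tpu/models/official/resnet;
  # otherwise run the TF official GPU/CPU implementation under models/official/r1/resnet.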
  if benchmark_spec.tpus:
    resnet_benchmark_script = 'resnet_main.py'
    resnet_benchmark_cmd = (
        '{env_cmd} && '
        'cd tpu/models && '
        'export PYTHONPATH=$(pwd) && '
        'cd official/resnet && '
        'python {script} '
        '--use_tpu={use_tpu} '
        '--data_dir={data_dir} '
        '--model_dir={model_dir} '
        '--resnet_depth={depth} '
        '--train_batch_size={train_batch_size} '
        '--eval_batch_size={eval_batch_size} '
        '--iterations_per_loop={iterations} '
        '--data_format={data_format} '
        '--precision={precision} '
        '--skip_host_call={skip_host_call} '
        '--num_train_images={num_train_images} '
        '--num_eval_images={num_eval_images}'.format(
            env_cmd=benchmark_spec.env_cmd,
            script=resnet_benchmark_script,
            use_tpu=bool(benchmark_spec.tpus),
            data_dir=benchmark_spec.data_dir,
            model_dir=benchmark_spec.model_dir,
            depth=benchmark_spec.depth,
            train_batch_size=benchmark_spec.train_batch_size,
            eval_batch_size=benchmark_spec.eval_batch_size,
            iterations=benchmark_spec.iterations,
            data_format=benchmark_spec.data_format,
            precision=benchmark_spec.precision,
            skip_host_call=benchmark_spec.skip_host_call,
            num_train_images=benchmark_spec.num_train_images,
            num_eval_images=benchmark_spec.num_eval_images,
        )
    )
  else:
    resnet_benchmark_script = 'imagenet_main.py'
    resnet_benchmark_cmd = (
        '{env_cmd} && '
        'cd models && '
        'export PYTHONPATH=$(pwd) && '
        'cd official/r1/resnet && '
        'python {script} '
        '--data_dir=/data/imagenet '
        '--model_dir={model_dir} '
        '--resnet_size={resnet_size} '
        '--batch_size={batch_size} '
        '--data_format={data_format}'.format(
            env_cmd=benchmark_spec.env_cmd,
            script=resnet_benchmark_script,
            model_dir=benchmark_spec.model_dir,
            resnet_size=benchmark_spec.depth,
            batch_size=benchmark_spec.train_batch_size,
            data_format=benchmark_spec.data_format,
        )
    )
    precision = benchmark_spec.precision
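    # The GPU/CPU path only ever passes --dtype=fp16 or fp32, so a bfloat16
    # precision request is run as fp16 here.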
    if precision == 'bfloat16':
      resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format(
          cmd=resnet_benchmark_cmd
      )
    else:
      resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format(
          cmd=resnet_benchmark_cmd
      )

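    # On GPU VMs, prefix the command with the TensorFlow environment variables
    # and spread the work across every detected GPU.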
    if nvidia_driver.CheckNvidiaGpuExists(vm):
      resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format(
          env=tensorflow.GetEnvironmentVars(vm),
          cmd=resnet_benchmark_cmd,
          num_gpus=nvidia_driver.QueryNumberOfGpus(vm),
      )

  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  elapsed_seconds = 0
  steps_per_eval = benchmark_spec.steps_per_eval
  train_steps = benchmark_spec.train_steps
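  # Alternate training and evaluation: train in chunks of steps_per_eval,
  # evaluating after each chunk when the mode requests it.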
  for step in range(
      steps_per_eval, train_steps + steps_per_eval, steps_per_eval
  ):
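    # The last chunk may overshoot, so clamp it to train_steps.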
    step = min(step, train_steps)
    resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format(
        cmd=resnet_benchmark_cmd, step=step
    )

    if benchmark_spec.mode in ('train', 'train_and_eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['train'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['train'].GetNumShards()
        )
        resnet_benchmark_train_cmd = (
            '{cmd} --tpu={tpu} --mode=train {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores
            )
        )
      else:
        resnet_benchmark_train_cmd = (
            '{cmd} --max_train_steps={max_train_steps} '
            '--train_epochs={train_epochs} --noeval_only'.format(
                cmd=resnet_benchmark_cmd,
                train_epochs=benchmark_spec.epochs_per_eval,
                max_train_steps=step,
            )
        )

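      # Only training time counts toward elapsed_seconds; evaluation time is
      # excluded.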
      start = time.time()
      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd)
      elapsed_seconds += time.time() - start
      samples.extend(
          mnist_benchmark.MakeSamplesFromTrainOutput(
              metadata, stdout + stderr, elapsed_seconds, step
          )
      )

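    # Evaluate on the eval TPU group, or locally with --eval_only on GPU/CPU,
    # and record samples at the current elapsed training time.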
    if benchmark_spec.mode in ('train_and_eval', 'eval'):
      if benchmark_spec.tpus:
        tpu = benchmark_spec.tpu_groups['eval'].GetName()
        num_cores = '--num_cores={}'.format(
            benchmark_spec.tpu_groups['eval'].GetNumShards()
        )
        resnet_benchmark_eval_cmd = (
            '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format(
                cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores
            )
        )
      else:
        resnet_benchmark_eval_cmd = '{cmd} --eval_only'.format(
            cmd=resnet_benchmark_cmd
        )

      stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd)
      samples.extend(
          MakeSamplesFromEvalOutput(
              metadata,
              stdout + stderr,
              elapsed_seconds,
              use_tpu=bool(benchmark_spec.tpus),
          )
      )
  return samples
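
For illustration only, a minimal sketch of the stepping logic used in the loop above, with hypothetical values for train_steps and steps_per_eval (these are not benchmark defaults):

train_steps = 1000
steps_per_eval = 400
for step in range(steps_per_eval, train_steps + steps_per_eval, steps_per_eval):
  step = min(step, train_steps)
  print(step)  # -> 400, 800, 1000; the final chunk is clamped to train_steps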