in perfkitbenchmarker/linux_benchmarks/resnet_benchmark.py [0:0]
def Run(benchmark_spec):
  """Run ResNet on the cluster.

  Assembles the shell command for either the TPU script (resnet_main.py) or
  the non-TPU script (imagenet_main.py), then executes it on the first VM once
  per evaluation interval, gathering samples from the train and/or eval output
  depending on benchmark_spec.mode.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  use_tpu = bool(benchmark_spec.tpus)

  if use_tpu:
    shell_prefix = (
        '{} && cd tpu/models && export PYTHONPATH=$(pwd) &&'
        'cd official/resnet && python {} '
    ).format(benchmark_spec.env_cmd, 'resnet_main.py')
    flag_values = [
        ('use_tpu', use_tpu),
        ('data_dir', benchmark_spec.data_dir),
        ('model_dir', benchmark_spec.model_dir),
        ('resnet_depth', benchmark_spec.depth),
        ('train_batch_size', benchmark_spec.train_batch_size),
        ('eval_batch_size', benchmark_spec.eval_batch_size),
        ('iterations_per_loop', benchmark_spec.iterations),
        ('data_format', benchmark_spec.data_format),
        ('precision', benchmark_spec.precision),
        ('skip_host_call', benchmark_spec.skip_host_call),
        ('num_train_images', benchmark_spec.num_train_images),
        ('num_eval_images', benchmark_spec.num_eval_images),
    ]
    base_cmd = shell_prefix + ' '.join(
        '--{}={}'.format(flag, value) for flag, value in flag_values
    )
  else:
    shell_prefix = (
        '{} && cd models && export PYTHONPATH=$(pwd) && '
        'cd official/r1/resnet && python {} '
    ).format(benchmark_spec.env_cmd, 'imagenet_main.py')
    script_flags = ' '.join([
        '--data_dir=/data/imagenet',
        '--model_dir={}'.format(benchmark_spec.model_dir),
        '--resnet_size={}'.format(benchmark_spec.depth),
        '--batch_size={}'.format(benchmark_spec.train_batch_size),
        '--data_format={}'.format(benchmark_spec.data_format),
    ])
    # This script takes --dtype rather than --precision: bfloat16 is requested
    # as fp16, every other precision value as fp32.
    requested_precision = '{}'.format(benchmark_spec.precision)
    dtype = 'fp16' if requested_precision == 'bfloat16' else 'fp32'
    # The flag list is terminated by a space and the dtype flag brings its own
    # leading space, so the command contains a double space at this point.
    base_cmd = '{} --dtype={}'.format(shell_prefix + script_flags + ' ', dtype)

    if nvidia_driver.CheckNvidiaGpuExists(vm):
      gpu_env = tensorflow.GetEnvironmentVars(vm)
      gpu_count = nvidia_driver.QueryNumberOfGpus(vm)
      base_cmd = '{} {} --num_gpus={}'.format(gpu_env, base_cmd, gpu_count)

  def _TpuPhaseCommand(step_cmd, phase):
    """Returns step_cmd targeted at the TPU group for phase (train or eval)."""
    tpu_group = benchmark_spec.tpu_groups[phase]
    tpu_name = tpu_group.GetName()
    shard_count = tpu_group.GetNumShards()
    return '{} --tpu={} --mode={} --num_cores={}'.format(
        step_cmd, tpu_name, phase, shard_count
    )

  collected = []
  metadata = _CreateMetadataDict(benchmark_spec)
  # Only time spent in the training command is accumulated here.
  train_seconds = 0
  eval_interval = benchmark_spec.steps_per_eval
  total_steps = benchmark_spec.train_steps

  for boundary in range(
      eval_interval, total_steps + eval_interval, eval_interval
  ):
    # The final interval may overshoot; clamp it to the requested total.
    current_step = min(boundary, total_steps)
    step_cmd = '{} --train_steps={}'.format(base_cmd, current_step)
    mode = benchmark_spec.mode

    if mode in ('train', 'train_and_eval'):
      if use_tpu:
        train_cmd = _TpuPhaseCommand(step_cmd, 'train')
      else:
        # The non-TPU script is bounded with --max_train_steps, so it is built
        # from the base command rather than the --train_steps variant.
        train_cmd = (
            '{} --max_train_steps={} --train_epochs={} --noeval_only'.format(
                base_cmd, current_step, benchmark_spec.epochs_per_eval
            )
        )
      began = time.time()
      stdout, stderr = vm.RobustRemoteCommand(train_cmd)
      train_seconds += time.time() - began
      collected.extend(
          mnist_benchmark.MakeSamplesFromTrainOutput(
              metadata, stdout + stderr, train_seconds, current_step
          )
      )

    if mode in ('train_and_eval', 'eval'):
      if use_tpu:
        eval_cmd = _TpuPhaseCommand(step_cmd, 'eval')
      else:
        eval_cmd = '{} --eval_only'.format(base_cmd)
      stdout, stderr = vm.RobustRemoteCommand(eval_cmd)
      collected.extend(
          MakeSamplesFromEvalOutput(
              metadata,
              stdout + stderr,
              train_seconds,
              use_tpu=use_tpu,
          )
      )

  return collected