in perfkitbenchmarker/linux_benchmarks/mlperf_benchmark.py [0:0]
def Run(benchmark_spec):
  """Run MLPerf on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vm = benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # For MLPerf 1.0, the benchmark code of different hardware are different.
    # Hoist the accelerator type once; it is consulted several times below.
    accelerator_type = benchmark_spec.tpu_groups['train'].GetAcceleratorType()
    # Only these v3 TPU slice sizes have reference implementations checked in.
    if accelerator_type in (
        'v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048'
    ):
      run_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'.format(
          version=VERSION.value,
          model=benchmark_spec.benchmark,
          tpus=accelerator_type,
      )
      code_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'.format(
          version=VERSION.value,
          model=benchmark_spec.benchmark,
          tpus=accelerator_type,
      )
      # The checked-in source directory name does not always match the
      # benchmark name; map the exceptions.
      if MASK in benchmark_spec.benchmark:
        model = 'mask_rcnn'
      elif GNMT in benchmark_spec.benchmark:
        model = 'nmt'
      else:
        model = benchmark_spec.benchmark
      mlperf_benchmark_cmd = (
          'cd {code_path} && '
          'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
          'cd {model} && '
          '{run_path}/run_and_time.sh'.format(
              code_path=code_path, model=model, run_path=run_path
          )
      )
      if SSD in benchmark_spec.benchmark:
        # SSD training starts from a ResNet checkpoint; export its location
        # before running the benchmark script.
        mlperf_benchmark_cmd = (
            'export MLP_GCS_RESNET_CHECKPOINT={checkpoint} && {cmd}'.format(
                checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint,
                cmd=mlperf_benchmark_cmd,
            )
        )
    else:
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.'
      )
  else:
    # GPU path: use NVIDIA's reference implementations.
    run_sub_paths = {
        RESNET: 'resnet/implementations/mxnet',
        TRANSFORMER: 'transformer/implementations/pytorch',
        MINIGO: 'minigo/implementations/tensorflow',
        MASK: 'maskrcnn/implementations/pytorch',
        GNMT: 'gnmt/implementations/pytorch',
        SSD: 'ssd/implementations/pytorch',
        BERT: 'bert/implementations/pytorch',
    }
    benchmark_path = f'$HOME/training_results_{VERSION.value}/NVIDIA/benchmarks'
    run_path = posixpath.join(
        benchmark_path, run_sub_paths[benchmark_spec.benchmark]
    )
    # Common environment for all benchmarks, plus per-benchmark overrides.
    env = {
        'DGXSYSTEM': DGXSYSTEM,
        'NEXP': 1,
        'PULL': 0,
        'LOGDIR': f'/tmp/{benchmark_spec.benchmark}',
    }
    envs = {
        RESNET: {},
        TRANSFORMER: {'DATADIR': '/data/wmt/utf8'},
        MINIGO: {'CONT': 'mlperf-nvidia:minigo'},
        MASK: {},
        GNMT: {'DATADIR': '/data/gnmt'},
        SSD: {'DATADIR': '/data'},
        BERT: {},
    }
    env.update(envs[benchmark_spec.benchmark])
    run_script = posixpath.join(run_path, 'run_with_docker.sh')
    # Patch the vendored script: disable syslog forwarding and drop the
    # interactive docker flag so it can run non-interactively over SSH.
    vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script)
    vm_util.ReplaceText(vm, 'docker exec -it', 'docker exec -t', run_script)
    if benchmark_spec.benchmark == RESNET:
      # Run the timing script directly rather than through mpirun.
      vm_util.ReplaceText(
          vm, r'mpirun.*run_and_time\.sh', r'.\/run_and_time.sh', run_script
      )
    env = ' '.join(f'{key}={value}' for key, value in env.items())
    if nvidia_driver.CheckNvidiaGpuExists(vm):
      env = f'{tensorflow.GetEnvironmentVars(vm)} {env}'
    mlperf_benchmark_cmd = (
        f'chmod 755 {run_script} && cd {run_path} && {env} {run_script}'
    )
  samples = []
  metadata = _CreateMetadataDict(benchmark_spec)
  stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd)
  # Only parse samples when no profiler is active; profiler runs produce
  # output handled elsewhere.
  if NONE in FLAGS.mlperf_profiler:
    samples.extend(
        MakeSamplesFromOutput(
            metadata,
            stdout,
            use_tpu=bool(benchmark_spec.tpus),
            model=benchmark_spec.benchmark,
        )
    )
  return samples