# Copyright 2019 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run MLPerf benchmarks."""
import json
import posixpath
import re
from absl import flags
from perfkitbenchmarker import configs
from perfkitbenchmarker import errors
from perfkitbenchmarker import regex_util
from perfkitbenchmarker import sample
from perfkitbenchmarker import virtual_machine
from perfkitbenchmarker import vm_util
from perfkitbenchmarker.linux_packages import cuda_toolkit
from perfkitbenchmarker.linux_packages import docker
from perfkitbenchmarker.linux_packages import google_cloud_sdk
from perfkitbenchmarker.linux_packages import nvidia_driver
from perfkitbenchmarker.linux_packages import tensorflow
from perfkitbenchmarker.providers.gcp import gcs
from perfkitbenchmarker.providers.gcp import util
FLAGS = flags.FLAGS
BENCHMARK_NAME = 'mlperf'
BENCHMARK_CONFIG = """
mlperf:
description: Runs MLPerf Benchmark.
vm_groups:
default:
disk_spec: *default_500_gb
vm_spec:
GCP:
machine_type: a2-highgpu-8g
zone: us-central1-b
boot_disk_size: 200
AWS:
machine_type: p4d.24xlarge
zone: us-west-2a
boot_disk_size: 200
Azure:
machine_type: Standard_ND96asr_v4
zone: westus2
boot_disk_size: 200
image: microsoft-dsvm:ubuntu-hpc:1804:latest
"""
DGXSYSTEM = 'DGXA100_singlenode'
CONFIG = f'config_{DGXSYSTEM}.sh'
TRANSFORMER = 'transformer'
RESNET = 'resnet'
MASK = 'mask'
GNMT = 'gnmt'
SSD = 'ssd'
MINIGO = 'minigo'
BERT = 'bert'
DLRM = 'dlrm'
GPT3 = 'gpt3'
flags.DEFINE_enum(
'mlperf_benchmark',
RESNET,
[RESNET, TRANSFORMER, MASK, GNMT, SSD, MINIGO, BERT, DLRM, GPT3],
'MLPerf benchmark test to run.',
)
NVPROF = 'nvprof'
TFPROF = 'tfprof'
NONE = 'none'
flags.DEFINE_enum(
'mlperf_profiler',
NONE,
[NVPROF, TFPROF, NONE],
    'Profiler used to analyze GPU training.',
)
flags.DEFINE_integer('mlperf_profile_steps', 20, 'Number of steps to profile.')
flags.DEFINE_string(
'mlperf_bucket',
None,
'GCS bucket for mlperf results; only used for TPU runs.',
)
flags.DEFINE_string(
'mlperf_gcs_resnet_checkpoint',
'gs://p3rf-mlperf/resnet50-checkpoint-2018-02-07/model.ckpt-112603',
'A ResNet backbone trained on the ImageNet dataset.',
)
flags.DEFINE_string(
'mlperf_transformer_decode_dir', '', 'Transformer decode directory'
)
flags.DEFINE_string(
'wmt_data_dir',
'gs://p3rf-mlperf/mlperf_v0.6_nv_transformer',
    'Directory where the WMT dataset is stored.',
)
flags.DEFINE_string(
'coco_data_dir',
'gs://p3rf-mlperf/coco2017',
    'Directory where the COCO dataset is stored.',
)
flags.DEFINE_string(
'gnmt_data_dir',
'gs://p3rf-mlperf/mlperf_v0.6_nv_gnmt',
    'Directory where the NV v0.6 GNMT dataset is stored.',
)
flags.DEFINE_string(
'bert_data_dir',
'gs://p3rf-mlperf/mlperf_training_v2.0_nv_bert',
    'Directory where the NV BERT dataset is stored.',
)
flags.DEFINE_string(
'minigo_model_dir',
'',
'Directory on GCS to copy minigo source data from. Files '
'will be copied from subdirectories of src_dir '
'corresponding to the board size.',
)
BERT_BATCH_SIZE = flags.DEFINE_integer(
'mlperf_bert_batch_size', None, 'The batch size to use for training BERT.'
)
RESNET_BATCH_SIZE = flags.DEFINE_integer(
'mlperf_resnet_batch_size',
None,
'The batch size to use for training ResNet.',
)
MASKRCNN_BATCH_SIZE = flags.DEFINE_integer(
'mlperf_maskrcnn_batch_size',
None,
    'The batch size to use for training Mask R-CNN.',
)
HYPERTHREADS = flags.DEFINE_bool(
    'mlperf_hyperthreads', True, 'Enable or disable binding to hyperthreads.'
)
VERSION = flags.DEFINE_enum(
'mlperf_training_version',
'v2.0',
['v1.0', 'v1.1', 'v2.0', 'v3.1'],
'MLPerf training version to run.',
)
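# Matches a decimal throughput figure such as '1234.56' in benchmark output.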
RE_FLOAT = r'\d+\.\d+'
def GetConfig(user_config):
"""Load and return benchmark config.
Args:
user_config: user supplied configuration (flags and config file)
Returns:
loaded benchmark configuration
"""
config = configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
return config
def _UpdateBenchmarkSpecWithFlags(benchmark_spec):
"""Update the benchmark_spec with supplied command line flags.
Args:
benchmark_spec: benchmark specification to update
"""
benchmark_spec.imagenet_data_dir = FLAGS.imagenet_data_dir
benchmark_spec.benchmark = FLAGS.mlperf_benchmark
benchmark_spec.wmt_data_dir = FLAGS.wmt_data_dir
benchmark_spec.coco_data_dir = FLAGS.coco_data_dir
benchmark_spec.gnmt_data_dir = FLAGS.gnmt_data_dir
benchmark_spec.bert_data_dir = FLAGS.bert_data_dir
benchmark_spec.gcp_service_account = FLAGS.gcp_service_account
def _DownloadData(data_dir, data_path, vm):
"""Download remote benchmark data to local.
Args:
data_dir: remote benchmark location
data_path: local benchmark location
vm: vm to download the data
"""
vm.Install('google_cloud_sdk')
vm.RemoteCommand(
'if [ ! -d "{data_path}" ]; then '
' sudo mkdir -p {data_path} && '
' sudo chmod a+w {data_path} && '
' {gsutil_path} -m cp -r {data_dir}/* {data_path} ;'
'fi'.format(
data_dir=data_dir,
gsutil_path=google_cloud_sdk.GSUTIL_PATH,
data_path=data_path,
)
)
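# Public alias of the download helper for reuse outside this module.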
DownloadData = _DownloadData
def PrepareBenchmark(benchmark_spec, vm=None):
"""Install and set up MLPerf on the target vm.
Args:
benchmark_spec: The benchmark specification
vm: The VM to work on
Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the
      config.
"""
_UpdateBenchmarkSpecWithFlags(benchmark_spec)
vm = vm or benchmark_spec.vms[0]
has_gpu = nvidia_driver.CheckNvidiaGpuExists(vm)
if has_gpu:
vm.Install('cuda_toolkit')
if bool(benchmark_spec.tpus) and nvidia_driver.CheckNvidiaGpuExists(vm):
raise errors.Config.InvalidValue(
        'Invalid configuration. GPUs and TPUs cannot both be present in the'
' config.'
)
vm.RemoteCommand(
f'if [ ! -d "$HOME/training_results_{VERSION.value}" ]; then git clone'
f' https://github.com/mlcommons/training_results_{VERSION.value}.git ; fi'
)
vm.Install('pip')
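  # Optionally patch the vendor launch scripts so training processes are not
  # bound to SMT (hyperthread) siblings.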
if not HYPERTHREADS.value:
if BERT in benchmark_spec.benchmark:
vm_util.ReplaceText(
vm,
"'bind_pyt'",
"'bind_pyt' '--no_hyperthreads'",
f'training_results_{VERSION.value}/NVIDIA/benchmarks/bert/'
'implementations/pytorch/run_with_docker.sh',
)
elif MASK in benchmark_spec.benchmark:
vm_util.ReplaceText(
vm,
"'bind_launch'",
"'bind_launch' '--no_hyperthreads'",
f'training_results_{VERSION.value}/NVIDIA/benchmarks/maskrcnn/'
'implementations/pytorch/run_and_time.sh',
)
elif RESNET in benchmark_spec.benchmark:
vm_util.ReplaceText(
vm,
'--cpu=exclusive',
'--cpu=exclusive,nosmt',
f'training_results_{VERSION.value}/NVIDIA/benchmarks/resnet/'
'implementations/mxnet/run_and_time.sh',
)
def PrepareRunner(benchmark_spec, vm=None):
"""Install and set up MLPerf on the target vm.
Args:
benchmark_spec: The benchmark specification
vm: The VM to work on
Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the
      config.
"""
vm = vm or benchmark_spec.vms[0]
if benchmark_spec.tpus:
if vm == benchmark_spec.vms[0]:
storage_service = gcs.GoogleCloudStorageService()
benchmark_spec.storage_service = storage_service
if FLAGS.mlperf_bucket:
bucket = FLAGS.mlperf_bucket
benchmark_spec.model_dir = f'gs://{bucket}/pkb-{FLAGS.run_uri}'
else:
        bucket = f'pkb-{FLAGS.run_uri}'
benchmark_spec.model_dir = f'gs://{bucket}'
benchmark_spec.bucket = bucket
location = benchmark_spec.tpu_groups['train'].GetZone()
storage_service.PrepareService(util.GetRegionFromZone(location))
storage_service.MakeBucket(bucket)
storage_service.AclBucket(
benchmark_spec.gcp_service_account, gcs.WRITER, bucket
)
    # For MLPerf v1.0, the benchmark code differs across hardware types.
    if benchmark_spec.tpu_groups['train'].GetAcceleratorType() in (
        'v3-32',
        'v3-128',
        'v3-256',
        'v3-512',
        'v3-1024',
        'v3-2048',
    ):
run_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'.format(
version=VERSION.value,
model=benchmark_spec.benchmark,
tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(),
)
else:
raise ValueError(
'MLPerf configurations do not support the hardware in PKB. PKB may '
'need to be updated if this is a new TPU type.'
)
if MASK in benchmark_spec.benchmark:
model = 'mask_rcnn'
elif GNMT in benchmark_spec.benchmark:
model = 'nmt'
else:
model = benchmark_spec.benchmark
code_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'.format(
version=VERSION.value,
model=benchmark_spec.benchmark,
tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(),
)
    vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13')
vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
if MASK in benchmark_spec.benchmark or SSD in benchmark_spec.benchmark:
# Install the coco package, to load the coco dataset for Mask-RCNN
# and SSD benchmarks.
# TODO(user): coco whl package for python 3.5
vm.RemoteCommand(
'cd /tmp && wget https://storage.cloud.google.com/'
'mlperf_artifcats/v0.6_training/'
'coco-1.1-cp36-cp36m-linux_x86_64.whl' # NOTYPO
)
setup_script = posixpath.join(run_path, 'setup.sh')
vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script)
vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script)
vm.RemoteCommand(
'chmod 755 {script} && {script}'.format(script=setup_script)
)
if MASK not in benchmark_spec.benchmark:
vm.RemoteCommand(
'pip3 uninstall -y tf-estimator-nightly && '
'pip3 install tf-estimator-nightly==1.14.0.dev2019051801'
)
if RESNET in benchmark_spec.benchmark:
data_dir = benchmark_spec.imagenet_data_dir
elif TRANSFORMER in benchmark_spec.benchmark:
data_dir = benchmark_spec.wmt_data_dir
elif MASK in benchmark_spec.benchmark:
data_dir = benchmark_spec.coco_data_dir
elif GNMT in benchmark_spec.benchmark:
data_dir = benchmark_spec.gnmt_data_dir
elif SSD in benchmark_spec.benchmark:
data_dir = benchmark_spec.coco_data_dir
elif BERT in benchmark_spec.benchmark:
data_dir = benchmark_spec.bert_data_dir
else:
raise ValueError(
        'Unknown benchmark: no data directory is defined for {}'.format(
benchmark_spec.benchmark
)
)
run_script = posixpath.join(run_path, 'run_and_time.sh')
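    # Escape '/' in the GCS paths so they can be embedded in the sed
    # 's/pattern/replacement/' expressions used by vm_util.ReplaceText below.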
data_dir = data_dir.replace('/', r'\/')
checkpoint = FLAGS.mlperf_gcs_resnet_checkpoint.replace('/', r'\/')
decode_dir = FLAGS.mlperf_transformer_decode_dir.replace('/', r'\/')
tpu = benchmark_spec.tpu_groups['train'].GetName()
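    # Each replacement ends in r'\\\\', which survives shell and sed escaping
    # so that a single '\' line continuation remains in the rewritten script.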
vm_util.ReplaceText(
vm,
'--model_dir=.*',
r'--model_dir=gs:\/\/{} \\\\'.format(bucket),
run_script,
)
vm_util.ReplaceText(
vm, '--data_dir=.*', r'--data_dir={} \\\\'.format(data_dir), run_script
)
vm_util.ReplaceText(
vm,
'--training_file_pattern=.*',
r'--training_file_pattern={}\/train-* \\\\'.format(data_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--validation_file_pattern=.*',
r'--validation_file_pattern={}\/val-* \\\\'.format(data_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--val_json_file=.*',
r'--val_json_file={}\/instances_val2017.json \\\\'.format(data_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--resnet_checkpoint=.*',
r'--resnet_checkpoint={} \\\\'.format(checkpoint),
run_script,
)
vm_util.ReplaceText(
vm,
'--decode_from_file=.*',
r'--decode_from_file={}\/wmt14-en-de.src \\\\'.format(decode_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--decode_reference=.*',
r'--decode_reference={}\/wmt14-en-de.ref \\\\'.format(decode_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--decode_to_file=.*',
r'--decode_to_file={}\/decode.transformer_mlperf_tpu.'
r'translate_ende_wmt32k_packed.2x2_log_1018_2 \\\\'.format(bucket),
run_script,
)
vm_util.ReplaceText(
vm, '--tpu=.*', r'--tpu={} \\\\'.format(tpu), run_script
)
vm_util.ReplaceText(
vm,
'--output_dir=.*',
r'--output_dir=gs:\/\/{} \\\\'.format(bucket),
run_script,
)
vm_util.ReplaceText(
vm,
'--cloud_tpu_name=.*',
r'--cloud_tpu_name={} \\\\'.format(tpu),
run_script,
)
vm_util.ReplaceText(
vm,
'--out_dir=.*',
r'--out_dir=gs:\/\/{} \\\\'.format(bucket),
run_script,
)
vm_util.ReplaceText(
vm, '--tpu_name=.*', r'--tpu_name={} \\\\'.format(tpu), run_script
)
vm.RemoteCommand('chmod 755 {}'.format(run_script))
if GNMT in benchmark_spec.benchmark:
metric_script = posixpath.join(code_path, model, 'metric.py')
vm_util.ReplaceText(
vm, ' sacrebleu -t', ' python3 -m sacrebleu -t', metric_script
)
else:
benchmark_spec.model_dir = '/tmp'
vm.Install('nvidia_docker')
docker.AddUser(vm)
vm.RemoteCommand('sudo usermod -aG docker $USER')
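    # The NVIDIA scripts expect datasets under /data; point it at the scratch
    # disk if it is not already present.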
vm.RemoteCommand('if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')
if RESNET in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/resnet/implementations/mxnet'
' && docker build --network=host . -t'
' mlperf-nvidia:image_classification'
)
_DownloadData(
benchmark_spec.imagenet_data_dir,
posixpath.join('/data', 'imagenet'),
vm,
)
if TRANSFORMER in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/transformer/implementations/pytorch'
' && docker build --network=host . -t mlperf-nvidia:translation'
)
_DownloadData(
benchmark_spec.wmt_data_dir, posixpath.join('/data', 'wmt'), vm
)
if MINIGO in benchmark_spec.benchmark:
build_path = f'training_results_{VERSION.value}/NVIDIA/benchmarks/minigo/implementations/tensorflow'
run_script = posixpath.join(build_path, 'run_and_time.sh')
vm_util.ReplaceText(
vm,
'get_data.py',
'get_data.py --src_dir={}'.format(
FLAGS.minigo_model_dir.replace('/', r'\/')
),
run_script,
)
vm.RemoteCommand(
'cd {} && docker build --network=host -t '
'mlperf-nvidia:minigo .'.format(build_path)
)
if MASK in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/maskrcnn/implementations/pytorch'
' && docker build --network=host -t'
' mlperf-nvidia:object_detection . '
)
_DownloadData(
benchmark_spec.coco_data_dir, posixpath.join('/data', 'coco2017'), vm
)
if GNMT in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/gnmt/implementations/pytorch'
' && docker build --network=host -t mlperf-nvidia:rnn_translator . '
)
_DownloadData(
benchmark_spec.gnmt_data_dir, posixpath.join('/data', 'gnmt'), vm
)
if SSD in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/ssd/implementations/pytorch'
' && docker build --network=host -t'
' mlperf-nvidia:single_stage_detector . '
)
_DownloadData(
benchmark_spec.coco_data_dir, posixpath.join('/data', 'coco2017'), vm
)
if BERT in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/bert/implementations/pytorch'
' && docker build --network=host -t mlperf-nvidia:language_model . '
)
_DownloadData(
benchmark_spec.bert_data_dir, posixpath.join('/data', 'bert_data'), vm
)
def _GetChangesForMask(config_sed_input):
"""Get changes to config and run scripts for MaskRCNN.
Also update train_mlperf.py if nvprof is used.
Args:
config_sed_input: Input list of sed pairs for config_DGXA100.sh.
Returns:
config_sed_output: Output list of sed pairs for config_DGXA100.sh.
"""
config_sed = config_sed_input
config_sed += [(
r'WALLTIME_MINUTES=100',
(
r'WALLTIME_MINUTES=100\n'
r'export CONT=mlperf-nvidia:object_detection\n'
r'export DATADIR=\/data\n'
r'export PKLDIR=\/data\/coco2017\/pkl_coco\n'
r'export NEXP=1'
),
)]
if MASKRCNN_BATCH_SIZE.value:
config_sed.append(
(r'BATCHSIZE=.*', rf'BATCHSIZE={MASKRCNN_BATCH_SIZE.value}')
)
return config_sed
def _GetChangesForResnet(config_sed_input):
"""Get changes to config and run scripts for Resnet.
Args:
config_sed_input: Input list of sed pairs for config_DGXA100.sh.
Returns:
config_sed_output: Output list of sed pairs for config_DGXA100.sh.
"""
config_sed = config_sed_input
config_sed.append((
r'.*config_DGXA100_common\.sh',
(
r'export CONT=mlperf-nvidia:image_classification\n'
r'export DATADIR=\/data\/imagenet\n'
r'export DISTRIBUTED=\\\"mpirun --allow-run-as-root --bind-to'
r' none --np \$DGXNGPU\\\"'
),
))
if RESNET_BATCH_SIZE.value:
config_sed.append(
(r'BATCHSIZE=.*', rf'BATCHSIZE={RESNET_BATCH_SIZE.value}')
)
return config_sed
def _GetChangesForBert(config_sed_input):
"""Get changes to config and run scripts for BERT.
Args:
config_sed_input: Input list of sed pairs for config_DGXA100.sh.
Returns:
config_sed_output: Output list of sed pairs for config_DGXA100.sh.
"""
config_sed = config_sed_input
config_sed.append((
r'.*config_DGXA100_common\.sh',
(r'export CONT=mlperf-nvidia:language_model\n' r'export NEXP=1'),
))
config_sed.append((
r'DATADIR=.*',
r'DATADIR=\/data\/bert_data\/hdf5\/training-4320\/hdf5_4320_shards_varlength',
))
config_sed.append((
r'DATADIR_PHASE2=.*',
r'DATADIR_PHASE2=\/data\/bert_data\/hdf5\/training-4320\/hdf5_4320_shards_varlength',
))
config_sed.append(
(r'EVALDIR=.*', r'EVALDIR=\/data\/bert_data\/hdf5\/eval_varlength')
)
config_sed.append(
(r'CHECKPOINTDIR=.*', r'CHECKPOINTDIR=\/data\/bert_data\/phase1')
)
config_sed.append((
r'CHECKPOINTDIR_PHASE1=.*',
r'CHECKPOINTDIR_PHASE1=\/data\/bert_data\/phase1',
))
if BERT_BATCH_SIZE.value:
config_sed.append((r'BATCHSIZE=.*', rf'BATCHSIZE={BERT_BATCH_SIZE.value}'))
return config_sed
def SedPairsToString(pairs):
"""Convert a list of sed pairs to a string for the sed command.
Args:
pairs: a list of pairs, indicating the replacement requests
Returns:
a string to supply to the sed command
"""
sed_str = '; '.join(['s/%s/%s/g' % pair for pair in pairs])
if pairs:
sed_str += ';'
return sed_str
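# A minimal usage sketch (the pairs below are illustrative, not from a real
# config):
#
#   SedPairsToString([(r'DGXNGPU=.*', r'DGXNGPU=8'), (r'NEXP=.*', r'NEXP=1')])
#   == 's/DGXNGPU=.*/DGXNGPU=8/g; s/NEXP=.*/NEXP=1/g;'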
def UpdateScriptForSmallGpuMem(vm: virtual_machine.BaseVirtualMachine) -> None:
"""Update the running script for small GPU memory.
Args:
vm: The VM to work on
"""
if (
nvidia_driver.GetGpuMem(vm) < 80000
and nvidia_driver.QueryNumberOfGpus(vm) >= 8
):
# A100 40G fails out of memory when creating dummy_eval_data on one GPU.
data_script = f'$HOME/training_results_{VERSION.value}/NVIDIA/benchmarks/resnet/implementations/mxnet/common/data.py'
vm_util.ReplaceText(
vm, r"mx\.Context\('gpu'\)", 'mx.gpu(hvd.local_rank())', data_script
)
def _UpdateScripts(benchmark_spec, vm):
"""Update the running scripts on the target vm.
Args:
benchmark_spec: The benchmark specification.
vm: The VM to work on
"""
benchmark = benchmark_spec.benchmark
vm = vm or benchmark_spec.vms[0]
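  # Accumulate (pattern, replacement) pairs; SedPairsToString joins them into
  # a single sed expression that rewrites the vendor config files below.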
config_sed = []
config_sed += [(r'DGXSYSTEM=.*', rf'DGXSYSTEM=\"{DGXSYSTEM}\"')]
gpus_per_node = nvidia_driver.QueryNumberOfGpus(vm)
config_sed.append((
r'DGXNGPU=.*',
rf'DGXNGPU={gpus_per_node}\n'
r'export'
rf' CUDA_VISIBLE_DEVICES={",".join([str(gpu_number) for gpu_number in range(gpus_per_node)])}',
))
config_sed += [(
r'DGXNSOCKET=.*',
rf'DGXNSOCKET={vm.CheckLsCpu().socket_count}',
)]
config_sed += [(
r'DGXSOCKETCORES=.*',
rf'DGXSOCKETCORES={vm.CheckLsCpu().cores_per_socket}',
)]
model = 'maskrcnn' if MASK in benchmark else benchmark
framework = 'mxnet' if RESNET in benchmark else 'pytorch'
script_path = (
rf'$HOME/training_results_{VERSION.value}/NVIDIA/benchmarks/{model}/'
rf'implementations/{framework}'
)
config_files = [CONFIG]
if MASK in benchmark:
config_sed = _GetChangesForMask(config_sed)
config_files = ['config_DGXA100.sh']
elif RESNET in benchmark:
config_sed = _GetChangesForResnet(config_sed)
config_files = ['config_DGXA100_common.sh', 'config_DGXA100.sh']
UpdateScriptForSmallGpuMem(vm)
elif BERT in benchmark:
config_sed = _GetChangesForBert(config_sed)
config_files = ['config_DGXA100_common.sh', 'config_DGXA100_1x8x56x1.sh']
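  # Render the merged config with sed, then inject 'source config_...sh' as
  # line 2 of both run scripts so every run picks up the generated settings.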
vm.RemoteCommand(
f'cd {script_path} && '
f'sed "{SedPairsToString(config_sed)}" '
f'{" ".join(config_files)} > {CONFIG} && '
f'chmod 755 {CONFIG} && '
f'sed -i "2 i source {CONFIG}" run_and_time.sh && '
f'sed -i "2 i source {CONFIG}" run_with_docker.sh'
)
def Prepare(benchmark_spec, vm=None):
"""Install and set up MLPerf on the target vm.
Args:
benchmark_spec: The benchmark specification
vm: The VM to work on
Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the
      config.
"""
PrepareBenchmark(benchmark_spec, vm)
_UpdateScripts(benchmark_spec, vm)
PrepareRunner(benchmark_spec, vm)
def _CreateMetadataDict(benchmark_spec):
"""Create metadata dict to be used in run results.
Args:
benchmark_spec: The benchmark specification. Contains all data that is
required to run the benchmark.
Returns:
metadata dict
"""
metadata = {
'use_tpu': bool(benchmark_spec.tpus),
'model_dir': benchmark_spec.model_dir,
'model': benchmark_spec.benchmark,
'version': VERSION.value,
}
vms = benchmark_spec.vms
num_vms = len(vms)
vm = vms[0]
gpus_per_node = nvidia_driver.QueryNumberOfGpus(vm)
total_gpus = gpus_per_node * num_vms
metadata.update(cuda_toolkit.GetMetadata(vm))
metadata['total_gpus'] = total_gpus
if benchmark_spec.tpus:
metadata.update({
'train_tpu_num_shards': benchmark_spec.tpu_groups[
'train'
].GetNumShards(),
'train_tpu_accelerator_type': benchmark_spec.tpu_groups[
'train'
].GetAcceleratorType(),
})
return metadata
def MakeSamplesFromOutput(metadata, output, use_tpu=False, model=RESNET):
"""Create samples containing metrics.
Args:
    metadata: dict containing all the metadata to report.
output: string, command output
use_tpu: bool, whether tpu is in use
model: string, model name
Example output:
perfkitbenchmarker/tests/linux_benchmarks/mlperf_benchmark_test.py
Returns:
Samples containing training metrics.
"""
samples = []
if RESNET in model:
results = regex_util.ExtractAllMatches(
f'Speed: ({RE_FLOAT}) samples/sec', output
)
results.extend(
regex_util.ExtractAllMatches(f'"imgs_sec": ({RE_FLOAT})', output)
)
results.extend(
regex_util.ExtractAllMatches(
f'"key": "throughput", "value": ({RE_FLOAT})', output
)
)
elif TRANSFORMER in model:
results = re.findall(r'wps=(\S+),', output)
elif GNMT in model:
results = re.findall(r'Tok/s (\S+)', output)
elif SSD in model:
results = re.findall(r'avg. samples / sec: (\S+)', output)
elif MASK in model:
results = regex_util.ExtractAllMatches(
f'"throughput": ({RE_FLOAT})', output
)
results.extend(
regex_util.ExtractAllMatches(
f'"key": "throughput", "value": ({RE_FLOAT})', output
)
)
results.extend(
regex_util.ExtractAllMatches(
f'MLPERF METRIC THROUGHPUT=({RE_FLOAT}) iterations / s', output
)
)
elif BERT in model:
results = regex_util.ExtractAllMatches(
f"'training_sequences_per_second': ({RE_FLOAT})", output
)
for speed in results:
samples.append(
sample.Sample('speed', float(speed), 'samples/sec', metadata)
)
if not use_tpu:
if MINIGO in model:
times = regex_util.ExtractAllMatches(r'RESULT,.*,(\d+),.*,.*', output)
else:
times = regex_util.ExtractAllMatches(r'RESULT,.*,.*,(\d+),.*,.*', output)
samples.append(sample.Sample('Time', int(times[0]), 'seconds', metadata))
samples.extend(MakeMLPerfSamplesFromOutput(metadata, output))
return samples
def MakeMLPerfSamplesFromOutput(metadata, output):
"""Create MLPerf log samples containing metrics.
Args:
    metadata: dict containing all the metadata to report.
output: string, command output
Returns:
Samples containing training metrics.
"""
samples = []
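  # MLPerf loggers emit lines of the form ':::MLLOG {json}'. An illustrative
  # (not captured) payload:
  #   :::MLLOG {"namespace": "", "time_ms": 1634000000000, "event_type":
  #             "POINT_IN_TIME", "key": "global_batch_size", "value": 56,
  #             "metadata": {"file": "...", "lineno": 1}}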
for mllog in regex_util.ExtractAllMatches(r':::MLLOG (.*)', output):
data = json.loads(mllog)
mlperf_metadata = data['metadata']
mlperf_metadata.update(metadata)
for key in ('namespace', 'event_type', 'value'):
mlperf_metadata[key] = data[key]
samples.append(
sample.Sample(
data['key'],
None,
'',
mlperf_metadata,
timestamp=data['time_ms'] / 1000,
)
)
return samples
def Run(benchmark_spec):
"""Run MLPerf on the cluster.
Args:
benchmark_spec: The benchmark specification. Contains all data that is
required to run the benchmark.
Returns:
A list of sample.Sample objects.
"""
_UpdateBenchmarkSpecWithFlags(benchmark_spec)
vm = benchmark_spec.vms[0]
if benchmark_spec.tpus:
    # For MLPerf v1.0, the benchmark code differs across hardware types.
    if benchmark_spec.tpu_groups['train'].GetAcceleratorType() in (
        'v3-32',
        'v3-128',
        'v3-256',
        'v3-512',
        'v3-1024',
        'v3-2048',
    ):
run_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'.format(
version=VERSION.value,
model=benchmark_spec.benchmark,
tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(),
)
code_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'.format(
version=VERSION.value,
model=benchmark_spec.benchmark,
tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(),
)
if MASK in benchmark_spec.benchmark:
model = 'mask_rcnn'
elif GNMT in benchmark_spec.benchmark:
model = 'nmt'
else:
model = benchmark_spec.benchmark
mlperf_benchmark_cmd = (
'cd {code_path} && '
'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
'cd {model} && '
'{run_path}/run_and_time.sh'.format(
code_path=code_path, model=model, run_path=run_path
)
)
if SSD in benchmark_spec.benchmark:
mlperf_benchmark_cmd = (
'export MLP_GCS_RESNET_CHECKPOINT={checkpoint} && {cmd}'.format(
checkpoint=FLAGS.mlperf_gcs_resnet_checkpoint,
cmd=mlperf_benchmark_cmd,
)
)
else:
raise ValueError(
'MLPerf configurations do not support the hardware in PKB. PKB may '
'need to be updated if this is a new TPU type.'
)
else:
run_sub_paths = {
RESNET: 'resnet/implementations/mxnet',
TRANSFORMER: 'transformer/implementations/pytorch',
MINIGO: 'minigo/implementations/tensorflow',
MASK: 'maskrcnn/implementations/pytorch',
GNMT: 'gnmt/implementations/pytorch',
SSD: 'ssd/implementations/pytorch',
BERT: 'bert/implementations/pytorch',
}
benchmark_path = f'$HOME/training_results_{VERSION.value}/NVIDIA/benchmarks'
run_path = posixpath.join(
benchmark_path, run_sub_paths[benchmark_spec.benchmark]
)
env = {
'DGXSYSTEM': DGXSYSTEM,
'NEXP': 1,
'PULL': 0,
'LOGDIR': f'/tmp/{benchmark_spec.benchmark}',
}
envs = {
RESNET: {},
TRANSFORMER: {'DATADIR': '/data/wmt/utf8'},
MINIGO: {'CONT': 'mlperf-nvidia:minigo'},
MASK: {},
GNMT: {'DATADIR': '/data/gnmt'},
SSD: {'DATADIR': '/data'},
BERT: {},
}
env.update(envs[benchmark_spec.benchmark])
run_script = posixpath.join(run_path, 'run_with_docker.sh')
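    # The benchmark runs over a non-interactive SSH session: disable syslog
    # upload and drop docker's interactive (-i) flag, which expects an
    # attached stdin.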
vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script)
vm_util.ReplaceText(vm, 'docker exec -it', 'docker exec -t', run_script)
if benchmark_spec.benchmark == RESNET:
vm_util.ReplaceText(
vm, r'mpirun.*run_and_time\.sh', r'.\/run_and_time.sh', run_script
)
env = ' '.join(f'{key}={value}' for key, value in env.items())
if nvidia_driver.CheckNvidiaGpuExists(vm):
env = f'{tensorflow.GetEnvironmentVars(vm)} {env}'
mlperf_benchmark_cmd = (
f'chmod 755 {run_script} && cd {run_path} && {env} {run_script}'
)
samples = []
metadata = _CreateMetadataDict(benchmark_spec)
stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd)
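  # Profiled runs produce profiler dumps rather than parseable training logs,
  # so samples are only extracted when no profiler is requested.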
if NONE in FLAGS.mlperf_profiler:
samples.extend(
MakeSamplesFromOutput(
metadata,
stdout,
use_tpu=bool(benchmark_spec.tpus),
model=benchmark_spec.benchmark,
)
)
return samples
def Cleanup(benchmark_spec):
"""Cleanup MLPerf on the cluster.
Args:
benchmark_spec: The benchmark specification. Contains all data that is
required to run the benchmark.
"""
if benchmark_spec.tpus and FLAGS.mlperf_bucket is None:
benchmark_spec.storage_service.DeleteBucket(benchmark_spec.bucket)