in perfkitbenchmarker/linux_benchmarks/mlperf_benchmark.py [0:0]
def PrepareRunner(benchmark_spec, vm=None):
"""Install and set up MLPerf on the target vm.
Args:
benchmark_spec: The benchmark specification
vm: The VM to work on
Raises:
errors.Config.InvalidValue upon both GPUs and TPUs appear in the config
"""
vm = vm or benchmark_spec.vms[0]
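# TPU path: stage results in GCS and patch the Google reference scripts.
# GPU path (the else branch below): build NVIDIA's MLPerf Docker images.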
if benchmark_spec.tpus:
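# Only the first VM in the group provisions the GCS bucket used as model_dir.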
if vm == benchmark_spec.vms[0]:
storage_service = gcs.GoogleCloudStorageService()
benchmark_spec.storage_service = storage_service
if FLAGS.mlperf_bucket:
bucket = FLAGS.mlperf_bucket
benchmark_spec.model_dir = f'gs://{bucket}/pkb-{FLAGS.run_uri}'
else:
bucket = f'pkb-{FLAGS.run_uri}'
benchmark_spec.model_dir = f'gs://{bucket}'
benchmark_spec.bucket = bucket
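# Create the bucket in the training TPU's region and let the service
# account write results to it.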
location = benchmark_spec.tpu_groups['train'].GetZone()
storage_service.PrepareService(util.GetRegionFromZone(location))
storage_service.MakeBucket(bucket)
storage_service.AclBucket(
benchmark_spec.gcp_service_account, gcs.WRITER, bucket
)
# For MLPerf 1.0, the benchmark code differs across hardware types.
if benchmark_spec.tpu_groups['train'].GetAcceleratorType() in (
    'v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048'
):
run_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'.format(
version=VERSION.value,
model=benchmark_spec.benchmark,
tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(),
)
else:
raise ValueError(
    'PKB has no MLPerf configuration for this TPU type. PKB may need to '
    'be updated if this is a new TPU type.'
)
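# The reference code names some model directories differently from the PKB
# benchmark name (e.g. Mask R-CNN lives under 'mask_rcnn', GNMT under 'nmt').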
if MASK in benchmark_spec.benchmark:
model = 'mask_rcnn'
elif GNMT in benchmark_spec.benchmark:
model = 'nmt'
else:
model = benchmark_spec.benchmark
code_path = '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'.format(
version=VERSION.value,
model=benchmark_spec.benchmark,
tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType(),
)
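# Pin PyYAML and the cloud-tpu-profiler versions; presumably these are the
# versions the reference training scripts expect.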
vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
if MASK in benchmark_spec.benchmark or SSD in benchmark_spec.benchmark:
# Install the coco package, to load the coco dataset for Mask-RCNN
# and SSD benchmarks.
# TODO(user): coco whl package for python 3.5
vm.RemoteCommand(
'cd /tmp && wget https://storage.cloud.google.com/'
'mlperf_artifcats/v0.6_training/'
'coco-1.1-cp36-cp36m-linux_x86_64.whl' # NOTYPO
)
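# Patch the reference setup.sh: drop the '--progress-bar off' flag, run
# everything with pip3, then execute the script.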
setup_script = posixpath.join(run_path, 'setup.sh')
vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script)
vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script)
vm.RemoteCommand(
'chmod 755 {script} && {script}'.format(script=setup_script)
)
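# All benchmarks except Mask R-CNN pin tf-estimator-nightly to a specific
# build; Mask R-CNN keeps whatever setup.sh installed.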
if MASK not in benchmark_spec.benchmark:
vm.RemoteCommand(
'pip3 uninstall -y tf-estimator-nightly && '
'pip3 install tf-estimator-nightly==1.14.0.dev2019051801'
)
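# Pick the GCS data directory that matches the benchmark.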
if RESNET in benchmark_spec.benchmark:
data_dir = benchmark_spec.imagenet_data_dir
elif TRANSFORMER in benchmark_spec.benchmark:
data_dir = benchmark_spec.wmt_data_dir
elif MASK in benchmark_spec.benchmark:
data_dir = benchmark_spec.coco_data_dir
elif GNMT in benchmark_spec.benchmark:
data_dir = benchmark_spec.gnmt_data_dir
elif SSD in benchmark_spec.benchmark:
data_dir = benchmark_spec.coco_data_dir
elif BERT in benchmark_spec.benchmark:
data_dir = benchmark_spec.bert_data_dir
else:
raise ValueError(
    'Unknown benchmark {}: no data directory is configured for it.'.format(
        benchmark_spec.benchmark
    )
)
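# Rewrite the flag values baked into run_and_time.sh. The '/' characters are
# escaped because vm_util.ReplaceText applies the substitution with sed, and
# the escaped trailing backslashes keep the script's shell line continuations
# intact, e.g. '--model_dir=...' becomes '--model_dir=gs://<bucket> \'.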
run_script = posixpath.join(run_path, 'run_and_time.sh')
data_dir = data_dir.replace('/', r'\/')
checkpoint = FLAGS.mlperf_gcs_resnet_checkpoint.replace('/', r'\/')
decode_dir = FLAGS.mlperf_transformer_decode_dir.replace('/', r'\/')
tpu = benchmark_spec.tpu_groups['train'].GetName()
vm_util.ReplaceText(
vm,
'--model_dir=.*',
r'--model_dir=gs:\/\/{} \\\\'.format(bucket),
run_script,
)
vm_util.ReplaceText(
vm, '--data_dir=.*', r'--data_dir={} \\\\'.format(data_dir), run_script
)
vm_util.ReplaceText(
vm,
'--training_file_pattern=.*',
r'--training_file_pattern={}\/train-* \\\\'.format(data_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--validation_file_pattern=.*',
r'--validation_file_pattern={}\/val-* \\\\'.format(data_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--val_json_file=.*',
r'--val_json_file={}\/instances_val2017.json \\\\'.format(data_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--resnet_checkpoint=.*',
r'--resnet_checkpoint={} \\\\'.format(checkpoint),
run_script,
)
vm_util.ReplaceText(
vm,
'--decode_from_file=.*',
r'--decode_from_file={}\/wmt14-en-de.src \\\\'.format(decode_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--decode_reference=.*',
r'--decode_reference={}\/wmt14-en-de.ref \\\\'.format(decode_dir),
run_script,
)
vm_util.ReplaceText(
vm,
'--decode_to_file=.*',
r'--decode_to_file={}\/decode.transformer_mlperf_tpu.'
r'translate_ende_wmt32k_packed.2x2_log_1018_2 \\\\'.format(bucket),
run_script,
)
vm_util.ReplaceText(
vm, '--tpu=.*', r'--tpu={} \\\\'.format(tpu), run_script
)
vm_util.ReplaceText(
vm,
'--output_dir=.*',
r'--output_dir=gs:\/\/{} \\\\'.format(bucket),
run_script,
)
vm_util.ReplaceText(
vm,
'--cloud_tpu_name=.*',
r'--cloud_tpu_name={} \\\\'.format(tpu),
run_script,
)
vm_util.ReplaceText(
vm,
'--out_dir=.*',
r'--out_dir=gs:\/\/{} \\\\'.format(bucket),
run_script,
)
vm_util.ReplaceText(
vm, '--tpu_name=.*', r'--tpu_name={} \\\\'.format(tpu), run_script
)
vm.RemoteCommand('chmod 755 {}'.format(run_script))
if GNMT in benchmark_spec.benchmark:
metric_script = posixpath.join(code_path, model, 'metric.py')
vm_util.ReplaceText(
vm, ' sacrebleu -t', ' python3 -m sacrebleu -t', metric_script
)
else:
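# GPU path: benchmarks run inside NVIDIA's MLPerf Docker images. Set up
# Docker, link the scratch disk to /data, and build the image for the
# selected benchmark before staging its dataset.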
benchmark_spec.model_dir = '/tmp'
vm.Install('nvidia_docker')
docker.AddUser(vm)
vm.RemoteCommand('sudo usermod -aG docker $USER')
vm.RemoteCommand('if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')
if RESNET in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/resnet/implementations/mxnet'
' && docker build --network=host . -t'
' mlperf-nvidia:image_classification'
)
_DownloadData(
benchmark_spec.imagenet_data_dir,
posixpath.join('/data', 'imagenet'),
vm,
)
if TRANSFORMER in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/transformer/implementations/pytorch'
' && docker build --network=host . -t mlperf-nvidia:translation'
)
_DownloadData(
benchmark_spec.wmt_data_dir, posixpath.join('/data', 'wmt'), vm
)
if MINIGO in benchmark_spec.benchmark:
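# MiniGo has no dataset to stage; instead point get_data.py at the model
# directory supplied via FLAGS.minigo_model_dir.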
build_path = f'training_results_{VERSION.value}/NVIDIA/benchmarks/minigo/implementations/tensorflow'
run_script = posixpath.join(build_path, 'run_and_time.sh')
vm_util.ReplaceText(
vm,
'get_data.py',
'get_data.py --src_dir={}'.format(
FLAGS.minigo_model_dir.replace('/', r'\/')
),
run_script,
)
vm.RemoteCommand(
'cd {} && docker build --network=host -t '
'mlperf-nvidia:minigo .'.format(build_path)
)
if MASK in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/maskrcnn/implementations/pytorch'
' && docker build --network=host -t'
' mlperf-nvidia:object_detection . '
)
_DownloadData(
benchmark_spec.coco_data_dir, posixpath.join('/data', 'coco2017'), vm
)
if GNMT in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/gnmt/implementations/pytorch'
' && docker build --network=host -t mlperf-nvidia:rnn_translator . '
)
_DownloadData(
benchmark_spec.gnmt_data_dir, posixpath.join('/data', 'gnmt'), vm
)
if SSD in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/ssd/implementations/pytorch'
' && docker build --network=host -t'
' mlperf-nvidia:single_stage_detector . '
)
_DownloadData(
benchmark_spec.coco_data_dir, posixpath.join('/data', 'coco2017'), vm
)
if BERT in benchmark_spec.benchmark:
vm.RemoteCommand(
f'cd training_results_{VERSION.value}/NVIDIA/benchmarks/bert/implementations/pytorch'
' && docker build --network=host -t mlperf-nvidia:language_model . '
)
_DownloadData(
benchmark_spec.bert_data_dir, posixpath.join('/data', 'bert_data'), vm
)