in perfkitbenchmarker/linux_benchmarks/horovod_benchmark.py
def RunWithVMs(vms, extra_envs=None):
"""Run Horovod on the cluster.
Args:
vms: A list of worker VMs.
extra_envs: A dictionary of environment variables.
Returns:
A list of sample.Sample objects.
"""
background_tasks.RunThreaded(
lambda vm: vm.RemoteCommand('rm -rf /tmp/models'), vms
)
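  # Horovod launches one training process per GPU, so the MPI world size is
  # gpus_per_node * num_vms.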
master_vm = vms[0]
gpus_per_node = nvidia_driver.QueryNumberOfGpus(master_vm)
num_vms = len(vms)
total_gpus = gpus_per_node * num_vms
  # GCP works out of the box with the deep learning image, but the AWS image
  # requires activating the correct TensorFlow Python environment.
if FLAGS.cloud == 'AWS':
master_vm.RobustRemoteCommand('. anaconda3/bin/activate tensorflow_p37')
python_interpreter = 'anaconda3/envs/tensorflow_p37/bin/python'
else:
python_interpreter = '/opt/conda/bin/python'
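  # Environment variables forwarded to every rank via mpirun's -x flag. The
  # '^' prefix tells NCCL to exclude lo and docker0, interfaces that cannot
  # reach the other workers.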
nccl_params = {
'TF_CPP_MIN_LOG_LEVEL': 0,
'NCCL_SOCKET_IFNAME': '^lo,docker0',
'NCCL_DEBUG': 'INFO',
}
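  # Optionally record a Horovod timeline trace; the file is pulled back from
  # the master after the run.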
if FLAGS.horovod_timeline:
nccl_params['HOROVOD_TIMELINE_MARK_CYCLES'] = 1
nccl_params['HOROVOD_TIMELINE'] = f'{vm_util.VM_TMP_DIR}/timeline.json'
if FLAGS.nccl_cuda_visible_devices:
nccl_params['CUDA_VISIBLE_DEVICES'] = FLAGS.nccl_cuda_visible_devices
if FLAGS.nccl_extra_params:
for extra_param in FLAGS.nccl_extra_params:
k, v = extra_param.split('=', 1)
nccl_params[k] = v
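  # Caller-supplied variables take precedence over the defaults above.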
if extra_envs:
nccl_params.update(extra_envs)
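  # Build the mpirun prefix: -np sets the world size, -x forwards each
  # environment variable to every rank, and '-mca pml ob1 -mca btl ^openib'
  # forces TCP transport instead of InfiniBand verbs.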
run_command = (
'{mpi} -np {num_gpus} -hostfile {host_file} '
'-mca plm_rsh_no_tree_spawn 1 '
'--allow-run-as-root '
'-bind-to socket -map-by slot '
'{nccl_params} '
'-mca pml ob1 -mca btl ^openib '
'-mca btl_tcp_if_exclude lo,docker0 '
'{python} '
).format(
mpi=FLAGS.nccl_mpi,
num_gpus=total_gpus,
host_file=MACHINEFILE,
python=python_interpreter,
nccl_params=' '.join(
[f'-x {key}={value}' for key, value in nccl_params.items()]
),
)
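  # With e.g. two 8-GPU workers, the assembled prefix looks roughly like
  # (illustrative values, not captured output):
  #   mpirun -np 16 -hostfile HOSTFILE -mca plm_rsh_no_tree_spawn 1 \
  #     --allow-run-as-root -bind-to socket -map-by slot \
  #     -x NCCL_DEBUG=INFO ... -mca pml ob1 -mca btl ^openib \
  #     -mca btl_tcp_if_exclude lo,docker0 /opt/conda/bin/python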
if FLAGS.horovod_model == 'resnet-50':
run_flags = {
'arch': 'resnet50',
'mode': 'training_benchmark',
'warmup_steps': 101,
'results_dir': '/tmp/models',
'gpu_memory_fraction': 0.95,
'static_loss_scale': 128,
'lr_init': 0.016,
'lr_warmup_epochs': 8,
'momentum': 0.875,
'weight_decay': 3.0517578125e-05,
'iter_unit': 'batch',
}
run_flags.update({
'batch_size': FLAGS.horovod_batch_size,
'num_iter': FLAGS.horovod_num_steps,
})
if FLAGS.horovod_precision == 'fp16':
run_flags['amp'] = None
    # Load ImageNet training data from GCS if the benchmark is not in
    # synthetic mode.
if not FLAGS.horovod_synthetic:
run_flags['data_dir'] = 'gs://cloud-ml-nas-public/classification/imagenet'
run_command += (
'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py '
)
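    # Render the flags: a value of None becomes a bare switch (e.g. '--amp'),
    # everything else becomes '--key=value'.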
run_command += ' '.join([
'--{}'.format(key) if value is None else '--{}={}'.format(key, value)
for key, value in sorted(run_flags.items())
])
elif FLAGS.horovod_model == 'resnext-101':
run_flags = {
'arch': 'resnext101-32x4d',
'mode': 'training_benchmark',
'warmup_steps': 101,
'results_dir': '/tmp/models',
'gpu_memory_fraction': 0.95,
'use_static_loss_scaling': None,
'loss_scale': 128,
'lr_init': 0.016,
'lr_warmup_epochs': 8,
'momentum': 0.875,
'weight_decay': 3.0517578125e-05,
'weight_init': 'fan_in',
'iter_unit': 'batch',
}
run_flags.update({
'precision': FLAGS.horovod_precision,
'batch_size': FLAGS.horovod_batch_size,
'num_iter': FLAGS.horovod_num_steps,
})
    # Load ImageNet training data from GCS if the benchmark is not in
    # synthetic mode.
if not FLAGS.horovod_synthetic:
run_flags['data_dir'] = 'gs://cloud-ml-nas-public/classification/imagenet'
run_command += (
'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py '
)
run_command += ' '.join([
'--{}'.format(key) if value is None else '--{}={}'.format(key, value)
for key, value in sorted(run_flags.items())
])
  elif FLAGS.horovod_model.startswith('bert'):  # bert-base or bert-large
if not FLAGS.horovod_bert_finetune:
raise NotImplementedError('BERT pretraining is not supported.')
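    # Pick the Google pretrained checkpoint matching the requested model
    # size: bert-base uses the 12-layer weights, bert-large the 24-layer.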
bert_dir = 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/google_pretrained_weights/{}'.format(
'uncased_L-12_H-768_A-12'
if FLAGS.horovod_model == 'bert-base'
else 'uncased_L-24_H-1024_A-16'
)
squad_train_file = 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/squad/v1.1/train-v1.1.json'
run_flags = {
'vocab_file': '{}/vocab.txt'.format(bert_dir),
'bert_config_file': '{}/bert_config.json'.format(bert_dir),
'init_checkpoint': '{}/bert_model.ckpt'.format(bert_dir),
'do_train': None,
'train_file': squad_train_file,
'learning_rate': 5e-6,
'output_dir': '/tmp/models',
'horovod': None,
'dllog_path': '/tmp/bert_dllog.json',
'save_checkpoints_steps': 0,
}
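    # doc_stride is the sliding-window stride used to split long documents;
    # keep it below max_seq_length so consecutive windows overlap.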
run_flags.update({
'precision': FLAGS.horovod_precision,
'train_batch_size': FLAGS.horovod_batch_size,
'num_train_epochs': FLAGS.horovod_num_steps,
'max_seq_length': FLAGS.horovod_max_seq_len,
'doc_stride': 64 if FLAGS.horovod_max_seq_len == 128 else 128,
'amp': FLAGS.horovod_precision == 'fp16',
})
run_command += (
'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/run_squad.py '
)
run_command += ' '.join([
'--{}'.format(key) if value is None else '--{}={}'.format(key, value)
for key, value in sorted(run_flags.items())
])
else:
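    # Default model: tensorpack Mask R-CNN on COCO, initialized from a
    # ResNet-50 ImageNet checkpoint.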
run_command += (
'tensorpack/examples/FasterRCNN/train.py --config '
'BACKBONE.WEIGHTS=ImageNet-R50-AlignPadding.npz '
'DATA.BASEDIR=coco '
'TRAINER=horovod '
'TRAIN.EVAL_PERIOD=0 '
        # LR_SCHEDULE is expressed in equivalent steps at a total batch size
        # of 8, so scale the requested step count by total_gpus / 8.
'TRAIN.LR_SCHEDULE="[{step}, {step}, {step}]" '
'--logdir {log_dir}/maskrcnn '
).format(
log_dir=vm_util.VM_TMP_DIR,
step=FLAGS.horovod_num_steps * total_gpus // 8,
)
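  # Launch the distributed run from the master; mpirun fans the work out to
  # the workers listed in the hostfile.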
stdout, stderr = master_vm.RobustRemoteCommand(run_command)
if FLAGS.horovod_timeline:
master_vm.PullFile(
vm_util.GetTempDir(), '{}/timeline.json'.format(vm_util.VM_TMP_DIR)
)
return _MakeSamplesFromOutput(vms, stdout, stderr)