def RunWithVMs()

in perfkitbenchmarker/linux_benchmarks/horovod_benchmark.py [0:0]


def RunWithVMs(vms, extra_envs=None):
  """Run Horovod on the cluster.

  Args:
    vms: The worker VMs; the first VM in the list acts as the MPI launcher.
    extra_envs: Optional dict of extra environment variables to export to
      every MPI rank, merged into the NCCL/Horovod settings.

  Returns:
    A list of sample.Sample objects.
  """
  background_tasks.RunThreaded(
      lambda vm: vm.RemoteCommand('rm -rf /tmp/models'), vms
  )
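  # The first VM acts as the launcher; the remaining remote commands below are
  # issued from it.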
  master_vm = vms[0]

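  # One MPI process is launched per GPU, so the slot count is the per-node GPU
  # count times the number of nodes.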
  gpus_per_node = nvidia_driver.QueryNumberOfGpus(master_vm)
  num_vms = len(vms)
  total_gpus = gpus_per_node * num_vms

  # GCP should work out of the box with the deep learning image, but the AWS
  # image requires activating the correct TensorFlow Python environment.
  if FLAGS.cloud == 'AWS':
    master_vm.RobustRemoteCommand('. anaconda3/bin/activate tensorflow_p37')
    python_interpreter = 'anaconda3/envs/tensorflow_p37/bin/python'
  else:
    python_interpreter = '/opt/conda/bin/python'

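  # Environment variables exported to every MPI rank via mpirun's -x flag.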
  nccl_params = {
      'TF_CPP_MIN_LOG_LEVEL': 0,
      'NCCL_SOCKET_IFNAME': '^lo,docker0',
      'NCCL_DEBUG': 'INFO',
  }

  if FLAGS.horovod_timeline:
    nccl_params['HOROVOD_TIMELINE_MARK_CYCLES'] = 1
    nccl_params['HOROVOD_TIMELINE'] = f'{vm_util.VM_TMP_DIR}/timeline.json'

  if FLAGS.nccl_cuda_visible_devices:
    nccl_params['CUDA_VISIBLE_DEVICES'] = FLAGS.nccl_cuda_visible_devices

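  # Any --nccl_extra_params KEY=VALUE pairs are added to the exported
  # environment.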
  if FLAGS.nccl_extra_params:
    for extra_param in FLAGS.nccl_extra_params:
      k, v = extra_param.split('=', 1)
      nccl_params[k] = v

  if extra_envs:
    nccl_params.update(extra_envs)

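  # Build the mpirun prefix: one process per GPU, hosts taken from the machine
  # file, and each nccl_params entry exported to the ranks with -x.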
  run_command = (
      '{mpi} -np {num_gpus} -hostfile {host_file} '
      '-mca plm_rsh_no_tree_spawn 1 '
      '--allow-run-as-root '
      '-bind-to socket -map-by slot '
      '{nccl_params} '
      '-mca pml ob1 -mca btl ^openib '
      '-mca btl_tcp_if_exclude lo,docker0 '
      '{python} '
  ).format(
      mpi=FLAGS.nccl_mpi,
      num_gpus=total_gpus,
      host_file=MACHINEFILE,
      python=python_interpreter,
      nccl_params=' '.join(
          [f'-x {key}={value}' for key, value in nccl_params.items()]
      ),
  )

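  # Append the model-specific training script and its flags to the mpirun
  # prefix.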
  if FLAGS.horovod_model == 'resnet-50':
    run_flags = {
        'arch': 'resnet50',
        'mode': 'training_benchmark',
        'warmup_steps': 101,
        'results_dir': '/tmp/models',
        'gpu_memory_fraction': 0.95,
        'static_loss_scale': 128,
        'lr_init': 0.016,
        'lr_warmup_epochs': 8,
        'momentum': 0.875,
        'weight_decay': 3.0517578125e-05,
        'iter_unit': 'batch',
    }
    run_flags.update({
        'batch_size': FLAGS.horovod_batch_size,
        'num_iter': FLAGS.horovod_num_steps,
    })
    if FLAGS.horovod_precision == 'fp16':
      run_flags['amp'] = None

    # Load ImageNet training data from GCS if the benchmark is not in
    # synthetic mode.
    if not FLAGS.horovod_synthetic:
      run_flags['data_dir'] = 'gs://cloud-ml-nas-public/classification/imagenet'

    run_command += (
        'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py '
    )
    run_command += ' '.join([
        '--{}'.format(key) if value is None else '--{}={}'.format(key, value)
        for key, value in sorted(run_flags.items())
    ])
  elif FLAGS.horovod_model == 'resnext-101':
    run_flags = {
        'arch': 'resnext101-32x4d',
        'mode': 'training_benchmark',
        'warmup_steps': 101,
        'results_dir': '/tmp/models',
        'gpu_memory_fraction': 0.95,
        'use_static_loss_scaling': None,
        'loss_scale': 128,
        'lr_init': 0.016,
        'lr_warmup_epochs': 8,
        'momentum': 0.875,
        'weight_decay': 3.0517578125e-05,
        'weight_init': 'fan_in',
        'iter_unit': 'batch',
    }
    run_flags.update({
        'precision': FLAGS.horovod_precision,
        'batch_size': FLAGS.horovod_batch_size,
        'num_iter': FLAGS.horovod_num_steps,
    })

    # Load ImageNet training data from GCS if the benchmark is not in
    # synthetic mode.
    if not FLAGS.horovod_synthetic:
      run_flags['data_dir'] = 'gs://cloud-ml-nas-public/classification/imagenet'

    run_command += (
        'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py '
    )
    run_command += ' '.join([
        '--{}'.format(key) if value is None else '--{}={}'.format(key, value)
        for key, value in sorted(run_flags.items())
    ])
  elif FLAGS.horovod_model.startswith('bert'):  # bert
    if not FLAGS.horovod_bert_finetune:
      raise NotImplementedError('BERT pretraining is not supported.')
    bert_dir = 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/google_pretrained_weights/{}'.format(
        'uncased_L-12_H-768_A-12'
        if FLAGS.horovod_model == 'bert-base'
        else 'uncased_L-24_H-1024_A-16'
    )
    squad_train_file = 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/squad/v1.1/train-v1.1.json'
    run_flags = {
        'vocab_file': '{}/vocab.txt'.format(bert_dir),
        'bert_config_file': '{}/bert_config.json'.format(bert_dir),
        'init_checkpoint': '{}/bert_model.ckpt'.format(bert_dir),
        'do_train': None,
        'train_file': squad_train_file,
        'learning_rate': 5e-6,
        'output_dir': '/tmp/models',
        'horovod': None,
        'dllog_path': '/tmp/bert_dllog.json',
        'save_checkpoints_steps': 0,
    }
    run_flags.update({
        'precision': FLAGS.horovod_precision,
        'train_batch_size': FLAGS.horovod_batch_size,
        'num_train_epochs': FLAGS.horovod_num_steps,
        'max_seq_length': FLAGS.horovod_max_seq_len,
        'doc_stride': 64 if FLAGS.horovod_max_seq_len == 128 else 128,
        'amp': FLAGS.horovod_precision == 'fp16',
    })
    run_command += (
        'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/run_squad.py '
    )
    run_command += ' '.join([
        '--{}'.format(key) if value is None else '--{}={}'.format(key, value)
        for key, value in sorted(run_flags.items())
    ])
  else:
    run_command += (
        'tensorpack/examples/FasterRCNN/train.py --config '
        'BACKBONE.WEIGHTS=ImageNet-R50-AlignPadding.npz '
        'DATA.BASEDIR=coco '
        'TRAINER=horovod '
        'TRAIN.EVAL_PERIOD=0 '
        # LR_SCHEDULE is specified in steps equivalent to a total batch size
        # of 8, so scale it by the number of GPUs.
        'TRAIN.LR_SCHEDULE="[{step}, {step}, {step}]" '
        '--logdir {log_dir}/maskrcnn '
    ).format(
        log_dir=vm_util.VM_TMP_DIR,
        step=FLAGS.horovod_num_steps * total_gpus // 8,
    )
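  # Launch the distributed training job from the master VM; mpirun fans the
  # processes out over the hosts listed in the machine file.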
  stdout, stderr = master_vm.RobustRemoteCommand(run_command)

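  # Copy the Horovod timeline back to the runner's temp directory.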
  if FLAGS.horovod_timeline:
    master_vm.PullFile(
        vm_util.GetTempDir(), '{}/timeline.json'.format(vm_util.VM_TMP_DIR)
    )
  return _MakeSamplesFromOutput(vms, stdout, stderr)
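

Usage sketch: a minimal, assumed caller following PerfKitBenchmarker's usual
Run(benchmark_spec) entry point; the actual wrapper in horovod_benchmark.py may
differ (for example, by passing extra environment variables).

def Run(benchmark_spec):
  """Runs Horovod on the benchmark's VMs and returns the resulting samples."""
  # benchmark_spec.vms is assumed to hold the provisioned worker VMs, with the
  # launcher first.
  return RunWithVMs(benchmark_spec.vms)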