def _UpdateScripts(benchmark_spec, node_rank)

in perfkitbenchmarker/linux_benchmarks/mlperf_multiworkers_benchmark.py


def _UpdateScripts(benchmark_spec, node_rank):
  """Update the running scripts on the target vm.

  Args:
    benchmark_spec: The benchmark specification.
    node_rank: int, the rank of the node for multi-node distributed training.
  """
  vm = benchmark_spec.vms[node_rank]
  benchmark = benchmark_spec.benchmark

  # TODO(tohaowu) Change config and script using a patch file.
  # Pairs of (pattern, replacement) requests for the sed command: each pair
  # ('str_A', 'str_B') asks sed to replace anything matching str_A with
  # str_B in a specific file.
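  # For example, the pair (r'DGXNNODES=.*', r'DGXNNODES=2') asks sed to
  # rewrite the node-count line of the config file for a two-VM run.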
  config_sed = []
  config_sed += [(r'DGXSYSTEM=.*', rf'DGXSYSTEM=\"{DGXSYSTEM}\"')]
  config_sed += [(r'DGXNNODES=.*', rf'DGXNNODES={benchmark_spec.num_vms}')]
  config_sed += [(
      r'DGXNGPU=.*',
      (
          rf'DGXNGPU={benchmark_spec.gpus_per_vm}\nexport'
          rf' CUDA_VISIBLE_DEVICES={",".join([str(i) for i in range(benchmark_spec.gpus_per_vm)])}'
      ),
  )]

  config_sed += [(
      r'DGXNSOCKET=.*',
      rf'DGXNSOCKET={vm.CheckLsCpu().socket_count}',
  )]
  config_sed += [(
      r'DGXSOCKETCORES=.*',
      rf'DGXSOCKETCORES={vm.CheckLsCpu().cores_per_socket}',
  )]

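  # Edits to run_and_time.sh: have it invoke run_training1.sh instead of
  # run_training.sh, and pin DGXSYSTEM to the system profile chosen above.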
  run_and_time_sed = []
  run_and_time_sed += [(r'run_training.sh', r'run_training1.sh')]
  run_and_time_sed += [(r'DGXSYSTEM=.*', rf'DGXSYSTEM=\"{DGXSYSTEM}\"')]

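  # Optionally enable verbose NCCL logging by injecting exports right after
  # the shebang; each rank then writes /results/<hostname>.<pid>.nccl.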
  if FLAGS.mlperf_keep_nccl_log:
    run_and_time_sed += [(
        r'#\!\/bin\/bash',
        (
            r'#\!\/bin\/bash\n'
            r'export NCCL_DEBUG=INFO\n'
            r'export NCCL_DEBUG_SUBSYS=ALL\n'
            r'export NCCL_DEBUG_FILE=\/results\/%h.%p.nccl'
        ),
    )]

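  # Splice any extra NCCL tuning exports (from --nccl_extra_params) in after
  # the shebang as well.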
  nccl_exports = _GetNcclParams() if FLAGS.nccl_extra_params else r''
  run_and_time_sed += [(
      r'#!\/bin\/bash',
      r'#!\/bin\/bash\n' rf'{nccl_exports}',
  )]

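  # Edits to run_with_docker.sh: silence syslog, substitute a static SLURM
  # node count for SLURM discovery, mount this working copy into the
  # container, swap scontrol/srun for mpirun, and call run_and_time1.sh.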
  run_sed = []
  run_sed += [(r'SYSLOGGING=1', r'SYSLOGGING=0')]
  run_sed += [(
      r'env [|] grep SLURM',
      rf'export SLURM_NNODES={benchmark_spec.num_vms}',
  )]
  run_sed += [(
      r'data -v \$LOGDIR',
      rf'data -v \$(pwd):\/workspace\/{benchmark}1 -v \$LOGDIR',
  )]
  run_sed += [(
      r'scontrol show hostname',
      rf'mpirun -hostfile \$HOME\/{HOSTFILE} -N 1 hostname -I '
      rf'\| awk \'{{print \$1}}\' ',
  )]
  run_sed += [(
      r'srun --mem=0 -N 1 -n 1 -w \$hostn',
      r'mpirun -N 1 -n 1 -H \$hostn',
  )]
  run_sed += [(r'sleep 30', r'sleep 60')]
  run_sed += [(r'docker exec -it', r'docker exec -t')]
  run_sed += [(r'run_and_time.sh', r'run_and_time1.sh')]

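  # EFA / InfiniBand NICs surface as /dev/infiniband devices and must be
  # passed through to the container explicitly.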
  if FLAGS.aws_efa or FLAGS.azure_infiniband:
    stdout, _ = vm.RemoteCommand('ls -d /dev/infiniband/*')
    devices = [device.replace('/', '\\/') for device in stdout.split()]
    device_args = ' '.join(f'--device={device}' for device in devices)
    run_sed += [(r'nvidia-docker run', rf'nvidia-docker run {device_args}')]

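  # Azure InfiniBand runs additionally mount the host's /opt/microsoft tree
  # into the container.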
  if FLAGS.azure_infiniband:
    run_sed.append((
        r'_cont_mounts=(',
        r'_cont_mounts=(\"--volume=\/opt\/microsoft:\/opt\/microsoft\" ',
    ))
    run_sed.append((
        r'^CONT_MOUNTS=\(.*\)$',
        r'CONT_MOUNTS=\"\1 --volume=\/opt\/microsoft:\/opt\/microsoft\"',
    ))

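  # nvprof flags handed to the per-benchmark helpers; profiles are written to
  # /results as <hostname>.<pid>.nvprof.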
  nvprof_flags = r'-f -o \/results\/%h.%p.nvprof --profile-child-processes'

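  # The NVIDIA reference implementations live under
  # $HOME/training_results_<version>/NVIDIA/benchmarks/<model>/implementations
  # /<framework>; maskrcnn and the mxnet-based resnet are naming outliers.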
  script_path = (
      r'$HOME/training_results_{version}/NVIDIA/benchmarks/{model}'
      r'/implementations/{framework}'.format(
          version=FLAGS.mlperf_training_version,
          model='maskrcnn' if mlperf_benchmark.MASK in benchmark else benchmark,
          framework='mxnet'
          if mlperf_benchmark.RESNET in benchmark
          else 'pytorch',
      )
  )

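  # Each benchmark contributes its own sed edits, and some also override
  # which base config files are merged into the final CONFIG.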
  config_files = [CONFIG]
  if mlperf_benchmark.TRANSFORMER in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForTransformer(
        benchmark_spec,
        vm,
        script_path,
        nvprof_flags,
        config_sed,
        run_sed,
        run_and_time_sed,
    )

  elif mlperf_benchmark.SSD in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForSSD(
        benchmark_spec, nvprof_flags, config_sed, run_sed, run_and_time_sed
    )

  elif mlperf_benchmark.GNMT in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForGNMT(
        benchmark_spec, nvprof_flags, config_sed, run_sed, run_and_time_sed
    )

  elif mlperf_benchmark.MASK in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForMask(
        benchmark_spec,
        node_rank,
        script_path,
        nvprof_flags,
        config_sed,
        run_sed,
        run_and_time_sed,
    )

    config_files = ['config_DGXA100_multi_8x8x2.sh']

  elif mlperf_benchmark.RESNET in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForResnet(
        benchmark_spec,
        node_rank,
        nvprof_flags,
        config_sed,
        run_sed,
        run_and_time_sed,
    )

    config_files = ['config_DGXA100_common.sh', 'config_DGXA100_multi_8x8x*.sh']
    mlperf_benchmark.UpdateScriptForSmallGpuMem(vm)

  elif mlperf_benchmark.BERT in benchmark:
    config_sed, run_sed, run_and_time_sed = _GetChangesForBert(
        benchmark_spec,
        node_rank,
        nvprof_flags,
        config_sed,
        run_sed,
        run_and_time_sed,
    )

    config_files = ['config_DGXA100_common.sh', 'config_DGXA100_8x8x48x1.sh']

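  # Materialize the patched scripts on the VM: the merged config, then
  # run_and_time1.sh and run_with_docker1.sh, each sourcing CONFIG at line 2.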
  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{mlperf_benchmark.SedPairsToString(config_sed)}" '
      f'{" ".join(config_files)} > {CONFIG} && '
      f'chmod 755 {CONFIG} '
  )
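  # Assuming SedPairsToString joins the pairs into 's/A/B/g' expressions (a
  # guess from its usage here), the command above expands to roughly:
  #   cd <script_path> && \
  #     sed "s/DGXSYSTEM=.*/DGXSYSTEM=\"...\"/g; ..." <config_files> \
  #       > <CONFIG> && chmod 755 <CONFIG>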

  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{mlperf_benchmark.SedPairsToString(run_and_time_sed)}" '
      f'run_and_time.sh | sed "2 i source {CONFIG}" > run_and_time1.sh && '
      'chmod 755 run_and_time1.sh '
  )

  vm.RemoteCommand(
      f'cd {script_path} && '
      f'sed "{mlperf_benchmark.SedPairsToString(run_sed)}" run_with_docker.sh '
      f'| sed "2 i source {CONFIG}" > run_with_docker1.sh && '
      'chmod 755 run_with_docker1.sh'
  )

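  # With --nccl_net_plugin, rewrite the Dockerfile so the image adds Google's
  # fast-socket apt repo and key, deletes the preinstalled HPC-X nccl-net
  # plugin, and installs google-fast-socket.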
  docker_file = posixpath.join(script_path, 'Dockerfile')
  if FLAGS.nccl_net_plugin:
    vm_util.ReplaceText(
        vm,
        'RUN apt-get update',
        r'RUN echo \"deb https:\/\/packages.cloud.google.com\/apt '
        r'google-fast-socket main\" | '
        r'tee \/etc\/apt\/sources.list.d\/google-fast-socket.list\n'
        r'RUN curl -s -L '
        r'https:\/\/packages.cloud.google.com\/apt\/doc\/apt-key.gpg | '
        r'apt-key add -\n'
        r'RUN rm -f \/opt\/hpcx\/nccl_rdma_sharp_plugin\/lib\/libnccl-net.so\n'
        r'RUN apt-get update',
        docker_file,
    )
    vm_util.ReplaceText(
        vm,
        'apt-get install -y --no-install-recommends',
        'apt-get install -y --no-install-recommends google-fast-socket',
        docker_file,
    )

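  # For EFA, append NVIDIA's EFA Dockerfile stanza from the AWS base-AMI
  # pipeline, point /opt/hpcx/ompi at Amazon's Open MPI build, and blank out
  # lines that conflict with the upstream base image.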
  if FLAGS.aws_efa:
    vm.RemoteCommand(f'git clone {AWS_EFA_NCCL_BASEAMI_PIPELINE_URL}')
    vm.RemoteCommand(
        'cd aws-efa-nccl-baseami-pipeline && git checkout'
        f' {NVIDIA_EFA_DOCKERFILE_COMMIT}'
    )
    vm.RemoteCommand(f'cat {NVIDIA_EFA_DOCKERFILE} >> {docker_file}')
    vm.RemoteCommand(
        'echo "RUN rm -rf /opt/hpcx/ompi && ln -s /opt/amazon/openmpi'
        f' /opt/hpcx/ompi" >> {docker_file}'
    )
    vm_util.ReplaceText(vm, 'FROM nvcr.*', '', docker_file)
    vm_util.ReplaceText(vm, 'yum-utils.*', '', docker_file)
    vm_util.ReplaceText(
        vm, 'python3-distutils.*', 'python3-distutils', docker_file
    )
    vm_util.ReplaceText(vm, 'cmake', '', docker_file)