perfkitbenchmarker/linux_packages/slurm.py
"""Module containing slurm installation and cleanup function."""
import os
import re
from perfkitbenchmarker import data
from perfkitbenchmarker import linux_packages
from perfkitbenchmarker import vm_util
SLURM_CONF_DIR = '/etc/slurm'
def AptInstall(vm):
"""Installs slurm."""
# Tested on the AWS Amazon Linux DLAMI.
# The following commands replicate a similar environment to the GCP DLVM image.
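# The slurm-gcp ansible playbook pulled below installs Slurm itself along with
# its dependencies (including munge). The retry wrapper guards against
# transient network failures during the pip, ansible-galaxy, and ansible-pull
# downloads.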
@vm_util.Retry(
poll_interval=30,
timeout=-1,
max_retries=5,
)
def _InstallAnsible(vm):
vm.RemoteCommand('sudo pip3 install ansible')
vm.RemoteCommand(
'ansible-galaxy role install '
'googlecloudplatform.google_cloud_ops_agents')
vm.RemoteCommand(
'ansible-pull -U https://github.com/GoogleCloudPlatform/slurm-gcp '
'-C 5.9.1 -i localhost, --limit localhost --connection=local '
'''--extra-vars "{'slurm_version':'22.05.9','reboot':false,'install_cuda':false,'install_ompi':false,'install_lustre':false,'install_gcsfuse':false}" '''
'ansible/playbook.yml')
_InstallAnsible(vm)
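# Install NVIDIA enroot, the unprivileged container runtime that pyxis drives.
# The dpkg architecture query selects the matching .deb (e.g. amd64 or arm64).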
vm.RemoteCommand(
'arch=$(dpkg --print-architecture); '
'curl -fSsL -O https://github.com/NVIDIA/enroot/releases/download/'
'v3.4.1/enroot_3.4.1-1_${arch}.deb; '
'curl -fSsL -O https://github.com/NVIDIA/enroot/releases/download/'
'v3.4.1/enroot+caps_3.4.1-1_${arch}.deb; '
'sudo apt-get update ; sudo apt-get install --assume-yes ./*.deb')
vm.RemoteCommand('sudo mkdir /run/enroot; sudo chmod 755 /run/enroot')
enroot_path = os.path.join(vm.GetScratchDir(), '${UID}/enroot/')
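# Redirect enroot's runtime, cache, and data directories to the VM's scratch
# disk (one subtree per user via ${UID}) instead of the defaults, which
# typically live on the smaller boot disk.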
vm.RemoteCommand(
f'echo "ENROOT_RUNTIME_PATH {enroot_path}/runtime" | '
'sudo tee -a /etc/enroot/enroot.conf'
)
vm.RemoteCommand(
f'echo "ENROOT_CACHE_PATH {enroot_path}/cache" | '
'sudo tee -a /etc/enroot/enroot.conf'
)
vm.RemoteCommand(
f'echo "ENROOT_DATA_PATH {enroot_path}/data" | '
'sudo tee -a /etc/enroot/enroot.conf'
)
# Install pyxis, the SPANK plugin that lets srun launch jobs inside enroot
# containers (enabled via plugstack.conf in ConfigureSlurm below).
vm.RemoteCommand(
f'cd {linux_packages.INSTALL_DIR};'
'git clone --depth 1 https://github.com/NVIDIA/pyxis.git && '
'cd pyxis && sudo make install'
)
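# Illustrative usage once ConfigureSlurm has registered the pyxis plugin
# (the image name is an example only):
#   srun --container-image=ubuntu:22.04 cat /etc/os-release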
def ConfigureSlurm(vms):
"""Configures slurm cluster."""
cfg_path = data.ResourcePath('slurm.conf.j2')
controller = vms[0]  # Use the first VM as the controller host.
workers = vms  # slurmd is configured on every VM, including the controller.
tmp_slurm_cfg = os.path.join(linux_packages.INSTALL_DIR, 'slurm.conf')
slurm_cfg = os.path.join(SLURM_CONF_DIR, 'slurm.conf')
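# Configure every VM: register the pyxis plugin, create a working directory
# for slurmd, render slurm.conf from the template, and (re)start slurmd.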
for vm in workers:
lscpu = vm.CheckLsCpu()
vm.RemoteCommand(
'echo "required /usr/local/lib/slurm/spank_pyxis.so '
'container_scope=global" | '
'sudo tee /etc/slurm/plugstack.conf'
)
vm.RemoteCommand(
f'mkdir -p {linux_packages.INSTALL_DIR}/slurmd', ignore_failure=True
)
vm.RenderTemplate(
cfg_path,
tmp_slurm_cfg,
{
'controller': controller.hostname,
'workers': ','.join(worker.hostname for worker in vms[1:]),
'cpus': vm.num_cpus,
'user': vm.user_name,
'sockets': lscpu.socket_count,
'cores_per_socket': lscpu.cores_per_socket,
'threads_per_core': lscpu.threads_per_core,
},
)
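# Stage cgroup.conf next to slurm.conf; Slurm uses it to configure
# cgroup-based resource confinement of jobs.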
cgroup_path = data.ResourcePath('cgroup.conf')
vm.PushFile(cgroup_path, linux_packages.INSTALL_DIR)
tmp_cgroup_cfg = os.path.join(linux_packages.INSTALL_DIR, 'cgroup.conf')
vm.RemoteCommand(f'sudo mkdir {SLURM_CONF_DIR}', ignore_failure=True)
vm.RemoteCommand(f'sudo cp {tmp_slurm_cfg} {SLURM_CONF_DIR}')
# Do not overwrite the cgroup.conf file if it already exists.
vm.RemoteCommand(f'sudo cp -n {tmp_cgroup_cfg} {SLURM_CONF_DIR}')
vm.RemoteCommand('sudo systemctl stop slurmd.service', ignore_failure=True)
vm.RemoteCommand(f'sudo chmod 755 {slurm_cfg}')
vm.RemoteCommand(f'sudo slurmd -f {slurm_cfg}')
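# Controller-specific setup: slurmctld runs only on the controller VM. The
# chmod/chown presumably gives slurmctld write access to its state files under
# the install directory (as pointed to by the rendered slurm.conf).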
controller.RemoteCommand(f'sudo chmod 755 {linux_packages.INSTALL_DIR}')
controller.RemoteCommand(
f'sudo chown {controller.user_name} {linux_packages.INSTALL_DIR}')
controller.RemoteCommand('sudo systemctl stop slurmctld.service',
ignore_failure=True)
controller.RemoteCommand(f'sudo slurmctld -f {slurm_cfg}')
# Set up munge, Slurm's authentication service; every node in the cluster must
# share the same munge key.
controller.RemoteCommand(
f'sudo dd if=/dev/urandom of={linux_packages.INSTALL_DIR}/munge.key '
'bs=1 count=1024')
src_munge_key = f'{linux_packages.INSTALL_DIR}/munge.key'
# Distribute key from controller to the rest
for vm in vms[1:]:
controller.MoveFile(vm, src_munge_key, linux_packages.INSTALL_DIR)
# Set up permissions and (re)start munged on every VM.
for vm in vms:
vm.RemoteCommand(
f'sudo cp {src_munge_key} /etc/munge')
vm.RemoteCommand('sudo chmod 400 /etc/munge/munge.key')
vm.RemoteCommand('sudo chown munge:munge /etc/munge/munge.key')
vm.RemoteCommand('sudo pkill munged', ignore_failure=True)
vm.RemoteCommand('sudo mkdir /run/munge', ignore_failure=True)
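# --force lets munged start even when it would otherwise refuse, e.g. if a
# stale socket from a previous run is still present.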
vm.RemoteCommand('sudo munged --force')
def Running(vm):
"""Check if any slurm job is running."""
output, _ = vm.RemoteCommand('sinfo')
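# Default sinfo output looks like:
#   PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
#   debug*    up    infinite  2     idle  pkb-46b6c6e7-[0-1]
# so column index 4 is the node STATE; 'alloc' and 'mix' indicate nodes that
# currently have jobs allocated on them.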
for line in output.splitlines():
if not line:
continue
status = line.split()[4]
if status in ('alloc', 'mix'):
return True
return False
# TODO(yuyanting): Figure out how to set slurm node priority, so the output
# always lands on node0.
def GetController(vms):
"""Return the controller vm.
e.g.
ip-10-0-0-[28,175], return vm with hostname ip-10-0-0-28
pkb-46b6c6e7-[0-1], return vm with hostname pkb-46b6c6e7-0
Args:
vms: List of virtual machine objects.
Returns:
VirtualMachine object representing the controller vm.
Raises:
RuntimeError: If the controller vm cannot be found.
"""
output, _ = vms[0].RemoteCommand('sinfo')
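# The last whitespace-separated token of the sinfo output is the NODELIST
# (e.g. pkb-46b6c6e7-[0-1]). The parsing below assumes the bracketed range
# form and treats the host with the first index as the controller.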
node_list = output.strip().split()[-1]
prefix = node_list.split('[')[0]
suffix = re.split(',|-', node_list.split('[')[1])[0]
for vm in vms:
if vm.hostname == prefix + suffix:
return vm
raise RuntimeError(f'Not able to find controller vm in {output}')