community/pbs/scripts/startup-script.py (260 lines of code) (raw):
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Configure PBS Pro Open Source Edition cluster on GCP (https://www.pbspro.org/)
# Create deployment: "gcloud deployment-manager deployments --project=<project-name> create <deployment-name> --config pbs-cluster.yaml"
# Clean-up (delete) deployment: "gcloud deployment-manager deployments delete --project=<project-name> <deployment-name>"
#
import httplib
import os
import shlex
import subprocess
import time
import urllib
import urllib2
import socket
# Deployment parameters. Tokens of the form @NAME@ are template placeholders;
# presumably the deployment tooling substitutes them before this script is
# executed -- TODO confirm against the deployment templates.
CLUSTER_NAME = '@CLUSTER_NAME@'
MACHINE_TYPE = '@MACHINE_TYPE@' # e.g. n1-standard-1, n1-standard-2
INSTANCE_TYPE = '@INSTANCE_TYPE@' # e.g. controller or compute
PROJECT = '@PROJECT@'
ZONE = '@ZONE@'
# Root of the application tree; exported over NFS by the controller and
# mounted from the controller on the other instances.
APPS_DIR = '/apps'
PBS_VERSION = '@PBS_VERSION@'
# Number of compute nodes registered with the PBS server by the controller.
STATIC_NODE_COUNT = @STATIC_NODE_COUNT@
# NOTE(review): only referenced from a commented-out condition in main().
COMPUTE_PUBLIC_IPS = @COMPUTE_PUBLIC_IPS@
# Version-specific PBS prefix under APPS_DIR.
PBS_PREFIX = APPS_DIR + '/pbs/pbs-' + PBS_VERSION
# Prepended to "controller" / "compute<N>" to form instance host names.
INSTANCE_NAME_PREFIX = '@INSTANCE_NAME_PREFIX@'
# PBS command cheat sheet written to /etc/motd by start_motd() and end_motd().
MOTD_HEADER = '''
********* PBS PRO *********
qsub submit a pbs job
qdel delete pbs batch job
qhold hold pbs batch jobs
qrls release hold on pbs batch jobs
qstat -q list all queues
qstat -a list all jobs
qstat -u userid list jobs for userid
qstat -r list running jobs
qstat -f job_id list full information about job_id
qstat -Qf queue list full information about queue
qstat -B list summary status of the job server
pbsnodes -a list status of all compute nodes
tracejob Extracts job info from log files
sudo /etc/init.d/pbs restart Restart PBS
export PATH=$PATH:/opt/pbs/bin/
'''
# NOTE(review): not referenced by any function visible in this script.
NodeName=""
def add_pbs_user():
    """Create the ``pbs`` group and the ``pbs`` user account.

    The group id and the user id are both fixed at 992. The account gets
    /var/lib/pbs as its home directory (created via ``-m``) and /bin/bash
    as its login shell. Exit codes of ``groupadd``/``useradd`` are ignored.
    """
    pbs_id = '992'
    group_cmd = ['groupadd', '-g', pbs_id, 'pbs']
    user_cmd = [
        'useradd', '-m',
        '-c', 'PBS Workload Manager',
        '-d', '/var/lib/pbs',
        '-u', pbs_id,
        '-g', 'pbs',
        '-s', '/bin/bash',
        'pbs',
    ]
    # The group has to exist before the user that references it.
    for cmd in (group_cmd, user_cmd):
        subprocess.call(cmd)
# END add_pbs_user()
def start_motd():
    """Write the "installation in progress" banner to /etc/motd.

    The banner is MOTD_HEADER followed by a notice that PBS is still being
    installed. On every instance type other than "controller" a warning
    about /home being mounted over is appended as well.
    """
    in_progress_notice = """
*** PBS is currently being installed/configured in the background. ***
A terminal broadcast will announce when installation and configuration is
complete.
You can check startup log messages in /var/log/messages
To run startup script again: sudo google_metadata_script_runner --script-type startup
More info: https://cloud.google.com/compute/docs/startupscript#rerunthescript
"""
    home_warning = """/home on the controller will be mounted over the existing /home.
Any changes in /home will be hidden. Please wait until the installation is
complete before making changes in your home directory.
"""
    sections = [MOTD_HEADER, in_progress_notice]
    if INSTANCE_TYPE != "controller":
        sections.append(home_warning)
    with open('/etc/motd', 'w') as motd:
        motd.write("".join(sections))
# END start_motd()
def end_motd():
    """Reset /etc/motd to the plain header and announce completion.

    /etc/motd is overwritten with MOTD_HEADER only, then ``wall -n`` is used
    to broadcast that installation finished. Instances other than the
    controller get a second broadcast about the re-mounted /home.
    """
    with open('/etc/motd', 'w') as motd:
        motd.write(MOTD_HEADER)
    broadcasts = [
        '*** PBS ' + INSTANCE_TYPE + ' daemon installation complete ***',
    ]
    if INSTANCE_TYPE != "controller":
        broadcasts.append("""
/home on the controller was mounted over the existing /home.
Either log out and log back in or cd into ~.
""")
    for text in broadcasts:
        subprocess.call(['wall', '-n', text])
#END end_motd()
def have_internet():
    """Report whether an HTTP request can be sent to www.google.com.

    Opens a connection with a 1 second timeout and issues ``HEAD /``. The
    response is not read; successfully sending the request is treated as
    proof of connectivity.

    Returns:
        True if the request was sent, False if it failed with a network or
        HTTP-level error (DNS failure, refusal, timeout, ...).
    """
    conn = httplib.HTTPConnection("www.google.com", timeout=1)
    try:
        conn.request("HEAD", "/")
        return True
    except (httplib.HTTPException, socket.error):
        # socket.error also covers socket.timeout and socket.gaierror.
        # Deliberately narrower than a bare ``except`` so Ctrl-C and
        # SystemExit are no longer swallowed by the polling loop.
        return False
    finally:
        conn.close()
#END have_internet()
def install_packages():
    """Install the required OS packages with yum, retrying until it succeeds.

    ``yum install -y`` is invoked with the full package list; whenever it
    exits with a non-zero status a message is printed and the call is
    repeated after 5 seconds. There is no upper bound on the retries.
    """
    # Whitespace-separated so the list is easy to scan and edit; the order
    # is the order in which the names are passed to yum.
    packages = """
        munge munge-devel munge-libs wget gcc make rpm-build libtool
        hwloc-devel libX11-devel libXt-devel libedit-devel libical-devel
        ncurses-devel perl postgresql-devel python-devel tcl-devel tk-devel
        swig expat-devel openssl-devel libXext libXft expat libedit
        postgresql-server python sendmail sudo tcl tk libical unzip
        nfs-utils nfs-utils-lib
    """.split()
    yum_cmd = ['yum', 'install', '-y'] + packages
    while True:
        if subprocess.call(yum_cmd) == 0:
            break
        print("yum failed to install packages. Trying again in 5 seconds")
        time.sleep(5)
#END install_packages()
def setup_nfs_exports():
    """Overwrite /etc/exports with /home and APPS_DIR, then re-export.

    Both directories are exported to every host (``*``) read-write with
    ``no_root_squash``. ``exportfs -a`` is run afterwards; its exit code is
    ignored.
    """
    exports_text = """
/home *(rw,sync,no_subtree_check,no_root_squash)
%s *(rw,sync,no_subtree_check,no_root_squash)
""" % APPS_DIR
    with open('/etc/exports', 'w') as exports:
        exports.write(exports_text)
    subprocess.call(['exportfs', '-a'])
#END setup_nfs_exports()
def expand_machine_type():
    """Look up MACHINE_TYPE via the Compute API and describe its topology.

    Returns:
        dict with the keys ``sockets``, ``cores``, ``threads`` and
        ``memory``. Every value defaults to 1; ``cores``, ``threads`` and
        ``memory`` are replaced when the API lookup succeeds. If the lookup
        raises, the error is printed and whatever was gathered so far is
        returned.
    """
    # Force re-evaluation of site-packages so that namespace packages (such
    # as google-auth) are importable. The packages are installed while this
    # script is already running, so the interpreter never got the chance to
    # run its usual startup sequence that sets up the import machinery.
    import sys
    import site
    # Materialise the list first: addsitedir() mutates sys.path.
    site_dirs = [entry for entry in sys.path if 'site-packages' in entry]
    for site_dir in site_dirs:
        site.addsitedir(site_dir)
    import googleapiclient.discovery

    # Assume sockets is 1. Currently, no instances with multiple sockets
    # Assume hyper-threading is on and 2 threads per core
    machine = {'sockets': 1, 'cores': 1, 'threads': 1, 'memory': 1}
    try:
        compute = googleapiclient.discovery.build(
            'compute', 'v1', cache_discovery=False)
        request = compute.machineTypes().get(
            project=PROJECT, zone=ZONE, machineType=MACHINE_TYPE)
        type_resp = request.execute()
        if type_resp:
            guest_cpus = type_resp['guestCpus']
            if guest_cpus > 1:
                machine['cores'] = guest_cpus // 2
                machine['threads'] = 2
            # The memory actually available on the host differs from what
            # is configured (e.g. the kernel takes some). From experiments,
            # about 16 MB per GB are used, plus about 400 MB buffer for the
            # first couple of GB's. Using 30 MB per GB to be safe.
            memory_mb = type_resp['memoryMb']
            whole_gb = memory_mb // 1024
            machine['memory'] = memory_mb - (400 + (whole_gb * 30))
    except Exception as e:
        print("Failed to get MachineType '%s' from google api (%s)" % (MACHINE_TYPE, str(e)))
    return machine
#END expand_machine_type()
def install_pbs():
    """Download the PBS Pro release archive and install the server RPM.

    The CentOS 7 zip for PBS_VERSION is fetched from the PBSPro GitHub
    releases into /tmp/pbs.zip, unpacked in /tmp, and the
    ``pbspro-server`` RPM it contains is installed with yum.

    The external commands run with an explicit ``cwd`` so the working
    directory of this process is left untouched. Non-zero exit codes from
    ``unzip`` and ``yum`` are reported on stdout but do not abort the
    script.
    """
    # Ex. Source http://wpc.23a7.iotacdn.net/8023A7/origin2/rl/PBS-Open/pbspro_18.1.2.centos7.zip
    # Ex2: https://s3.amazonaws.com/pbspro/pbspro-server-18.1.1-0.x86_64.rpm
    # Ex3: https://github.com/PBSPro/pbspro/releases/download/v18.1.3/pbspro_18.1.3.centos7.zip
    base_url = 'https://github.com/PBSPro/pbspro/releases/download/'
    archive = 'v' + PBS_VERSION + '/pbspro_' + PBS_VERSION + '.centos7.zip'
    print("Will download %s to /tmp/ directory" % archive)
    urllib.urlretrieve(base_url + archive, '/tmp/pbs.zip')

    unzip_rc = subprocess.call(['unzip', '-o', 'pbs.zip'], cwd='/tmp')
    if unzip_rc != 0:
        print("unzip of /tmp/pbs.zip exited with status %d" % unzip_rc)

    rpm_dir = '/tmp/pbspro_' + PBS_VERSION + '.centos7'
    rpm_name = 'pbspro-server-' + PBS_VERSION + '-0.x86_64.rpm'
    yum_rc = subprocess.call(['yum', 'install', '-y', rpm_name], cwd=rpm_dir)
    if yum_rc != 0:
        print("yum install of %s exited with status %d" % (rpm_name, yum_rc))
#END install_pbs()
def install_pbs_tmpfile():
    """Register /var/run/pbs with tmpfiles.d and make sure it exists now.

    Writes /etc/tmpfiles.d/pbs.conf describing the directory (mode 0755,
    owner and group ``pbs``), creates the directory if it is missing, and
    then applies the same mode and ownership to it.
    """
    run_dir = '/var/run/pbs'
    tmpfiles_entry = """
d %s 0755 pbs pbs -
""" % run_dir
    with open('/etc/tmpfiles.d/pbs.conf', 'w') as conf:
        conf.write(tmpfiles_entry)

    run_dir_missing = not os.path.exists(run_dir)
    if run_dir_missing:
        os.makedirs(run_dir)
    # Mode and ownership are applied whether or not the directory was new.
    os.chmod(run_dir, 0o755)
    subprocess.call(['chown', 'pbs:', run_dir])
#END install_pbs_tmpfile()
def install_controller_service_scripts():
    """Install the systemd unit files used on the controller.

    After preparing the PBS run directory, writes ``pbsctld.service``
    (rooted at PBS_PREFIX) and ``pbsdbd.service`` (rooted at
    APPS_DIR/pbs/current) into /usr/lib/systemd/system and sets each file
    to mode 0644. The units are only written, not enabled or started.
    """
    install_pbs_tmpfile()

    ctld_unit = """
[Unit]
Description=PBS controller daemon
After=network.target munge.service
ConditionPathExists={prefix}/etc/pbs.conf
[Service]
Type=forking
EnvironmentFile=-/etc/sysconfig/pbsctld
ExecStart={prefix}/sbin/pbsctld $PBSCTLD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
PIDFile=/var/run/pbs/pbsctld.pid
[Install]
WantedBy=multi-user.target
""".format(prefix = PBS_PREFIX)

    # NOTE(review): "PPS" in the Description below looks like a typo for
    # "PBS"; kept as-is because it is part of the emitted unit file.
    dbd_unit = """
[Unit]
Description=PPS DBD accounting daemon
After=network.target munge.service
ConditionPathExists={prefix}/etc/pbsdbd.conf
[Service]
Type=forking
EnvironmentFile=-/etc/sysconfig/pbsdbd
ExecStart={prefix}/sbin/pbsdbd $PBSDBD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
PIDFile=/var/run/pbs/pbsdbd.pid
[Install]
WantedBy=multi-user.target
""".format(prefix = APPS_DIR + "/pbs/current")

    units = (
        ('/usr/lib/systemd/system/pbsctld.service', ctld_unit),
        ('/usr/lib/systemd/system/pbsdbd.service', dbd_unit),
    )
    for unit_path, unit_text in units:
        with open(unit_path, 'w') as unit_file:
            unit_file.write(unit_text)
        os.chmod(unit_path, 0o644)
#END install_controller_service_scripts()
def install_compute_service_scripts():
    """Write the PBS client configuration and the ``pbsd`` unit file.

    After preparing the PBS run directory, /etc/pbs.conf is overwritten
    with a configuration that points PBS_SERVER at
    ``<INSTANCE_NAME_PREFIX>controller`` and enables only the comm and MoM
    daemons. ``pbsd.service`` (rooted at APPS_DIR/pbs/current) is then
    written to /usr/lib/systemd/system with mode 0644.
    """
    install_pbs_tmpfile()

    pbs_conf_text = """
PBS_EXEC=/opt/pbs
PBS_SERVER={instance_prefix}controller
PBS_START_SERVER=0
PBS_START_SCHED=0
PBS_START_COMM=1
PBS_START_MOM=1
PBS_HOME=/var/spool/pbs
PBS_CORE_LIMIT=unlimited
PBS_SCP=/bin/scp
""".format(instance_prefix = INSTANCE_NAME_PREFIX)
    with open('/etc/pbs.conf', 'w') as pbs_conf:
        pbs_conf.write(pbs_conf_text)

    unit_path = '/usr/lib/systemd/system/pbsd.service'
    unit_text = """
[Unit]
Description=pbs node daemon
After=network.target munge.service
ConditionPathExists={prefix}/etc/pbs.conf
[Service]
Type=forking
EnvironmentFile=-/etc/sysconfig/pbsd
ExecStart={prefix}/sbin/pbsd $PBSD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
PIDFile=/var/run/pbs/pbsd.pid
KillMode=process
LimitNOFILE=51200
LimitMEMLOCK=infinity
LimitSTACK=infinity
[Install]
WantedBy=multi-user.target
""".format(prefix = APPS_DIR + "/pbs/current")
    with open(unit_path, 'w') as unit_file:
        unit_file.write(unit_text)
    os.chmod(unit_path, 0o644)
#END install_compute_service_scripts()
def setup_bash_profile():
    """Write /etc/profile.d/pbs.sh, which appends /opt/pbs/bin to PATH."""
    profile_snippet = """
S_PATH=/opt/pbs
PATH=$PATH:$S_PATH/bin
"""
    with open('/etc/profile.d/pbs.sh', 'w') as profile:
        profile.write(profile_snippet)
#END setup_bash_profile()
def mount_nfs_vols():
    """Add the controller's NFS exports to /etc/fstab and mount them.

    Entries for APPS_DIR and /home, served by
    ``<INSTANCE_NAME_PREFIX>controller``, are appended to /etc/fstab unless
    an equivalent line is already there, so re-running the startup script
    does not pile up duplicates. ``mount -a`` is then repeated every
    5 seconds until it exits with status 0; there is no retry limit.
    """
    wanted = [
        "{1}controller:{0} {0} nfs rw,sync,hard,intr 0 0".format(
            APPS_DIR, INSTANCE_NAME_PREFIX),
        "{1}controller:/home /home nfs rw,sync,hard,intr 0 0".format(
            APPS_DIR, INSTANCE_NAME_PREFIX),
    ]

    # Compare with runs of whitespace collapsed: fstab fields may be
    # separated by any mix of spaces and tabs.
    try:
        with open('/etc/fstab') as fstab:
            present = set(' '.join(line.split()) for line in fstab)
    except IOError:
        # No readable fstab yet; opening in append mode below creates it.
        present = set()

    missing = [entry for entry in wanted
               if ' '.join(entry.split()) not in present]
    if missing:
        with open('/etc/fstab', 'a') as fstab:
            fstab.write("\n" + "\n".join(missing) + "\n")

    while subprocess.call(['mount', '-a']):
        print("Waiting for " + APPS_DIR + " and /home to be mounted")
        time.sleep(5)
#END mount_nfs_vols()
def test_pbs():
    """Download the PBS test bundle, install its framework and run it.

    Fetches test.zip and get-pip.py into /tmp, unpacks the bundle,
    bootstraps pip, installs the package in /tmp/test/fw together with its
    requirements, runs ``pbs_config --make-ug`` and finally
    ``pbs_benchpress`` from /tmp/test/tests, writing to ptl.txt. All exit
    codes are ignored, and the process is left in /tmp/test/tests.
    """
    fetch = urllib.urlretrieve

    fetch('https://s3.amazonaws.com/pbspro/test.zip', '/tmp/test.zip')
    os.chdir('/tmp')
    subprocess.call(['unzip', '-o', 'test.zip'])

    # install pip
    fetch('https://bootstrap.pypa.io/get-pip.py', '/tmp/get-pip.py')
    subprocess.call(['python', 'get-pip.py'])

    # (working directory, commands to run from it), in execution order.
    steps = (
        ('/tmp/test/fw', (
            ['pip', 'install', '-r', 'requirements.txt', '.'],
            ['pbs_config', '--make-ug'],
        )),
        ('/tmp/test/tests', (
            ['pbs_benchpress', '-l', 'INFOCLI2', '-o', 'ptl.txt'],
        )),
    )
    for workdir, commands in steps:
        os.chdir(workdir)
        for argv in commands:
            subprocess.call(argv)
#END test_pbs()
def register_compute_nodes():
    """Create a PBS node entry for every static compute instance.

    For each id from 1 to STATIC_NODE_COUNT inclusive, runs
    ``qmgr -c 'create node <INSTANCE_NAME_PREFIX>compute<id>'``. The
    command is passed to qmgr as an argument list, so the node name never
    goes through shell-style quoting. qmgr exit codes are ignored.
    """
    for node_id in range(1, STATIC_NODE_COUNT + 1):
        node_name = INSTANCE_NAME_PREFIX + "compute" + str(node_id)
        qmgr_directive = "create node " + node_name
        print("will execute: /opt/pbs/bin/qmgr -c '" + qmgr_directive + "'")
        subprocess.call(['/opt/pbs/bin/qmgr', '-c', qmgr_directive])
#END register_compute_nodes()
def _persist_ip_forwarding(sysctl_conf='/etc/sysctl.conf'):
    """Make sure ``net.ipv4.ip_forward=1`` is present in *sysctl_conf*.

    The line is appended only when no identical line exists. A failure to
    write is reported on stdout instead of being raised.
    """
    setting = 'net.ipv4.ip_forward=1'
    try:
        with open(sysctl_conf) as conf:
            if any(line.strip() == setting for line in conf):
                return
    except IOError:
        # Missing or unreadable file: fall through and try to append.
        pass
    try:
        with open(sysctl_conf, 'a') as conf:
            conf.write(setting + '\n')
    except IOError as e:
        print("Could not persist %s in %s (%s)" % (setting, sysctl_conf, str(e)))


def main():
    """Configure this instance as a PBS controller or compute node.

    Steps, in order:
      1. Put SELinux into permissive mode (``setenforce 0``).
      2. Controller only: enable IP forwarding and add a MASQUERADE rule on
         eth0 so it acts as a NAT gateway.
      3. Compute only: wait until have_internet() succeeds.
      4. Create the pbs user, write the in-progress MOTD, install packages,
         make sure APPS_DIR/pbs exists.
      5. Non-controller: mount APPS_DIR and /home from the controller.
      6. Controller: install PBS and its unit files, start PBS, register
         the compute nodes, then start NFS and export the shares.
         Compute: install PBS and the pbsd unit, enable and start it.
      7. Write the final MOTD and profile snippet, restart PBS, broadcast
         completion.

    Exit codes of the external commands are not checked.
    """
    is_controller = INSTANCE_TYPE == "controller"
    is_compute = INSTANCE_TYPE == "compute"

    # Disable SELinux
    subprocess.call(shlex.split('setenforce 0'))
    print("ARGUMENT STATIC_NODE_COUNT: @STATIC_NODE_COUNT@")

    # NOTE: COMPUTE_PUBLIC_IPS is not consulted; the NAT gateway is always
    # configured on the controller.
    if is_controller:
        # Setup a NAT gateway for the compute instances to get internet from.
        subprocess.call(shlex.split("sysctl -w net.ipv4.ip_forward=1"))
        subprocess.call(shlex.split("firewall-cmd --direct --add-rule ipv4 nat POSTROUTING 0 -o eth0 -j MASQUERADE"))
        subprocess.call(shlex.split("firewall-cmd --reload"))
        # Written from Python: subprocess.call() without a shell would hand
        # ">>" to echo as a plain argument and never touch sysctl.conf.
        _persist_ip_forwarding()

    if is_compute:
        while not have_internet():
            print("Waiting for internet connection")
            # Failures such as DNS errors return immediately; pause so the
            # loop does not spin and flood the log.
            time.sleep(1)

    add_pbs_user()
    start_motd()
    print("Installing packages...")
    install_packages()

    pbs_apps_dir = APPS_DIR + '/pbs'
    if not os.path.exists(pbs_apps_dir):
        os.makedirs(pbs_apps_dir)

    if not is_controller:
        mount_nfs_vols()

    if is_controller:
        print("Installing PBS on controller node...")
        install_pbs()
        print("Installing PBS service scripts...")
        install_controller_service_scripts()
        print("Starting PBS process...")
        subprocess.call(shlex.split('/etc/init.d/pbs start'))
        print("Registering compute nodes ...")
        register_compute_nodes()
        # Export at the end to signal that everything is up
        subprocess.call(shlex.split('systemctl enable nfs-server'))
        subprocess.call(shlex.split('systemctl start nfs-server'))
        setup_nfs_exports()
    elif is_compute:
        print("Installing PBS on compute node...")
        install_pbs()
        install_compute_service_scripts()
        subprocess.call(shlex.split('systemctl enable pbsd'))
        subprocess.call(shlex.split('systemctl start pbsd'))
        print("Installed additional components on compute node")

    end_motd()
    print("Setting up bash profile...")
    setup_bash_profile()
    print("Restarting PBS to get activate new configuration...")
    subprocess.call(shlex.split("/etc/init.d/pbs restart"))
    subprocess.call(["wall", "Completed PBS installation"])
    # NOTE: test_pbs() is not invoked as part of the install.
    print("Completed PBS installation on " + INSTANCE_TYPE)
# END main()
# Run the installation only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()