VMExtension/hpcnodemanager.py (619 lines of code) (raw):
#!/usr/bin/env python
#
# HPCNodeManager extension
#
# Copyright 2015 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import json
import subprocess
import re
import time
import traceback
import socket
import shutil
import platform
import struct
import array
import fcntl
import hashlib
from Utils.WAAgentUtil import waagent
import Utils.HandlerUtil as Util
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.version import get_distro
#Define global variables
ExtensionShortName = 'HPCNodeManager'
DaemonPidFilePath = '/var/run/hpcnmdaemon.pid'
InstallRoot = '/opt/hpcnodemanager'
DistroName = None
DistroVersion = None
CGroupV2 = False
RestartIntervalInSeconds = 60
osutil = None
def main():
waagent.LoggerInit('/var/log/waagent.log','/dev/stdout')
waagent.Log('Microsoft.HpcPack Linux NodeAgent started to handle.')
global DistroName, DistroVersion, osutil, CGroupV2
distro = get_dist_info()
DistroName = distro[0].lower()
# waagent common lib returns 'rhel' than 'redhat' since 9, rename to 'redhat' here to minimize the change needed
if DistroName == 'rhel':
DistroName = 'redhat'
DistroVersion = distro[1]
CGroupV2 = os.path.exists("/sys/fs/cgroup/cgroup.controllers")
osutil = get_osutil()
for a in sys.argv[1:]:
if re.match("^([-/]*)(disable)", a):
disable()
elif re.match("^([-/]*)(uninstall)", a):
uninstall()
elif re.match("^([-/]*)(install)", a):
install()
elif re.match("^([-/]*)(enable)", a):
enable()
elif re.match("^([-/]*)(daemon)", a):
daemon()
elif re.match("^([-/]*)(update)", a):
update()
def _is_nodemanager_daemon(pid):
retcode, output = waagent.RunGetOutput("ps -p {0} -o cmd=".format(pid))
if retcode == 0:
waagent.Log("The cmd for process {0} is {1}".format(pid, output))
pattern = r'(.*[/\s])?{0}\s+[-/]*daemon$'.format(os.path.basename(__file__))
if re.match(pattern, output):
return True
waagent.Log("The process {0} is not HPC Linux node manager daemon".format(pid))
return False
def install_package(package_name):
if DistroName in ["centos", "redhat", "almalinux", "rocky"]:
cmd = "yum -y install " + package_name
elif DistroName == "ubuntu":
waagent.Log("Updating apt package lists with command: apt-get -y update")
exitcode = waagent.Run("apt-get -y update", chk_err=False)
if exitcode != 0:
waagent.Log("Update apt package lists failed with exitcode: {0}".format(exitcode))
cmd = "apt-get -y install " + package_name
elif DistroName == "suse":
if not os.listdir('/etc/zypp/repos.d'):
waagent.Run("zypper ar http://download.opensuse.org/distribution/13.2/repo/oss/suse/ opensuse")
cmd = "zypper -n --gpg-auto-import-keys install --force-resolution -l " + package_name
else:
cmd = "zypper -n install --force-resolution -l " + package_name
else:
raise Exception("Unsupported Linux Distro.")
waagent.Log("The command to install {0}: {1}".format(package_name, cmd))
attempt = 1
while(True):
waagent.Log("Installing package {0} (Attempt {1})".format(package_name, attempt))
retcode, retoutput = waagent.RunGetOutput(cmd)
if retcode == 0:
waagent.Log("package {0} installation succeeded".format(package_name))
break
else:
waagent.Log("package {0} installation failed {1}:\n {2}".format(package_name, retcode, retoutput))
if attempt < 10:
time.sleep(min(30, pow(2, attempt)))
attempt += 1
if DistroName == 'suse' and retcode == 104:
waagent.Run("zypper ar http://download.opensuse.org/distribution/13.2/repo/oss/suse/ opensuse")
cmd = "zypper -n --gpg-auto-import-keys install --force-resolution -l " + package_name
elif DistroName == "ubuntu":
waagent.Run("apt-get -y update", chk_err=False)
continue
else:
raise Exception("failed to install package {0}:{1}".format(package_name, retcode))
def _uninstall_nodemanager_files():
if os.path.isdir(InstallRoot):
for tmpname in os.listdir(InstallRoot):
if tmpname == 'logs':
continue
if tmpname == 'certs':
continue
if tmpname == 'filters':
continue
tmppath = os.path.join(InstallRoot, tmpname)
if os.path.isdir(tmppath):
shutil.rmtree(tmppath)
elif os.path.isfile(tmppath):
os.remove(tmppath)
def _install_cgroup_tool():
if CGroupV2:
waagent.Log("cgroup v2 enabled, skip cgroup tools installation")
elif waagent.Run("command -v cgexec", chk_err=False) == 0:
waagent.Log("cgroup tools was already installed")
else:
waagent.Log("Start to install cgroup tools")
if DistroName == "ubuntu":
if re.match("^1", DistroVersion):
cg_pkgname = 'cgroup-bin'
else:
cg_pkgname = 'cgroup-tools'
elif (DistroName == "centos" or DistroName == "redhat") and re.match("^6", DistroVersion):
cg_pkgname = 'libcgroup'
else:
cg_pkgname = 'libcgroup-tools'
install_package(cg_pkgname)
waagent.Log("cgroup tool was successfully installed")
def _install_sysstat():
if waagent.Run("command -v iostat", chk_err=False) == 0:
waagent.Log("sysstat was already installed")
else:
waagent.Log("Start to install sysstat")
install_package('sysstat')
waagent.Log("sysstat was successfully installed")
def _install_pstree():
if waagent.Run("command -v pstree", chk_err=False) == 0:
waagent.Log("pstree was already installed")
else:
waagent.Log("Start to install pstree")
install_package('psmisc')
waagent.Log("pstree was successfully installed")
def get_networkinterfaces():
"""
Return the interface name, and ip addr of the
all non loopback interfaces.
"""
expected=16 # how many devices should I expect...
is_64bits = sys.maxsize > 2**32
struct_size=40 if is_64bits else 32 # for 64bit the size is 40 bytes, for 32bits it is 32 bytes.
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
buff=array.array('B', b'\0' * (expected*struct_size))
retsize=(struct.unpack('iL', fcntl.ioctl(s.fileno(), 0x8912, struct.pack('iL',expected*struct_size,buff.buffer_info()[0]))))[0]
if retsize == (expected*struct_size) :
waagent.Log('SIOCGIFCONF returned more than ' + str(expected) + ' up network interfaces.')
nics = []
s=buff.tostring()
for i in range(0,retsize,struct_size):
iface=s[i:i+16].split(b'\0', 1)[0]
if iface == b'lo':
continue
else:
nics.append((iface.decode('latin-1'), socket.inet_ntoa(s[i+20:i+24])))
return nics
def cleanup_host_entries():
hostsfile = '/etc/hosts'
if not os.path.isfile(hostsfile):
return
try:
hpcentryexists = False
newcontent=''
with open(hostsfile, 'r') as F:
for line in F.readlines():
if re.match(r"^[0-9\.]+\s+[^\s#]+\s+#HPCD?\s*$", line):
hpcentryexists = True
else:
newcontent += line
if hpcentryexists:
waagent.Log("Clean all HPC related host entries from hosts file")
waagent.ReplaceFileContentsAtomic(hostsfile,newcontent)
os.chmod(hostsfile, 0o644)
except :
raise
def init_suse_hostsfile(host_name, ipaddrs):
hostsfile = '/etc/hosts'
if not os.path.isfile(hostsfile):
return
try:
newhpcd_entries = ''
for ipaddr in ipaddrs:
newhpcd_entries += '{0:24}{1:30}#HPCD\n'.format(ipaddr, host_name)
curhpcd_entries = ''
newcontent = ''
hpcentryexists = False
with open(hostsfile, 'r') as F:
for line in F.readlines():
if re.match(r"^[0-9\.]+\s+[^\s#]+\s+#HPCD\s*$", line):
curhpcd_entries += line
hpcentryexists = True
elif re.match(r"^[0-9\.]+\s+[^\s#]+\s+#HPC\s*$", line):
hpcentryexists = True
else:
newcontent += line
if newhpcd_entries != curhpcd_entries:
if hpcentryexists:
waagent.Log("Clean the HPC related host entries from hosts file")
waagent.Log("Add the following HPCD host entries:\n{0}".format(newhpcd_entries))
if newcontent and newcontent[-1] != '\n':
newcontent += '\n'
newcontent += newhpcd_entries
waagent.ReplaceFileContentsAtomic(hostsfile,newcontent)
os.chmod(hostsfile, 0o644)
except :
raise
def gethostname_from_configfile(configfile):
config_hostname = None
if os.path.isfile(configfile):
with open(configfile, 'r') as F:
configjson = json.load(F)
if 'RegisterUri' in configjson:
reguri = configjson['RegisterUri']
reguri = reguri[0:reguri.rindex('/')]
config_hostname = reguri[reguri.rindex('/')+1:]
return config_hostname
def _add_dns_search(domain_fqdn):
need_update = False
new_content = ''
for line in (open('/etc/resolv.conf','r')).readlines():
if re.match('^search.* {0}'.format(domain_fqdn), line):
waagent.Log('{0} was already added in /etc/resolv.conf'.format(domain_fqdn))
return
if re.match('^search', line):
need_update = True
new_content += line.replace('search', 'search {0}'.format(domain_fqdn))
else:
new_content += line
if need_update:
waagent.Log('Adding {0} to /etc/resolv.conf'.format(domain_fqdn))
waagent.SetFileContents('/etc/resolv.conf', new_content)
def _update_dns_record(domain_fqdn):
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
while True:
try:
s.connect((domain_fqdn, 53))
break
except Exception as e:
waagent.Log('Failed to connect to {0}:53: {1}'.format(domain_fqdn, e))
ipaddr = s.getsockname()[0]
host_fqdn = "{0}.{1}".format(socket.gethostname().split('.')[0], domain_fqdn)
dns_cmd = 'echo -e "server {0}\nzone {0}\nupdate delete {1}\nupdate add {1} 864000 A {2}\nsend\n" | nsupdate -v'.format(domain_fqdn, host_fqdn, ipaddr)
waagent.Log('The command to update ip to dns server is: {0}'.format(dns_cmd))
retry = 0
while retry < 60:
dns_ret, dns_msg = waagent.RunGetOutput(dns_cmd)
if dns_ret == 0:
waagent.Log("Succeeded to update ip to dns server.")
return
else:
retry = retry + 1
waagent.Log("Failed to update ip to dns server: {0}, {1}".format(dns_ret, dns_msg))
time.sleep(10)
def _mount_cgroup():
if not os.path.isdir('/cgroup'):
os.mkdir('/cgroup')
if not os.listdir('/cgroup'):
retcode, mount_msg = waagent.RunGetOutput('mount -t cgroup cgroup /cgroup')
waagent.Log("mount /cgroup directory {0}:{1}".format(retcode, mount_msg))
if retcode == 0:
waagent.Log("/cgroup directory is successfully mounted.")
else:
raise Exception("failed to mount /cgroup directory")
else:
waagent.Log("/cgroup directory was already mounted.")
def config_firewall_rules():
if DistroName == 'redhat':
waagent.Log('Configuring the firewall rules')
major_version = int(DistroVersion.split('.')[0])
if major_version < 7:
waagent.Run('lokkit --port=40000:tcp --update', chk_err=False)
waagent.Run('lokkit --port=40002:tcp --update', chk_err=False)
elif waagent.Run("firewall-cmd --state", chk_err=False) == 0:
waagent.Run("firewall-cmd --permanent --zone=public --add-port=40000/tcp")
waagent.Run("firewall-cmd --permanent --zone=public --add-port=40002/tcp")
waagent.Run("firewall-cmd --reload")
def parse_context(operation):
hutil = Util.HandlerUtility(waagent.Log, waagent.Error, ExtensionShortName)
hutil.do_parse_context(operation)
return hutil
def cmpFileHash(file1, file2):
if not (os.path.isfile(file1) and os.path.isfile(file2)):
return False
digests = []
for filename in [file1, file2]:
md5hash = hashlib.md5()
with open(filename, 'rb') as f:
buf = f.read()
md5hash.update(buf)
digest = md5hash.hexdigest()
digests.append(digest)
return digests[0] == digests[1]
def install():
hutil = parse_context('Install')
try:
cleanup_host_entries()
_uninstall_nodemanager_files()
if DistroName in ["centos", "redhat", "almalinux", "rocky"]:
waagent.Run("yum-config-manager --setopt=\\*.skip_if_unavailable=1 --save", chk_err=False)
_install_cgroup_tool()
_install_sysstat()
_install_pstree()
logDir = os.path.join(InstallRoot, "logs")
if not os.path.isdir(logDir):
os.makedirs(logDir)
srcDir = os.path.join(os.getcwd(), "bin")
waagent.RunGetOutput("chmod +x {0}/*".format(srcDir))
waagent.RunGetOutput("chmod +x {0}/lib/*".format(srcDir))
for filename in os.listdir(srcDir):
srcname = os.path.join(srcDir, filename)
destname = os.path.join(InstallRoot, filename)
if os.path.isfile(srcname):
shutil.copy2(srcname, destname)
elif os.path.isdir(srcname):
shutil.copytree(srcname, destname)
libdir = os.path.join(InstallRoot, 'lib')
for tmpname in os.listdir(libdir):
tmppath = os.path.join(libdir, tmpname)
if tmpname.endswith(".tar.gz") and os.path.isfile(tmppath):
waagent.Run("tar xzvf {0} -C {1}".format(tmppath, libdir))
os.remove(tmppath)
waagent.Run("chmod -R 755 {0}".format(libdir))
host_name = None
public_settings = hutil._context._config['runtimeSettings'][0]['handlerSettings'].get('publicSettings')
if public_settings:
host_name = public_settings.get('HostName')
backup_configfile = os.path.join(os.getcwd(), 'nodemanager.json')
if not host_name:
# if there is backup nodemanager.json, means it is an update install, if 'HostName' not defined in the extension
# settings, we shall get from the backup nodemanager.json
if os.path.isfile(backup_configfile):
waagent.Log("Backup nodemanager configuration file found")
host_name = gethostname_from_configfile(backup_configfile)
curhostname = socket.gethostname().split('.')[0]
if host_name:
if host_name.lower() != curhostname.lower():
waagent.Log("HostName was set: hostname from {0} to {1}".format(curhostname, host_name))
osutil.set_hostname(host_name)
osutil.publish_hostname(host_name)
else:
host_name = curhostname
public_settings = hutil._context._config['runtimeSettings'][0]['handlerSettings'].get('publicSettings')
protect_settings = hutil._context._config['runtimeSettings'][0]['handlerSettings'].get('protectedSettings')
authentication_key = ""
if protect_settings is not None:
authentication_key = protect_settings.get('AuthenticationKey')
authentication_key = authentication_key if authentication_key is not None else ""
cluster_connstring = public_settings.get('ClusterConnectionString')
if not cluster_connstring:
waagent.Log("ClusterConnectionString is not specified")
cluster_connstring = protect_settings.get('ClusterName')
if not cluster_connstring:
error_msg = "neither ClusterConnectionString nor ClusterName is specified."
hutil.error(error_msg)
raise ValueError(error_msg)
ssl_thumbprint = public_settings.get('SSLThumbprint')
certsdir = os.path.join(InstallRoot, "certs")
if not ssl_thumbprint:
api_prefix = "http://{0}:80/HpcLinux/api/"
listen_uri = "http://0.0.0.0:40000"
else:
api_prefix = "https://{0}:443/HpcLinux/api/"
listen_uri = "https://0.0.0.0:40002"
# import the ssl certificate for hpc nodemanager
if not os.path.isdir(certsdir):
os.makedirs(certsdir, 0o750)
else:
os.chmod(certsdir, 0o750)
ssl_thumbprint = ssl_thumbprint.upper()
prvfile = os.path.join("/var/lib/waagent", ssl_thumbprint + ".prv")
srccrtfile = os.path.join("/var/lib/waagent", ssl_thumbprint + ".crt")
rsakeyfile = os.path.join(certsdir, "nodemanager_rsa.key")
dstcrtfile = os.path.join(certsdir, "nodemanager.crt")
if os.path.isfile(prvfile) and not cmpFileHash(prvfile, rsakeyfile):
waagent.Run("rm -rf {0}/nodemanager.crt {0}/nodemanager.key {0}/nodemanager.pem {0}/nodemanager_rsa.key".format(certsdir), chk_err=False)
shutil.copy2(prvfile, rsakeyfile)
shutil.copy2(srccrtfile, dstcrtfile)
shutil.copy2(dstcrtfile, os.path.join(certsdir, "nodemanager.pem"))
waagent.Run("openssl rsa -in {0}/nodemanager_rsa.key -out {0}/nodemanager.key".format(certsdir))
waagent.Run("chmod 640 {0}/nodemanager.crt {0}/nodemanager.key {0}/nodemanager.pem {0}/nodemanager_rsa.key".format(certsdir))
node_uri = api_prefix + host_name + "/computenodereported"
reg_uri = api_prefix + host_name + "/registerrequested"
hostsfile_uri = api_prefix + "hostsfile"
metric_ids_uri = api_prefix + host_name + "/getinstanceids"
namingSvcUris = ['https://{0}:443/HpcNaming/api/fabric/resolve/singleton/'.format(h.split('.')[0].strip()) for h in cluster_connstring.split(',')]
if os.path.isfile(backup_configfile):
with open(backup_configfile, 'r') as F:
configjson = json.load(F)
configjson["NamingServiceUri"] = namingSvcUris
configjson["HeartbeatUri"] = node_uri
configjson["RegisterUri"] = reg_uri
configjson["HostsFileUri"] = hostsfile_uri
configjson["MetricInstanceIdsUri"] = metric_ids_uri
configjson["MetricUri"] = ""
configjson["ListeningUri"] = listen_uri
else:
configjson = {
"ConfigVersion": "1.0",
"NamingServiceUri": namingSvcUris,
"HeartbeatUri": node_uri,
"RegisterUri": reg_uri,
"MetricUri": "",
"MetricInstanceIdsUri": metric_ids_uri,
"HostsFileUri": hostsfile_uri,
"HostsFetchInterval": 120,
"ListeningUri": listen_uri,
"DefaultServiceName": "SchedulerStatefulService",
"UdpMetricServiceName": "MonitoringStatefulService"
}
if ssl_thumbprint:
configjson["TrustedCAFile"] = os.path.join(certsdir, "nodemanager.pem")
configjson["CertificateChainFile"] = os.path.join(certsdir, "nodemanager.crt")
configjson["PrivateKeyFile"] = os.path.join(certsdir, "nodemanager.key")
if authentication_key:
configjson["ClusterAuthenticationKey"] = authentication_key
configfile = os.path.join(InstallRoot, 'nodemanager.json')
waagent.SetFileContents(configfile, json.dumps(configjson))
shutil.copy2(configfile, backup_configfile)
config_firewall_rules()
if CGroupV2:
shutil.copy2(os.path.join(InstallRoot, "hpccgroot.service"), "/etc/systemd/system/hpccgroot.service")
waagent.Run("systemctl daemon-reload")
waagent.Run("systemctl enable hpccgroot.service")
waagent.Run("systemctl restart hpccgroot.service")
hutil.do_exit(0, 'Install', 'success', '0', 'Install Succeeded.')
except Exception as e:
hutil.do_exit(1, 'Install','error','1', '{0}'.format(e))
def enable():
#Always restart daemon and clear PID file
hutil = parse_context('Enable')
if os.path.isfile(DaemonPidFilePath):
pid = waagent.GetFileContents(DaemonPidFilePath)
if os.path.isdir(os.path.join("/proc", pid)) and _is_nodemanager_daemon(pid):
waagent.Log("Stop old daemon: {0}".format(pid))
os.killpg(int(pid), 9)
os.remove(DaemonPidFilePath)
args = [get_python_executor(), os.path.join(os.getcwd(), __file__), "daemon"]
devnull = open(os.devnull, 'w')
child = subprocess.Popen(args, stdout=devnull, stderr=devnull, preexec_fn=os.setsid)
if child.pid is None or child.pid < 1:
hutil.do_exit(1, 'Enable', 'error', '1',
'Failed to launch HPC Linux node manager daemon')
else:
hutil.save_seq()
waagent.SetFileContents(DaemonPidFilePath, str(child.pid))
#Sleep 3 seconds to check if the process is still running
time.sleep(3)
if child.poll() is None:
waagent.Log("Daemon pid: {0}".format(child.pid))
hutil.do_exit(0, 'Enable', 'success', '0',
'HPC Linux node manager daemon is enabled')
else:
hutil.do_exit(1, 'Enable', 'error', '2',
'Failed to launch HPC Linux node manager daemon')
def daemon():
hutil = parse_context('Enable')
if CGroupV2:
service_dir = '/sys/fs/cgroup/hpcpack.slice/hpccgroot.service/service'
if not os.path.exists(service_dir):
os.makedirs(service_dir)
with open('/sys/fs/cgroup/hpcpack.slice/hpccgroot.service/cgroup.procs', 'r') as f:
pids = f.read().splitlines()
for pid in pids:
waagent.Run("echo {0} > /sys/fs/cgroup/hpcpack.slice/hpccgroot.service/service/cgroup.procs".format(pid))
waagent.Run('echo "+cpu +cpuset +memory" > /sys/fs/cgroup/hpcpack.slice/hpccgroot.service/cgroup.subtree_control')
try:
public_settings = hutil._context._config['runtimeSettings'][0]['handlerSettings'].get('publicSettings')
domain_fqdn = public_settings.get('DomainName')
if not domain_fqdn:
cluster_connstring = public_settings.get('ClusterConnectionString')
if not cluster_connstring:
waagent.Log("ClusterConnectionString is not specified, use ClusterName instead")
protect_settings = hutil._context._config['runtimeSettings'][0]['handlerSettings'].get('protectedSettings')
cluster_connstring = protect_settings.get('ClusterName')
headnode_name = cluster_connstring.split(',')[0].strip()
if headnode_name.find('.') > 0:
# The head node name is FQDN, extract the domain FQDN
domain_fqdn = headnode_name.split(".", 1)[1]
if domain_fqdn:
waagent.Log("The domain FQDN is " + domain_fqdn)
_add_dns_search(domain_fqdn)
#thread.start_new_thread(_update_dns_record, (domain_fqdn,))
# A fix only for SUSE Linux that sometimes the hostname got changed because out-of-date host/IP entry in /etc/hosts
# It may happen when the node was assigned a different IP after deallocation
# We shall clean the current HPC related host/IP entries and add the actual IPs before fetching the hosts file from head node.
if DistroName == 'suse':
configfile = os.path.join(InstallRoot, 'nodemanager.json')
confighostname = gethostname_from_configfile(configfile)
curhostname = socket.gethostname().split('.')[0]
if confighostname.lower() != curhostname.lower():
cleanup_host_entries()
waagent.Log("Correct the hostname from {0} to {1}".format(curhostname, confighostname))
osutil.set_hostname(confighostname)
osutil.publish_hostname(confighostname)
retry = 0
while True:
nics = get_networkinterfaces()
if len(nics) > 0:
init_suse_hostsfile(confighostname, [nic[1] for nic in nics])
break
elif retry < 30:
waagent.Log("Failed to get network interfaces information, retry later ...")
time.sleep(2)
retry = retry + 1
else:
waagent.Log("Failed to get network interfaces information, just clean")
break
# Mount the directory /cgroup for centos 6.*
major_version = int(DistroVersion.split('.')[0])
if (DistroName == 'centos' or DistroName == 'redhat') and major_version < 7:
_mount_cgroup()
while True:
exe_path = os.path.join(InstallRoot, "nodemanager")
devnull = open(os.devnull, 'w')
child_process = subprocess.Popen(exe_path, stdout=devnull, stderr=devnull, cwd=InstallRoot)
if child_process.pid is None or child_process.pid < 1:
exit_msg = 'Failed to start HPC node manager process'
hutil.do_status_report('Enable', 'error', 1, exit_msg)
else:
#Sleep 1 second to check if the process is still running
time.sleep(1)
if child_process.poll() is None:
hutil.do_status_report('Enable', 'success', 0, "")
waagent.Log('HPC node manager process started')
exit_code = child_process.wait()
exit_msg = "HPC node manager process exits: {0}".format(exit_code)
hutil.do_status_report('Enable', 'warning', exit_code, exit_msg)
else:
exit_msg = "HPC node manager process crashes: {0}".format(child_process.returncode)
hutil.do_status_report('Enable', 'error', child_process.returncode, exit_msg)
waagent.Log(exit_msg)
waagent.Log("Restart HPC node manager process after {0} seconds".format(RestartIntervalInSeconds))
time.sleep(RestartIntervalInSeconds)
except Exception as e:
hutil.error("Failed to enable the extension with error: %s, stack trace: %s" %(str(e), traceback.format_exc()))
hutil.do_exit(1, 'Enable','error','1', 'Enable failed.')
def uninstall():
hutil = parse_context('Uninstall')
_uninstall_nodemanager_files()
cleanup_host_entries()
if os.path.isfile('/etc/systemd/system/hpccgroot.service'):
waagent.Run("systemctl stop hpccgroot.service")
waagent.Run("systemctl disable hpccgroot.service")
os.remove('/etc/systemd/system/hpccgroot.service')
waagent.Run("systemctl reset-failed")
waagent.Run("systemctl daemon-reload")
hutil.do_exit(0,'Uninstall','success','0', 'Uninstall succeeded')
def disable():
hutil = parse_context('Disable')
#Check whether monitor process is running.
#If it does, kill it. Otherwise clear pid file
if os.path.isfile(DaemonPidFilePath):
pid = waagent.GetFileContents(DaemonPidFilePath)
if os.path.isdir(os.path.join("/proc", pid)) and _is_nodemanager_daemon(pid):
waagent.Log(("Stop HPC node manager daemon: {0}").format(pid))
os.killpg(int(pid), 9)
os.remove(DaemonPidFilePath)
cleanup_host_entries()
hutil.do_exit(0, 'Disable', 'success', '0',
'HPC node manager daemon is disabled')
os.remove(DaemonPidFilePath)
hutil.do_exit(0, 'Disable', 'success', '0',
'HPC node manager daemon is not running')
def update():
hutil = parse_context('Update')
cleanup_host_entries()
configfile = os.path.join(InstallRoot, 'nodemanager.json')
if os.path.isfile(configfile):
waagent.Log("Update extension: backup the nodemanager configuration file.")
shutil.copy2(configfile, os.getcwd())
# A fix only for SUSE Linux that sometimes the hostname got changed because out-of-date host/IP entry in /etc/hosts
# It may happen when the node was assigned a different IP after deallocation
if DistroName == 'suse':
confighostname = gethostname_from_configfile(configfile)
if confighostname:
curhostname = socket.gethostname().split('.')[0]
if confighostname.lower() != curhostname.lower():
waagent.Log("Update: Set the hostname from {0} to {1}".format(curhostname, confighostname))
osutil.set_hostname(confighostname)
osutil.publish_hostname(confighostname)
hutil.do_exit(0,'Update','success','0', 'Update Succeeded')
def get_python_executor():
cmd = ''
if sys.version_info.major == 2:
cmd = 'python2'
elif sys.version_info.major == 3:
cmd = 'python3'
if waagent.Run("command -v {0}".format(cmd), chk_err=False) != 0:
# If a user-installed python isn't available, check for a platform-python. This is typically only used in RHEL 8.0.
if waagent.Run("command -v /usr/libexec/platform-python", chk_err=False) == 0:
cmd = '/usr/libexec/platform-python'
return cmd
def get_dist_info():
try:
return get_distro()
except:
pass
errCode, info = waagent.RunGetOutput("cat /etc/*-release")
if errCode != 0:
raise Exception('Failed to get Linux Distro info by running command "cat /etc/*release", error code: {}'.format(errCode))
distroName = ''
distroVersion = ''
for line in info.splitlines():
if line.startswith('PRETTY_NAME='):
line = line.lower()
if 'ubuntu' in line:
distroName = 'ubuntu'
elif 'centos' in line:
distroName = 'centos'
elif 'red hat' in line:
distroName = 'redhat'
elif 'suse' in line:
distroName = 'suse'
elif 'alma' in line:
distroName = 'alma'
elif 'rocky' in line:
distroName = 'rocky'
elif 'fedora' in line:
distroName = 'fedora'
elif 'freebsd' in line:
distroName = 'freebsd'
else:
raise Exception('Unknown linux distribution with {}'.format(line))
if line.startswith('VERSION_ID='):
line = line.strip(' ')
quoteIndex = line.index('"')
if quoteIndex >= 0:
distroVersion = line[quoteIndex+1:-1]
return distroName, distroVersion, ""
if __name__ == '__main__' :
main()