#!/usr/bin/env python
#
# HPCNodeManager extension
#
# Copyright 2015 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import json
import subprocess
import re
import time
import traceback
import socket
import shutil
import platform
import struct
import array
import fcntl
import hashlib

from Utils.WAAgentUtil import waagent
import Utils.HandlerUtil as Util
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.version import get_distro

#Define global variables
# Short name handed to HandlerUtility when the extension context is parsed.
ExtensionShortName = 'HPCNodeManager'
# File in which enable() records the pid of the launched daemon process.
DaemonPidFilePath = '/var/run/hpcnmdaemon.pid'
# Directory that receives the node manager payload, logs and certificates.
InstallRoot = '/opt/hpcnodemanager'
# Lower-cased distro name and distro version string; both are set by main().
DistroName = None
DistroVersion = None
# Set by main() to True when /sys/fs/cgroup/cgroup.controllers exists.
CGroupV2 = False
# Seconds daemon() waits before relaunching the node manager process.
RestartIntervalInSeconds = 60
# Object returned by get_osutil(); set by main().
osutil = None

def main():
    """Set up logging and the distro globals, then run every requested operation.

    Each command-line argument is matched (after optional leading '-' or '/'
    characters) against the known operation names, and the handler of the
    first matching operation is invoked.
    """
    global DistroName, DistroVersion, osutil, CGroupV2
    waagent.LoggerInit('/var/log/waagent.log','/dev/stdout')
    waagent.Log('Microsoft.HpcPack Linux NodeAgent started to handle.')

    dist_info = get_dist_info()
    dist_name = dist_info[0].lower()
    # waagent common lib returns 'rhel' rather than 'redhat' since 9; map it
    # back to 'redhat' so the rest of this file needs no change.
    DistroName = 'redhat' if dist_name == 'rhel' else dist_name
    DistroVersion = dist_info[1]
    CGroupV2 = os.path.exists("/sys/fs/cgroup/cgroup.controllers")
    osutil = get_osutil()

    # Checked in this order; only the first matching entry runs per argument.
    operations = (
        ("^([-/]*)(disable)", disable),
        ("^([-/]*)(uninstall)", uninstall),
        ("^([-/]*)(install)", install),
        ("^([-/]*)(enable)", enable),
        ("^([-/]*)(daemon)", daemon),
        ("^([-/]*)(update)", update),
    )
    for argument in sys.argv[1:]:
        for pattern, handler in operations:
            if re.match(pattern, argument):
                handler()
                break

def _is_nodemanager_daemon(pid):
    """Return True when process *pid* is this script running in daemon mode.

    The command line of the process is read with ``ps -p <pid> -o cmd=`` and
    must end with this script's file name followed by ``daemon`` (optionally
    prefixed with '-' or '/' characters).

    :param pid: process id to inspect (anything ``str.format`` can render).
    :return: True if the command line matches, otherwise False.
    """
    retcode, output = waagent.RunGetOutput("ps -p {0} -o cmd=".format(pid))
    if retcode == 0:
        waagent.Log("The cmd for process {0} is {1}".format(pid, output))
        # Escape the file name so regex metacharacters in it (notably the '.'
        # before the extension) only match themselves.
        script_name = re.escape(os.path.basename(__file__))
        pattern = r'(.*[/\s])?{0}\s+[-/]*daemon$'.format(script_name)
        if re.match(pattern, output):
            return True
    waagent.Log("The process {0} is not HPC Linux node manager daemon".format(pid))
    return False

def install_package(package_name):
    """Install *package_name* with the distro's package manager, retrying on failure.

    Up to ten attempts are made, sleeping min(30, 2**attempt) seconds after
    each failed attempt. On SUSE a failure with exit code 104 adds the
    openSUSE repository before the next attempt; on Ubuntu the apt package
    lists are refreshed before the next attempt.

    :param package_name: name of the package to install.
    :raises Exception: for an unsupported distro, or when every attempt fails.
    """
    max_attempts = 10
    suse_add_repo = "zypper ar http://download.opensuse.org/distribution/13.2/repo/oss/suse/ opensuse"
    suse_install_with_keys = "zypper -n --gpg-auto-import-keys install --force-resolution -l " + package_name

    if DistroName in ["centos", "redhat", "almalinux", "rocky"]:
        cmd = "yum -y install " + package_name
    elif DistroName == "ubuntu":
        waagent.Log("Updating apt package lists with command: apt-get -y update")
        update_status = waagent.Run("apt-get -y update", chk_err=False)
        if update_status != 0:
            waagent.Log("Update apt package lists failed with exitcode: {0}".format(update_status))
        cmd = "apt-get -y install " + package_name
    elif DistroName == "suse":
        if os.listdir('/etc/zypp/repos.d'):
            cmd = "zypper -n install --force-resolution -l " + package_name
        else:
            # No repository configured yet: register one before installing.
            waagent.Run(suse_add_repo)
            cmd = suse_install_with_keys
    else:
        raise Exception("Unsupported Linux Distro.")
    waagent.Log("The command to install {0}: {1}".format(package_name, cmd))

    for attempt in range(1, max_attempts + 1):
        waagent.Log("Installing package {0} (Attempt {1})".format(package_name, attempt))
        status, output = waagent.RunGetOutput(cmd)
        if status == 0:
            waagent.Log("package {0} installation succeeded".format(package_name))
            return
        waagent.Log("package {0} installation failed {1}:\n {2}".format(package_name, status, output))
        if attempt >= max_attempts:
            raise Exception("failed to install package {0}:{1}".format(package_name, status))
        time.sleep(min(30, 2 ** attempt))
        if DistroName == 'suse' and status == 104:
            waagent.Run(suse_add_repo)
            cmd = suse_install_with_keys
        elif DistroName == "ubuntu":
            waagent.Run("apt-get -y update", chk_err=False)

def _uninstall_nodemanager_files():
    """Delete everything directly under InstallRoot except 'logs', 'certs' and 'filters'."""
    if not os.path.isdir(InstallRoot):
        return
    preserved = ('logs', 'certs', 'filters')
    for entry in os.listdir(InstallRoot):
        if entry in preserved:
            continue
        target = os.path.join(InstallRoot, entry)
        if os.path.isdir(target):
            shutil.rmtree(target)
        elif os.path.isfile(target):
            os.remove(target)

def _install_cgroup_tool():
    """Install the cgroup command-line tools unless cgroup v2 is active or cgexec already exists.

    The package name is chosen from DistroName and DistroVersion.
    """
    if CGroupV2:
        waagent.Log("cgroup v2 enabled, skip cgroup tools installation")
        return
    if waagent.Run("command -v cgexec", chk_err=False) == 0:
        waagent.Log("cgroup tools was already installed")
        return

    waagent.Log("Start to install cgroup tools")
    if DistroName == "ubuntu":
        package = 'cgroup-bin' if re.match("^1", DistroVersion) else 'cgroup-tools'
    elif DistroName in ("centos", "redhat") and re.match("^6", DistroVersion):
        package = 'libcgroup'
    else:
        package = 'libcgroup-tools'
    install_package(package)
    waagent.Log("cgroup tool was successfully installed")

def _install_sysstat():
    """Install the 'sysstat' package when the iostat command is not available."""
    iostat_present = waagent.Run("command -v iostat", chk_err=False) == 0
    if iostat_present:
        waagent.Log("sysstat was already installed")
        return
    waagent.Log("Start to install sysstat")
    install_package('sysstat')
    waagent.Log("sysstat was successfully installed")

def _install_pstree():
    """Install the 'psmisc' package when the pstree command is not available."""
    pstree_present = waagent.Run("command -v pstree", chk_err=False) == 0
    if pstree_present:
        waagent.Log("pstree was already installed")
        return
    waagent.Log("Start to install pstree")
    install_package('psmisc')
    waagent.Log("pstree was successfully installed")

def get_networkinterfaces():
    """
    Return the interface name and IPv4 address of all non-loopback
    interfaces reported by the SIOCGIFCONF ioctl.

    :return: list of (name, dotted-quad address) tuples; the interface named
             'lo' is skipped.
    """
    expected = 16  # how many devices the result buffer has room for
    is_64bits = sys.maxsize > 2**32
    # One entry is 40 bytes on 64-bit builds and 32 bytes on 32-bit builds.
    struct_size = 40 if is_64bits else 32
    buff = array.array('B', b'\0' * (expected * struct_size))
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        request = struct.pack('iL', expected * struct_size, buff.buffer_info()[0])
        # 0x8912 is SIOCGIFCONF; the first field of the reply is the number
        # of bytes written into buff.
        retsize = struct.unpack('iL', fcntl.ioctl(s.fileno(), 0x8912, request))[0]
    finally:
        # The socket is only needed as a handle for the ioctl.
        s.close()
    if retsize == (expected * struct_size):
        waagent.Log('SIOCGIFCONF returned more than ' + str(expected) + ' up network interfaces.')
    # array.tostring() was removed in Python 3.9; tobytes() replaces it.
    # Fall back to tostring() for Python 2, whose array has no tobytes().
    raw = buff.tobytes() if hasattr(buff, 'tobytes') else buff.tostring()
    nics = []
    for i in range(0, retsize, struct_size):
        # Bytes 0-15 hold the NUL-padded name, bytes 20-23 the packed address.
        iface = raw[i:i+16].split(b'\0', 1)[0]
        if iface == b'lo':
            continue
        nics.append((iface.decode('latin-1'), socket.inet_ntoa(raw[i+20:i+24])))
    return nics

def cleanup_host_entries():
    """Remove every line tagged '#HPC' or '#HPCD' from /etc/hosts.

    The file is only rewritten (atomically, then chmod 0644) when at least one
    such line was found. Nothing happens when /etc/hosts is not a file.
    """
    hosts_path = '/etc/hosts'
    if not os.path.isfile(hosts_path):
        return
    hpc_entry = re.compile(r"^[0-9\.]+\s+[^\s#]+\s+#HPCD?\s*$")
    with open(hosts_path, 'r') as hosts:
        lines = hosts.readlines()
    kept = [line for line in lines if not hpc_entry.match(line)]
    if len(kept) != len(lines):
        waagent.Log("Clean all HPC related host entries from hosts file")
        waagent.ReplaceFileContentsAtomic(hosts_path, ''.join(kept))
        os.chmod(hosts_path, 0o644)

def init_suse_hostsfile(host_name, ipaddrs):
    """Make the '#HPCD' entries of /etc/hosts map *host_name* to *ipaddrs*.

    Existing '#HPC' and '#HPCD' lines are dropped and one '#HPCD' line per
    address is appended. The file is left untouched when the current '#HPCD'
    lines already equal the desired ones, or when /etc/hosts is not a file.

    :param host_name: host name to write into each entry.
    :param ipaddrs: iterable of IP address strings.
    """
    hosts_path = '/etc/hosts'
    if not os.path.isfile(hosts_path):
        return

    wanted = ''.join('{0:24}{1:30}#HPCD\n'.format(address, host_name) for address in ipaddrs)

    hpcd_entry = re.compile(r"^[0-9\.]+\s+[^\s#]+\s+#HPCD\s*$")
    hpc_entry = re.compile(r"^[0-9\.]+\s+[^\s#]+\s+#HPC\s*$")
    present_hpcd = []
    other_lines = []
    found_hpc_entry = False
    with open(hosts_path, 'r') as hosts:
        for line in hosts:
            if hpcd_entry.match(line):
                present_hpcd.append(line)
                found_hpc_entry = True
            elif hpc_entry.match(line):
                found_hpc_entry = True
            else:
                other_lines.append(line)

    if wanted == ''.join(present_hpcd):
        return
    if found_hpc_entry:
        waagent.Log("Clean the HPC related host entries from hosts file")
    waagent.Log("Add the following HPCD host entries:\n{0}".format(wanted))
    body = ''.join(other_lines)
    # Make sure the appended entries start on a line of their own.
    if body and body[-1] != '\n':
        body += '\n'
    waagent.ReplaceFileContentsAtomic(hosts_path, body + wanted)
    os.chmod(hosts_path, 0o644)

def gethostname_from_configfile(configfile):
    """Return the host name embedded in the 'RegisterUri' of a JSON config file.

    The host name is the second-to-last '/'-separated segment of the URI.

    :param configfile: path of the JSON configuration file.
    :return: the host name, or None when the file does not exist or has no
             'RegisterUri' entry.
    """
    if not os.path.isfile(configfile):
        return None
    with open(configfile, 'r') as stream:
        settings = json.load(stream)
    if 'RegisterUri' not in settings:
        return None
    register_uri = settings['RegisterUri']
    # Locate the last two slashes; the host name sits between them.
    last_slash = register_uri.rindex('/')
    previous_slash = register_uri.rindex('/', 0, last_slash)
    return register_uri[previous_slash + 1:last_slash]

def _add_dns_search(domain_fqdn):
    """Put *domain_fqdn* at the front of the ``search`` list in /etc/resolv.conf.

    Nothing is written when a ``search`` line already lists the domain, or
    when the file has no ``search`` line at all.

    :param domain_fqdn: DNS domain to add to the search list.
    """
    resolv_conf = '/etc/resolv.conf'
    # Escape the domain so its dots match literally, and require whitespace or
    # end of line after it so that e.g. 'corp.com' is not considered present
    # just because 'corp.community' is listed.
    already_listed = re.compile(
        r'^search\b.*\s{0}(\s|$)'.format(re.escape(domain_fqdn)))
    # Read the file through a context manager so the handle is always closed.
    with open(resolv_conf, 'r') as F:
        lines = F.readlines()

    need_update = False
    new_content = ''
    for line in lines:
        if already_listed.match(line):
            waagent.Log('{0} was already added in /etc/resolv.conf'.format(domain_fqdn))
            return
        if re.match('^search', line):
            need_update = True
            # Replace only the leading keyword: a listed domain may itself
            # contain the text 'search' (e.g. 'research.example.com').
            new_content += line.replace('search', 'search {0}'.format(domain_fqdn), 1)
        else:
            new_content += line
    if need_update:
        waagent.Log('Adding {0} to /etc/resolv.conf'.format(domain_fqdn))
        waagent.SetFileContents(resolv_conf, new_content)

def _update_dns_record(domain_fqdn):
    """Register this host's A record in *domain_fqdn* through ``nsupdate``.

    The local IP address is taken from a UDP socket connected to port 53 of
    the domain. The nsupdate command is attempted up to 60 times, 10 seconds
    apart; the function returns silently if every attempt fails.

    :param domain_fqdn: DNS domain whose server receives the update.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        while True:
            try:
                s.connect((domain_fqdn, 53))
                break
            except Exception as e:
                waagent.Log('Failed to connect to {0}:53: {1}'.format(domain_fqdn, e))
                # Wait before retrying; without a pause this loop spins at
                # full speed and floods the log while the name is unresolvable.
                time.sleep(10)
        ipaddr = s.getsockname()[0]
    finally:
        # The socket only serves to discover the local address.
        s.close()
    host_fqdn = "{0}.{1}".format(socket.gethostname().split('.')[0], domain_fqdn)
    dns_cmd = 'echo -e "server {0}\nzone {0}\nupdate delete {1}\nupdate add {1} 864000 A {2}\nsend\n" | nsupdate -v'.format(domain_fqdn, host_fqdn, ipaddr)
    waagent.Log('The command to update ip to dns server is: {0}'.format(dns_cmd))
    retry = 0
    while retry < 60:
        dns_ret, dns_msg = waagent.RunGetOutput(dns_cmd)
        if dns_ret == 0:
            waagent.Log("Succeeded to update ip to dns server.")
            return
        else:
            retry = retry + 1
            waagent.Log("Failed to update ip to dns server: {0}, {1}".format(dns_ret, dns_msg))
            time.sleep(10)

def _mount_cgroup():
    """Create /cgroup if needed and mount the cgroup filesystem on it when it is empty.

    :raises Exception: when the mount command returns a non-zero exit code.
    """
    mount_point = '/cgroup'
    if not os.path.isdir(mount_point):
        os.mkdir(mount_point)
    # A non-empty directory is treated as already mounted.
    if os.listdir(mount_point):
        waagent.Log("/cgroup directory was already mounted.")
        return
    status, output = waagent.RunGetOutput('mount -t cgroup cgroup /cgroup')
    waagent.Log("mount /cgroup directory {0}:{1}".format(status, output))
    if status != 0:
        raise Exception("failed to mount /cgroup directory")
    waagent.Log("/cgroup directory is successfully mounted.")

def config_firewall_rules():
    """Open TCP ports 40000 and 40002 in the firewall when DistroName is 'redhat'.

    Major versions below 7 use lokkit; otherwise firewall-cmd is used, but
    only when ``firewall-cmd --state`` succeeds.
    """
    if DistroName != 'redhat':
        return
    waagent.Log('Configuring the firewall rules')
    if int(DistroVersion.split('.')[0]) < 7:
        for lokkit_cmd in ('lokkit --port=40000:tcp --update',
                           'lokkit --port=40002:tcp --update'):
            waagent.Run(lokkit_cmd, chk_err=False)
        return
    if waagent.Run("firewall-cmd --state", chk_err=False) != 0:
        return
    for firewall_cmd in ("firewall-cmd --permanent --zone=public --add-port=40000/tcp",
                         "firewall-cmd --permanent --zone=public --add-port=40002/tcp",
                         "firewall-cmd --reload"):
        waagent.Run(firewall_cmd)

def parse_context(operation):
    """Build a HandlerUtility for this extension and parse the context of *operation*.

    :param operation: operation name passed to ``do_parse_context``.
    :return: the HandlerUtility instance.
    """
    handler_util = Util.HandlerUtility(waagent.Log, waagent.Error, ExtensionShortName)
    handler_util.do_parse_context(operation)
    return handler_util


def cmpFileHash(file1, file2):
    """Return True when both paths are existing files with equal MD5 digests.

    :param file1: path of the first file.
    :param file2: path of the second file.
    :return: False when either path is not a file or the digests differ.
    """
    if not os.path.isfile(file1) or not os.path.isfile(file2):
        return False

    def _md5_hex(path):
        # Digest of the whole file content, read in binary mode.
        with open(path, 'rb') as stream:
            return hashlib.md5(stream.read()).hexdigest()

    first_digest = _md5_hex(file1)
    second_digest = _md5_hex(file2)
    return first_digest == second_digest


def install():
    """Handle the extension 'Install' operation.

    Steps, in order: remove HPC host entries and previously installed files,
    install the required OS packages, copy the payload from ./bin into
    InstallRoot, apply the configured host name, import the SSL certificate
    when 'SSLThumbprint' is set, write nodemanager.json (and a backup copy in
    the current directory), configure the firewall and, with cgroup v2, the
    hpccgroot systemd service.

    The outcome is reported with hutil.do_exit; any Exception raised by a
    step is reported as an install error with the exception text.
    """
    hutil = parse_context('Install')
    try:
        cleanup_host_entries()
        _uninstall_nodemanager_files()
        if DistroName in ["centos", "redhat", "almalinux", "rocky"]:
            waagent.Run("yum-config-manager --setopt=\\*.skip_if_unavailable=1 --save", chk_err=False)
        _install_cgroup_tool()
        _install_sysstat()
        _install_pstree()
        
        logDir = os.path.join(InstallRoot, "logs")
        if not os.path.isdir(logDir):
            os.makedirs(logDir)
        srcDir = os.path.join(os.getcwd(), "bin")
        waagent.RunGetOutput("chmod +x {0}/*".format(srcDir))
        waagent.RunGetOutput("chmod +x {0}/lib/*".format(srcDir))
        for filename in os.listdir(srcDir):
            srcname = os.path.join(srcDir, filename)
            destname = os.path.join(InstallRoot, filename)
            if os.path.isfile(srcname):
                shutil.copy2(srcname, destname)
            elif os.path.isdir(srcname):
                shutil.copytree(srcname, destname)
        # Unpack every archive shipped in lib/ and drop the archive afterwards.
        libdir = os.path.join(InstallRoot, 'lib')
        for tmpname in os.listdir(libdir):
            tmppath = os.path.join(libdir, tmpname)
            if tmpname.endswith(".tar.gz") and os.path.isfile(tmppath):
                waagent.Run("tar xzvf {0} -C {1}".format(tmppath, libdir))
                os.remove(tmppath)
        waagent.Run("chmod -R 755 {0}".format(libdir))

        # Either settings section may be absent; treat that as empty so the
        # lookups below end in the explicit "not specified" error instead of
        # an AttributeError on None.
        handler_settings = hutil._context._config['runtimeSettings'][0]['handlerSettings']
        public_settings = handler_settings.get('publicSettings') or {}
        protect_settings = handler_settings.get('protectedSettings') or {}

        host_name = public_settings.get('HostName')
        backup_configfile = os.path.join(os.getcwd(), 'nodemanager.json')
        if not host_name:
            # if there is backup nodemanager.json, means it is an update install, if 'HostName' not defined in the extension
            # settings, we shall get from the backup nodemanager.json
            if os.path.isfile(backup_configfile):
                waagent.Log("Backup nodemanager configuration file found")
                host_name = gethostname_from_configfile(backup_configfile)

        curhostname = socket.gethostname().split('.')[0]
        if host_name:
            if host_name.lower() != curhostname.lower():
                waagent.Log("HostName was set: hostname from {0} to {1}".format(curhostname, host_name))
                osutil.set_hostname(host_name)
                osutil.publish_hostname(host_name)
        else:
            host_name = curhostname
        authentication_key = protect_settings.get('AuthenticationKey')
        authentication_key = authentication_key if authentication_key is not None else ""
        cluster_connstring = public_settings.get('ClusterConnectionString')
        if not cluster_connstring:
            waagent.Log("ClusterConnectionString is not specified")
            cluster_connstring = protect_settings.get('ClusterName')
            if not cluster_connstring:
                error_msg = "neither ClusterConnectionString nor ClusterName is specified."
                hutil.error(error_msg)
                raise ValueError(error_msg)
        ssl_thumbprint = public_settings.get('SSLThumbprint')
        certsdir = os.path.join(InstallRoot, "certs")
        if not ssl_thumbprint:
            api_prefix = "http://{0}:80/HpcLinux/api/"
            listen_uri = "http://0.0.0.0:40000"
        else:
            api_prefix = "https://{0}:443/HpcLinux/api/"
            listen_uri = "https://0.0.0.0:40002"
            # import the ssl certificate for hpc nodemanager
            if not os.path.isdir(certsdir):
                os.makedirs(certsdir, 0o750)
            else:
                os.chmod(certsdir, 0o750)
            ssl_thumbprint = ssl_thumbprint.upper()
            prvfile = os.path.join("/var/lib/waagent", ssl_thumbprint + ".prv")
            srccrtfile = os.path.join("/var/lib/waagent", ssl_thumbprint + ".crt")
            rsakeyfile = os.path.join(certsdir, "nodemanager_rsa.key")
            dstcrtfile = os.path.join(certsdir, "nodemanager.crt")
            # Re-import only when the private key differs from the one already in place.
            if os.path.isfile(prvfile) and not cmpFileHash(prvfile, rsakeyfile):
                waagent.Run("rm -rf {0}/nodemanager.crt {0}/nodemanager.key {0}/nodemanager.pem {0}/nodemanager_rsa.key".format(certsdir), chk_err=False)
                shutil.copy2(prvfile, rsakeyfile)
                shutil.copy2(srccrtfile, dstcrtfile)
                shutil.copy2(dstcrtfile, os.path.join(certsdir, "nodemanager.pem"))
                waagent.Run("openssl rsa -in {0}/nodemanager_rsa.key -out {0}/nodemanager.key".format(certsdir))
                waagent.Run("chmod 640 {0}/nodemanager.crt {0}/nodemanager.key {0}/nodemanager.pem {0}/nodemanager_rsa.key".format(certsdir))

        # The '{0}' placeholder of api_prefix is deliberately left in the URIs
        # written to the configuration file.
        node_uri = api_prefix + host_name + "/computenodereported"
        reg_uri = api_prefix + host_name + "/registerrequested"
        hostsfile_uri = api_prefix + "hostsfile"
        metric_ids_uri = api_prefix + host_name + "/getinstanceids"
        namingSvcUris = ['https://{0}:443/HpcNaming/api/fabric/resolve/singleton/'.format(h.split('.')[0].strip()) for h in cluster_connstring.split(',')]
        if os.path.isfile(backup_configfile):
            # Update install: start from the backed-up configuration and only
            # refresh the URI settings.
            with open(backup_configfile, 'r') as F:
                configjson = json.load(F)
            configjson["NamingServiceUri"] = namingSvcUris
            configjson["HeartbeatUri"] = node_uri
            configjson["RegisterUri"] = reg_uri
            configjson["HostsFileUri"] = hostsfile_uri
            configjson["MetricInstanceIdsUri"] = metric_ids_uri
            configjson["MetricUri"] = ""
            configjson["ListeningUri"] = listen_uri
        else:
            configjson = {
              "ConfigVersion": "1.0",
              "NamingServiceUri": namingSvcUris,
              "HeartbeatUri": node_uri,
              "RegisterUri": reg_uri,
              "MetricUri": "",
              "MetricInstanceIdsUri": metric_ids_uri,
              "HostsFileUri": hostsfile_uri,
              "HostsFetchInterval": 120,
              "ListeningUri": listen_uri,
              "DefaultServiceName": "SchedulerStatefulService",
              "UdpMetricServiceName": "MonitoringStatefulService"
            }
        if ssl_thumbprint:
            configjson["TrustedCAFile"] = os.path.join(certsdir, "nodemanager.pem")
            configjson["CertificateChainFile"] = os.path.join(certsdir, "nodemanager.crt")
            configjson["PrivateKeyFile"] = os.path.join(certsdir, "nodemanager.key")
        if authentication_key:
            configjson["ClusterAuthenticationKey"] = authentication_key
        configfile = os.path.join(InstallRoot, 'nodemanager.json')
        waagent.SetFileContents(configfile, json.dumps(configjson))
        shutil.copy2(configfile, backup_configfile)
        config_firewall_rules()
        if CGroupV2:
            shutil.copy2(os.path.join(InstallRoot, "hpccgroot.service"), "/etc/systemd/system/hpccgroot.service")
            waagent.Run("systemctl daemon-reload")
            waagent.Run("systemctl enable hpccgroot.service")
            waagent.Run("systemctl restart hpccgroot.service")
        hutil.do_exit(0, 'Install', 'success', '0', 'Install Succeeded.')
    except Exception as e:
        hutil.do_exit(1, 'Install','error','1', '{0}'.format(e))

def enable():
    """Handle the extension 'Enable' operation by (re)starting the daemon.

    A daemon recorded in the pid file is killed when it is still running, the
    pid file is removed, and this script is relaunched with the ``daemon``
    argument in a new session. Success is reported when the new process is
    still running three seconds after launch.
    """
    #Always restart daemon and clear PID file
    hutil = parse_context('Enable')
    if os.path.isfile(DaemonPidFilePath):
        old_pid = waagent.GetFileContents(DaemonPidFilePath)
        still_running = os.path.isdir(os.path.join("/proc", old_pid))
        if still_running and _is_nodemanager_daemon(old_pid):
            waagent.Log("Stop old daemon: {0}".format(old_pid))
            os.killpg(int(old_pid), 9)
        os.remove(DaemonPidFilePath)

    interpreter = get_python_executor()
    script_path = os.path.join(os.getcwd(), __file__)
    command = [interpreter, script_path, "daemon"]
    sink = open(os.devnull, 'w')
    # os.setsid makes the daemon the leader of its own session.
    child = subprocess.Popen(command, stdout=sink, stderr=sink, preexec_fn=os.setsid)
    if child.pid is None or child.pid < 1:
        hutil.do_exit(1, 'Enable', 'error', '1',
                      'Failed to launch HPC Linux node manager daemon')
        return

    hutil.save_seq()
    waagent.SetFileContents(DaemonPidFilePath, str(child.pid))
    #Sleep 3 seconds to check if the process is still running
    time.sleep(3)
    if child.poll() is not None:
        hutil.do_exit(1, 'Enable', 'error', '2',
                      'Failed to launch HPC Linux node manager daemon')
        return
    waagent.Log("Daemon pid: {0}".format(child.pid))
    hutil.do_exit(0, 'Enable', 'success', '0',
                  'HPC Linux node manager daemon is enabled')

def daemon():
    """Run the node manager supervisor loop (the ``daemon`` operation).

    After preparing the cgroup v2 hierarchy (when enabled), the DNS search
    domain, the SUSE hosts file and the /cgroup mount for CentOS/RedHat
    versions below 7, the node manager executable is started and restarted
    RestartIntervalInSeconds seconds after every exit. The loop only ends
    through an exception, which is reported as an 'Enable' error.
    """
    hutil = parse_context('Enable')

    if CGroupV2:
        service_dir = '/sys/fs/cgroup/hpcpack.slice/hpccgroot.service/service'
        if not os.path.exists(service_dir):
            os.makedirs(service_dir)
        # Move every process of hpccgroot.service into the 'service' child
        # group before enabling controllers for the subtree.
        with open('/sys/fs/cgroup/hpcpack.slice/hpccgroot.service/cgroup.procs', 'r') as f:
            pids = f.read().splitlines()
        for pid in pids:
            waagent.Run("echo {0} > /sys/fs/cgroup/hpcpack.slice/hpccgroot.service/service/cgroup.procs".format(pid))
        waagent.Run('echo "+cpu +cpuset +memory" > /sys/fs/cgroup/hpcpack.slice/hpccgroot.service/cgroup.subtree_control')

    try:
        # Either settings section may be absent; treat that as empty instead
        # of failing with an AttributeError on None.
        handler_settings = hutil._context._config['runtimeSettings'][0]['handlerSettings']
        public_settings = handler_settings.get('publicSettings') or {}
        domain_fqdn = public_settings.get('DomainName')
        if not domain_fqdn:
            cluster_connstring = public_settings.get('ClusterConnectionString')
            if not cluster_connstring:
                waagent.Log("ClusterConnectionString is not specified, use ClusterName instead")
                protect_settings = handler_settings.get('protectedSettings') or {}
                cluster_connstring = protect_settings.get('ClusterName')
            if cluster_connstring:
                headnode_name = cluster_connstring.split(',')[0].strip()
                if headnode_name.find('.') > 0:
                    # The head node name is FQDN, extract the domain FQDN
                    domain_fqdn = headnode_name.split(".", 1)[1]

        if domain_fqdn:
            waagent.Log("The domain FQDN is " + domain_fqdn)
            _add_dns_search(domain_fqdn)
            #thread.start_new_thread(_update_dns_record, (domain_fqdn,))

        # A fix only for SUSE Linux that sometimes the hostname got changed because out-of-date host/IP entry in /etc/hosts
        # It may happen when the node was assigned a different IP after deallocation
        # We shall clean the current HPC related host/IP entries and add the actual IPs before fetching the hosts file from head node.
        if DistroName == 'suse':
            configfile = os.path.join(InstallRoot, 'nodemanager.json')
            curhostname = socket.gethostname().split('.')[0]
            confighostname = gethostname_from_configfile(configfile)
            if not confighostname:
                # No configuration file or no 'RegisterUri' in it: keep the
                # current host name rather than failing on None below.
                waagent.Log("No hostname found in {0}, keep the current hostname {1}".format(configfile, curhostname))
                confighostname = curhostname
            if confighostname.lower() != curhostname.lower():
                cleanup_host_entries()
                waagent.Log("Correct the hostname from {0} to {1}".format(curhostname, confighostname))
                osutil.set_hostname(confighostname)
                osutil.publish_hostname(confighostname)
            retry = 0
            while True:
                nics = get_networkinterfaces()
                if len(nics) > 0:
                    init_suse_hostsfile(confighostname, [nic[1] for nic in nics])
                    break
                elif retry < 30:
                    waagent.Log("Failed to get network interfaces information, retry later ...")
                    time.sleep(2)
                    retry = retry + 1
                else:
                    waagent.Log("Failed to get network interfaces information, just clean")
                    break
        # Mount the directory /cgroup for centos 6.*
        major_version = int(DistroVersion.split('.')[0])
        if (DistroName == 'centos' or DistroName == 'redhat') and major_version < 7:
            _mount_cgroup()
        exe_path = os.path.join(InstallRoot, "nodemanager")
        while True:
            # The child keeps its own copies of the /dev/null descriptors, so
            # the parent's handle is closed right after the launch; otherwise
            # one descriptor would leak on every restart.
            with open(os.devnull, 'w') as devnull:
                child_process = subprocess.Popen(exe_path, stdout=devnull, stderr=devnull, cwd=InstallRoot)
            if child_process.pid is None or child_process.pid < 1:
                exit_msg = 'Failed to start HPC node manager process'
                hutil.do_status_report('Enable', 'error', 1, exit_msg)
            else:
                #Sleep 1 second to check if the process is still running
                time.sleep(1)
                if child_process.poll() is None:
                    hutil.do_status_report('Enable', 'success', 0, "")
                    waagent.Log('HPC node manager process started')
                    exit_code = child_process.wait()
                    exit_msg = "HPC node manager process exits: {0}".format(exit_code)
                    hutil.do_status_report('Enable', 'warning', exit_code, exit_msg)
                else:
                    exit_msg = "HPC node manager process crashes: {0}".format(child_process.returncode)
                    hutil.do_status_report('Enable', 'error', child_process.returncode, exit_msg)
            waagent.Log(exit_msg)
            waagent.Log("Restart HPC node manager process after {0} seconds".format(RestartIntervalInSeconds))
            time.sleep(RestartIntervalInSeconds)

    except Exception as e:
        hutil.error("Failed to enable the extension with error: %s, stack trace: %s" %(str(e), traceback.format_exc()))
        hutil.do_exit(1, 'Enable','error','1', 'Enable failed.')

def uninstall():
    """Handle the extension 'Uninstall' operation.

    Removes the installed files and the HPC host entries and, when its unit
    file exists, stops, disables and removes the hpccgroot systemd service.
    """
    hutil = parse_context('Uninstall')
    _uninstall_nodemanager_files()
    cleanup_host_entries()
    unit_file = '/etc/systemd/system/hpccgroot.service'
    if os.path.isfile(unit_file):
        for systemctl_cmd in ("systemctl stop hpccgroot.service",
                              "systemctl disable hpccgroot.service"):
            waagent.Run(systemctl_cmd)
        os.remove(unit_file)
        for systemctl_cmd in ("systemctl reset-failed",
                              "systemctl daemon-reload"):
            waagent.Run(systemctl_cmd)
    hutil.do_exit(0,'Uninstall','success','0', 'Uninstall succeeded')

def disable():
    """Handle the extension 'Disable' operation.

    When the pid file names a running node manager daemon, its process group
    is killed and the HPC host entries are removed. The pid file is deleted
    in either case, and the result is reported exactly once through
    hutil.do_exit, so the flow does not depend on do_exit never returning.
    """
    hutil = parse_context('Disable')
    message = 'HPC node manager daemon is not running'
    #Check whether monitor process is running.
    #If it does, kill it. Otherwise clear pid file
    if os.path.isfile(DaemonPidFilePath):
        pid = waagent.GetFileContents(DaemonPidFilePath)
        daemon_running = os.path.isdir(os.path.join("/proc", pid)) and _is_nodemanager_daemon(pid)
        if daemon_running:
            waagent.Log(("Stop HPC node manager daemon: {0}").format(pid))
            os.killpg(int(pid), 9)
        # Remove the pid file once, whether or not a daemon was found.
        os.remove(DaemonPidFilePath)
        if daemon_running:
            cleanup_host_entries()
            message = 'HPC node manager daemon is disabled'

    hutil.do_exit(0, 'Disable', 'success', '0', message)

def update():
    """Handle the extension 'Update' operation.

    Removes the HPC host entries and, when nodemanager.json exists under
    InstallRoot, copies it into the current directory as a backup. On SUSE
    the host name stored in that file is re-applied when it differs from the
    current one.
    """
    hutil = parse_context('Update')
    cleanup_host_entries()
    configfile = os.path.join(InstallRoot, 'nodemanager.json')
    if os.path.isfile(configfile):
        waagent.Log("Update extension: backup the nodemanager configuration file.")
        shutil.copy2(configfile, os.getcwd())
        # A fix only for SUSE Linux that sometimes the hostname got changed because out-of-date host/IP entry in /etc/hosts
        # It may happen when the node was assigned a different IP after deallocation
        if DistroName == 'suse':
            wanted_name = gethostname_from_configfile(configfile)
            # The current name is only looked up when the file yielded a name.
            current_name = socket.gethostname().split('.')[0] if wanted_name else ''
            if wanted_name and wanted_name.lower() != current_name.lower():
                waagent.Log("Update: Set the hostname from {0} to {1}".format(current_name, wanted_name))
                osutil.set_hostname(wanted_name)
                osutil.publish_hostname(wanted_name)
    hutil.do_exit(0,'Update','success','0', 'Update Succeeded')

def get_python_executor():
    """Return the interpreter command used to launch the daemon.

    The command is 'python2' or 'python3' according to the running major
    version (empty for any other version). When that command is not found
    and /usr/libexec/platform-python is, the latter is returned instead.
    """
    interpreter = {2: 'python2', 3: 'python3'}.get(sys.version_info.major, '')
    interpreter_found = waagent.Run("command -v {0}".format(interpreter), chk_err=False) == 0
    if not interpreter_found:
        # If a user-installed python isn't available, check for a platform-python. This is typically only used in RHEL 8.0.
        if waagent.Run("command -v /usr/libexec/platform-python", chk_err=False) == 0:
            interpreter = '/usr/libexec/platform-python'
    return interpreter

def get_dist_info():
    """Return distro information as a tuple whose first two items are name and version.

    The result of ``get_distro()`` is returned when that call succeeds.
    Otherwise the output of ``cat /etc/*-release`` is parsed: the name is
    derived from the PRETTY_NAME line and the version from the VERSION_ID
    line, and ``(name, version, "")`` is returned.

    :raises Exception: when the release files cannot be read or PRETTY_NAME
                       names no known distribution.
    """
    try:
        return get_distro()
    except Exception as e:
        # Best effort: fall back to parsing the release files below, but
        # leave a trace of why the primary lookup failed.
        waagent.Log("get_distro() failed, parsing /etc/*-release instead: {0}".format(e))
    errCode, info = waagent.RunGetOutput("cat /etc/*-release")
    if errCode != 0:
        raise Exception('Failed to get Linux Distro info by running command "cat /etc/*-release", error code: {}'.format(errCode))
    # (keyword looked for in PRETTY_NAME, distro name used by this file),
    # checked in order. AlmaLinux maps to 'almalinux', the name the package
    # installation code in this file checks for.
    known_distros = (
        ('ubuntu', 'ubuntu'),
        ('centos', 'centos'),
        ('red hat', 'redhat'),
        ('suse', 'suse'),
        ('alma', 'almalinux'),
        ('rocky', 'rocky'),
        ('fedora', 'fedora'),
        ('freebsd', 'freebsd'),
    )
    distroName = ''
    distroVersion = ''
    for line in info.splitlines():
        if line.startswith('PRETTY_NAME='):
            pretty_name = line.lower()
            for keyword, name in known_distros:
                if keyword in pretty_name:
                    distroName = name
                    break
            else:
                raise Exception('Unknown linux distribution with {}'.format(pretty_name))
        elif line.startswith('VERSION_ID='):
            # The value may or may not be quoted; accept both forms.
            distroVersion = line.split('=', 1)[1].strip().strip('"\'')
    return distroName, distroVersion, ""

  
# Run the handler only when this file is executed as a script.
if __name__ == '__main__' :
    main()

