in VMExtension/hpcnodemanager.py [0:0]
def daemon():
    """Prepare the node and supervise the HPC node manager process.

    Steps, in order:

    1. When ``CGroupV2`` is set, move every process listed in the
       ``hpccgroot.service`` cgroup into a ``service`` child cgroup and
       enable the cpu, cpuset and memory controllers for the subtree.
    2. Determine the domain FQDN from the extension settings (``DomainName``,
       else derived from the head node name in ``ClusterConnectionString`` /
       ``ClusterName``) and register it through ``_add_dns_search``.
    3. On SUSE, restore the hostname recorded in ``nodemanager.json`` if the
       current one differs, then initialise the hosts file from the current
       network interfaces.
    4. On CentOS/RedHat with a major version below 7, call ``_mount_cgroup``.
    5. Launch ``nodemanager`` from ``InstallRoot`` and restart it after
       ``RestartIntervalInSeconds`` every time it exits, reporting the
       'Enable' status on each transition.

    The supervision loop has no exit condition of its own. Any exception
    raised inside the ``try`` block is logged with its stack trace and
    reported through ``hutil.do_exit``.
    """
    hutil = parse_context('Enable')
    if CGroupV2:
        service_dir = '/sys/fs/cgroup/hpcpack.slice/hpccgroot.service/service'
        if not os.path.exists(service_dir):
            os.makedirs(service_dir)
        with open('/sys/fs/cgroup/hpcpack.slice/hpccgroot.service/cgroup.procs', 'r') as f:
            pids = f.read().splitlines()
        # Empty the parent cgroup first: presumably required because cgroup v2
        # refuses to enable controllers in subtree_control while the cgroup
        # still has member processes -- TODO confirm against kernel docs.
        for pid in pids:
            waagent.Run("echo {0} > /sys/fs/cgroup/hpcpack.slice/hpccgroot.service/service/cgroup.procs".format(pid))
        waagent.Run('echo "+cpu +cpuset +memory" > /sys/fs/cgroup/hpcpack.slice/hpccgroot.service/cgroup.subtree_control')
    try:
        handler_settings = hutil._context._config['runtimeSettings'][0]['handlerSettings']
        public_settings = handler_settings.get('publicSettings')
        domain_fqdn = public_settings.get('DomainName')
        if not domain_fqdn:
            cluster_connstring = public_settings.get('ClusterConnectionString')
            if not cluster_connstring:
                waagent.Log("ClusterConnectionString is not specified, use ClusterName instead")
                protect_settings = handler_settings.get('protectedSettings')
                cluster_connstring = protect_settings.get('ClusterName')
            # The connection string is a comma separated list; only the first
            # entry is inspected.
            headnode_name = cluster_connstring.split(',')[0].strip()
            if headnode_name.find('.') > 0:
                # The head node name is FQDN, extract the domain FQDN
                domain_fqdn = headnode_name.split(".", 1)[1]
        if domain_fqdn:
            waagent.Log("The domain FQDN is " + domain_fqdn)
            _add_dns_search(domain_fqdn)
            #thread.start_new_thread(_update_dns_record, (domain_fqdn,))

        # A fix only for SUSE Linux that sometimes the hostname got changed because out-of-date host/IP entry in /etc/hosts
        # It may happen when the node was assigned a different IP after deallocation
        # We shall clean the current HPC related host/IP entries and add the actual IPs before fetching the hosts file from head node.
        if DistroName == 'suse':
            configfile = os.path.join(InstallRoot, 'nodemanager.json')
            confighostname = gethostname_from_configfile(configfile)
            curhostname = socket.gethostname().split('.')[0]
            if confighostname.lower() != curhostname.lower():
                cleanup_host_entries()
                waagent.Log("Correct the hostname from {0} to {1}".format(curhostname, confighostname))
                osutil.set_hostname(confighostname)
                osutil.publish_hostname(confighostname)
            # NOTE(review): the hosts file initialisation is assumed to run on
            # every SUSE start, not only after a hostname correction -- confirm
            # against the original indentation.
            # Poll for up to 30 retries, 2 seconds apart, until at least one
            # network interface is reported.
            retry = 0
            while True:
                nics = get_networkinterfaces()
                if nics:
                    init_suse_hostsfile(confighostname, [nic[1] for nic in nics])
                    break
                elif retry < 30:
                    waagent.Log("Failed to get network interfaces information, retry later ...")
                    time.sleep(2)
                    retry = retry + 1
                else:
                    waagent.Log("Failed to get network interfaces information, just clean")
                    break

        # Mount the directory /cgroup for centos 6.*
        major_version = int(DistroVersion.split('.')[0])
        if (DistroName == 'centos' or DistroName == 'redhat') and major_version < 7:
            _mount_cgroup()

        exe_path = os.path.join(InstallRoot, "nodemanager")
        while True:
            # The child receives its own copies of the descriptors when Popen
            # returns, so the parent's handle is closed right away instead of
            # leaking one descriptor per restart.
            with open(os.devnull, 'w') as devnull:
                child_process = subprocess.Popen(exe_path, stdout=devnull, stderr=devnull, cwd=InstallRoot)
            if child_process.pid is None or child_process.pid < 1:
                exit_msg = 'Failed to start HPC node manager process'
                hutil.do_status_report('Enable', 'error', 1, exit_msg)
            else:
                #Sleep 1 second to check if the process is still running
                time.sleep(1)
                if child_process.poll() is None:
                    hutil.do_status_report('Enable', 'success', 0, "")
                    waagent.Log('HPC node manager process started')
                    # Blocks for the whole lifetime of the node manager.
                    exit_code = child_process.wait()
                    exit_msg = "HPC node manager process exits: {0}".format(exit_code)
                    hutil.do_status_report('Enable', 'warning', exit_code, exit_msg)
                else:
                    # Exited within the first second: treated as a crash.
                    exit_msg = "HPC node manager process crashes: {0}".format(child_process.returncode)
                    hutil.do_status_report('Enable', 'error', child_process.returncode, exit_msg)
            waagent.Log(exit_msg)
            waagent.Log("Restart HPC node manager process after {0} seconds".format(RestartIntervalInSeconds))
            time.sleep(RestartIntervalInSeconds)
    except Exception as e:
        hutil.error("Failed to enable the extension with error: %s, stack trace: %s" %(str(e), traceback.format_exc()))
        hutil.do_exit(1, 'Enable','error','1', 'Enable failed.')