in azure-slurm/slurmcc/topology.py [0:0]
def get_hostnames(self) -> None:
    """
    Validate the partition and collect its usable hostnames from SLURM.

    Checks ``self.partition`` against the partitions reported by ``sinfo``,
    expands the partition's node list with ``scontrol show hostnames`` and
    removes every node that ``sinfo`` reports in one of the excluded states
    (powered down, powering up/down, drained/draining, down, failed, not
    responding, rebooting or unknown). The remaining hostnames are stored
    in ``self.hosts`` as a sorted list so the result is deterministic.

    Raises:
        SystemExit: If a SLURM command fails or times out, if the partition
            does not exist, or if fewer than 2 usable hosts remain.

    Logs:
        Error when a SLURM command fails, when the partition is unknown, or
            when fewer than 2 usable hosts remain.
        Warning with the excluded nodes when some hosts were filtered out.
        Debug information for the known partitions, the original host list
            and the filtered host list.
    """
    def run_or_exit(cmd):
        # Nothing useful can be done without the scheduler's answer, so a
        # failed or timed-out command is logged and terminates the program.
        try:
            return slutil.run(cmd, shell=True)
        except (subprocesslib.CalledProcessError,
                subprocesslib.TimeoutExpired) as err:
            log.error("Command '%s' failed: %s", cmd, err)
            sys.exit(1)

    def nonempty_lines(text) -> list:
        # splitlines() copes with output that lacks a trailing newline;
        # blank lines are dropped so they never become bogus names.
        return [line.strip() for line in text.splitlines() if line.strip()]

    def validate_partition(partition) -> None:
        # tr removes the '*' that sinfo appends to the default partition.
        output = run_or_exit("sinfo -o %P | tr -d '*'")
        # The first line is the column header printed by sinfo.
        partitions = set(nonempty_lines(output.stdout)[1:])
        log.debug("Valid Partitions: %s", partitions)
        if partition not in partitions:
            log.error("Partition %s does not exist", partition)
            sys.exit(1)
        log.debug("Partition %s exists", partition)

    def get_hostlist(cmd) -> set:
        # One hostname per output line.
        return set(nonempty_lines(run_or_exit(cmd).stdout))

    validate_partition(self.partition)

    # Node states that make a host unusable for building the topology.
    excluded_states = (
        "powered_down,powering_up,powering_down,power_down,drain,drained,"
        "draining,unknown,down,no_respond,fail,reboot"
    )
    host_cmd = (
        f'scontrol show hostnames $(sinfo -p {self.partition} -o "%N" -h)'
    )
    sinfo_cmd = f'sinfo -p {self.partition} -t {excluded_states} -o "%N" -h'
    down_cmd = f'scontrol show hostnames $({sinfo_cmd})'

    hosts = get_hostlist(host_cmd)
    down_hosts = get_hostlist(down_cmd)
    # Sorted so the host order does not depend on set iteration order.
    self.hosts = sorted(hosts - down_hosts)

    if len(self.hosts) < len(hosts):
        log.warning(
            "Some nodes were not fully powered up and idle, "
            "running on a subset of nodes that are powered on and idle"
        )
        log.warning("Excluded Nodes: %s", down_hosts)
    log.debug("Original hosts: %s", hosts)
    log.debug("Powered On and Idle Hosts: %s", self.hosts)

    if len(self.hosts) < 2:
        log.error(
            "Need at least 2 nodes to create slurm topology, "
            "fewer than 2 nodes were powered up and idle."
        )
        sys.exit(1)