def get_hostnames()

in azure-slurm/slurmcc/topology.py [0:0]


    def get_hostnames(self) -> None:
        """
        Validate the partition and collect its usable hostnames from the SLURM scheduler.

        ``self.partition`` is first checked against the partitions reported by
        ``sinfo``. The partition's node list is then expanded into individual
        hostnames with ``scontrol show hostnames``, and every host that ``sinfo``
        reports in one of the excluded states (powered down, powering up/down,
        drained/draining, down, unknown, not responding, failed, rebooting) is
        removed. The remaining hostnames are stored, sorted, in ``self.hosts``.

        Returns:
            None
        Raises:
            SystemExit: If a SLURM command fails or times out, if the partition
                does not exist, or if fewer than 2 usable hosts remain.
        Logs:
            Errors for failed commands and unknown partitions.
            Warnings listing the nodes that were excluded.
            Debug information for the list of hosts and the filtered valid hosts.
            Error if the number of usable hosts is less than 2.
        """
        def run_or_exit(cmd):
            """Run ``cmd`` through the shell; log the failure and exit(1) on error or timeout."""
            try:
                return slutil.run(cmd, shell=True)
            except (subprocesslib.CalledProcessError, subprocesslib.TimeoutExpired) as e:
                log.error("Command '%s' failed: %s", cmd, e)
                sys.exit(1)

        def non_empty_lines(text) -> list:
            """Split ``text`` into stripped lines, dropping blank ones."""
            # splitlines() does not depend on a trailing newline, so the last
            # entry is never lost the way split('\n')[:-1] could lose it.
            stripped = (line.strip() for line in text.splitlines())
            return [line for line in stripped if line]

        def validate_partition(partition) -> None:
            """Exit(1) unless ``partition`` is one of the partitions listed by sinfo."""
            output = run_or_exit("sinfo -o %P | tr -d '*'")
            # The first line is sinfo's column header, not a partition name.
            partitions = set(non_empty_lines(output.stdout)[1:])
            log.debug("Valid Partitions: %s", partitions)
            if partition not in partitions:
                log.error("Partition %s does not exist", partition)
                sys.exit(1)
            log.debug("Partition %s exists", partition)

        def get_hostlist(cmd) -> set:
            """Run ``cmd`` and return the set of hostnames it prints, one per line."""
            return set(non_empty_lines(run_or_exit(cmd).stdout))

        # The partition is validated against sinfo's own list before it is
        # interpolated into the shell commands below.
        validate_partition(self.partition)
        partition_cmd = f'-p {self.partition} '
        host_cmd = f'scontrol show hostnames $(sinfo -p {self.partition} -o "%N" -h)'
        # Node states that make a host unusable for topology generation.
        partition_states = "powered_down,powering_up,powering_down,power_down,drain,drained,draining,unknown,down,no_respond,fail,reboot"
        sinfo_cmd = f'sinfo {partition_cmd}-t {partition_states} -o "%N" -h'
        down_cmd = f'scontrol show hostnames $({sinfo_cmd})'

        hosts = get_hostlist(host_cmd)
        down_hosts = get_hostlist(down_cmd)
        # Sort so that the resulting host order is deterministic across runs.
        self.hosts = sorted(hosts - down_hosts)
        excluded = sorted(hosts & down_hosts)
        if excluded:
            log.warning(
                "Some nodes were not fully powered up and idle, "
                "running on a subset of nodes that are powered on and idle"
            )
            log.warning("Excluded Nodes: %s", excluded)
        log.debug("Original hosts: %s", sorted(hosts))
        log.debug("Powered On and Idle Hosts: %s", self.hosts)
        if len(self.hosts) < 2:
            log.error(
                "Need at least 2 nodes to create slurm topology, "
                "less than 2 nodes were powered up and idle. "
            )
            sys.exit(1)