def manage_cluster()

in src/slurm_plugin/clustermgtd.py [0:0]


    def manage_cluster(self):
        """
        Run one cluster management pass: sync scheduler state with EC2 and perform node maintenance.

        The pass always records the current UTC time in ``self._current_time`` and processes compute fleet
        status transitions first. The remaining management actions run only when cluster management is not
        disabled in the config and the compute fleet status is ``None``, ``RUNNING`` or ``PROTECTED``;
        otherwise only the heartbeat timestamp is written.

        If node/partition info cannot be retrieved from the scheduler, or instance info cannot be retrieved
        from EC2, the error is logged and the method returns immediately WITHOUT writing the heartbeat.
        """
        log.info("Managing cluster...")
        self._current_time = datetime.now(tz=timezone.utc)

        self._manage_compute_fleet_status_transitions()

        # Guard clause: when management is disabled or the fleet is not in a managed status there is
        # nothing to sync, but the heartbeat is still written.
        if self._config.disable_all_cluster_management or self._compute_fleet_status not in {
            None,
            ComputeFleetStatus.RUNNING,
            ComputeFleetStatus.PROTECTED,
        }:
            self._write_timestamp_to_file()
            return

        # Scheduler view: nodes and the partitions built from them.
        try:
            log.info("Retrieving nodes info from the scheduler")
            scheduler_nodes = self._get_node_info_with_retry()
            log.debug("Nodes: %s", scheduler_nodes)
            partitions_by_name = self._retrieve_scheduler_partitions(scheduler_nodes)
        except Exception as err:
            # Any failure here aborts the pass; the heartbeat is intentionally not reached.
            log.error(
                "Unable to get partition/node info from slurm, no other action can be performed. Sleeping... "
                "Exception: %s",
                err,
            )
            return

        # EC2 view: instances currently backing the cluster.
        try:
            ec2_instances = self._get_ec2_instances()
        except ClusterManager.EC2InstancesInfoUnavailable:
            log.error("Unable to get instances info from EC2, no other action can be performed. Sleeping...")
            return
        log.debug("Current cluster instances in EC2: %s", ec2_instances)

        partition_list = list(partitions_by_name.values())
        self._update_slurm_nodes_with_ec2_info(scheduler_nodes, ec2_instances)

        # Maintenance steps, in order: inactive partitions, health checks (unless disabled),
        # node maintenance, and finally orphaned instances.
        self._clean_up_inactive_partition(partition_list)
        if not self._config.disable_all_health_checks:
            self._perform_health_check_actions(partition_list)
        self._maintain_nodes(partitions_by_name)
        self._terminate_orphaned_instances(ec2_instances)

        # Pass completed: write clustermgtd heartbeat to file.
        self._write_timestamp_to_file()