in src/slurm_plugin/clustermgtd.py [0:0]
def _perform_health_check_actions(self, partitions: List[SlurmPartition]):
    """Run the enabled per-instance health checks against active cluster nodes.

    Builds the instance-id -> active-node mapping from the given partitions,
    fetches health status for those instances, then dispatches each health
    check type (EC2 status, scheduled events) that is not disabled in config.
    """
    log.info("Performing instance health check actions")
    id_to_node = ClusterManager.get_instance_id_to_active_node_map(partitions)
    # Nothing to check if no active nodes are backed by instances.
    if not id_to_node:
        return
    # Only instances flagged by EC2 as potentially unhealthy are returned.
    unhealthy_statuses = self._instance_manager.get_unhealthy_cluster_instance_status(list(id_to_node.keys()))
    log.debug("Cluster instances that might be considered unhealthy: %s", unhealthy_statuses)
    if not unhealthy_statuses:
        return
    # Each check type pairs with the config flag that can disable it.
    health_checks = (
        (self._config.disable_ec2_health_check, ClusterManager.HealthCheckTypes.ec2_health),
        (self._config.disable_scheduled_event_health_check, ClusterManager.HealthCheckTypes.scheduled_event),
    )
    for check_disabled, check_type in health_checks:
        if not check_disabled:
            self._handle_health_check(
                unhealthy_statuses,
                id_to_node,
                health_check_type=check_type,
            )