in src/slurm_plugin/capacity_block_manager.py [0:0]
def get_reserved_nodenames(self, nodes: List[SlurmNode]):
    """
    Manage nodes part of capacity block reservation. Returns list of reserved nodes.

    When `_is_time_to_update` says a refresh is due, this method:
      1. retrieves the capacity blocks from the fleet configuration;
      2. if there are any, updates their details from EC2 and associates the given
         nodes to them;
      3. creates/updates/deletes the Slurm reservation of every capacity block,
         counting the failures in `self._slurm_reservation_update_errors`;
      4. stores the new capacity blocks, update time and reserved nodenames on the
         object, unless every Slurm reservation action failed;
      5. cleans up leftover Slurm reservations.

    When no refresh is due, or when any step raises, the previously stored list is
    returned unchanged. Errors are logged and never propagated to the caller.

    :param nodes: Slurm nodes to be associated to the capacity blocks
    :return: the content of `self._reserved_nodenames`, i.e. the names of the nodes
        belonging to capacity blocks that are not active and whose Slurm reservation
        has been correctly created/updated
    """
    try:
        # evaluate if it's the moment to update info
        now = datetime.now(tz=timezone.utc)
        if self._is_time_to_update(now):
            reserved_nodenames = []
            self._slurm_reservation_update_errors = 0

            # find an updated list of capacity blocks from fleet config
            capacity_blocks = self._retrieve_capacity_blocks_from_fleet_config()
            if capacity_blocks:
                # update capacity blocks details from ec2 (e.g. state)
                self._update_capacity_blocks_info_from_ec2(capacity_blocks)
                # associate nodenames to capacity blocks,
                # according to queues and compute resources from fleet configuration
                self._associate_nodenames_to_capacity_blocks(capacity_blocks, nodes)

                # create, update or delete slurm reservation for the nodes according to CB details.
                for capacity_block in capacity_blocks.values():
                    slurm_reservation_updated = self._update_slurm_reservation(
                        capacity_block=capacity_block, do_update=not self._is_initialized()
                    )
                    if not slurm_reservation_updated:
                        self._slurm_reservation_update_errors += 1
                    elif not capacity_block.is_active():
                        # If CB is not yet active or expired add nodes to list of reserved nodes,
                        # only if slurm reservation has been correctly created/updated
                        reserved_nodenames.extend(capacity_block.nodenames())

            # If all Slurm reservation actions failed do not update object attributes.
            # The "== 0" clause covers the case with no capacity blocks at all,
            # where both the error counter and the number of capacity blocks are 0.
            if (
                self._slurm_reservation_update_errors != len(capacity_blocks)
                or self._slurm_reservation_update_errors == 0
            ):
                # Once all the steps have been successful, update object attributes
                self._capacity_blocks = capacity_blocks
                self._capacity_blocks_update_time = now
                self._reserved_nodenames = reserved_nodenames

            # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks
            self._cleanup_leftover_slurm_reservations()

    except (SlurmCommandError, CapacityBlockManagerError) as e:
        # Expected failures: the exception message is enough to diagnose them.
        logger.error(
            "Unable to retrieve list of reserved nodes, maintaining old list: %s. %s",
            self._reserved_nodenames,
            e,
        )
    except Exception as e:
        # Unexpected failures: log at ERROR level together with the traceback,
        # otherwise only str(e) would be recorded and the origin of the error would be lost.
        logger.exception(
            "Unexpected error. Unable to retrieve list of reserved nodes, maintaining old list: %s. %s",
            self._reserved_nodenames,
            e,
        )

    return self._reserved_nodenames