in src/slurm_plugin/suspend.py
import argparse
import logging
import os
from datetime import datetime, timezone
from logging.config import fileConfig

# CONFIG_FILE_DIR and is_clustermgtd_heartbeat_valid are provided by the plugin's
# shared module; SlurmSuspendConfig is defined earlier in this file (omitted here).
from slurm_plugin.common import CONFIG_FILE_DIR, is_clustermgtd_heartbeat_valid

log = logging.getLogger(__name__)


def main():
    # Log to a default file until the logging configuration from the suspend config is applied.
    default_log_file = "/var/log/parallelcluster/slurm_suspend.log"
    logging.basicConfig(
        filename=default_log_file,
        level=logging.INFO,
        format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s",
    )
    log.info("SuspendProgram startup.")
    parser = argparse.ArgumentParser()
    parser.add_argument("nodes", help="Nodes to release")
    args = parser.parse_args()
    config_file = os.environ.get("CONFIG_FILE", os.path.join(CONFIG_FILE_DIR, "parallelcluster_slurm_suspend.conf"))
    suspend_config = SlurmSuspendConfig(config_file)
    try:
        # Configure the root logger from the logging config file, keeping existing loggers.
        fileConfig(suspend_config.logging_config, disable_existing_loggers=False)
    except Exception as e:
        log.warning(
            "Unable to configure logging from %s, using default settings and writing to %s.\nException: %s",
            suspend_config.logging_config,
            default_log_file,
            e,
        )
    log.info("Suspending following nodes. Clustermgtd will cleanup orphaned instances: %s", args.nodes)
    # SuspendProgram does not terminate instances itself; it only verifies that
    # clustermgtd is alive, since clustermgtd is responsible for the actual cleanup.
    current_time = datetime.now(tz=timezone.utc)
    if not is_clustermgtd_heartbeat_valid(
        current_time, suspend_config.clustermgtd_timeout, suspend_config.clustermgtd_heartbeat_file_path
    ):
        log.error(
            "No valid clustermgtd heartbeat detected, clustermgtd is down! "
            "Please check clustermgtd log for error.\n"
            "Nodes will be reset to POWER_SAVE state after SuspendTimeout. "
            "The backing EC2 instances may not be correctly terminated.\n"
            "Please check and terminate any orphaned instances in EC2!"
        )
    else:
        log.info("SuspendProgram finished. Nodes will be available after SuspendTimeout")