in src/slurm_plugin/resume.py [0:0]
def _get_config(self, config_file_path):
    """Load the resume program configuration file into instance attributes.

    Required options (no fallback) raise if absent from the [slurm_resume]
    section; optional ones fall back to ``self.DEFAULTS``.
    """
    log.info("Reading %s", config_file_path)
    config = ConfigParser()
    try:
        with open(config_file_path, "r") as config_file:
            config.read_file(config_file)
    except IOError:
        log.error("Cannot read slurm cloud bursting scripts configuration file: %s", config_file_path)
        raise

    section = "slurm_resume"

    # Small accessors over the [slurm_resume] section; the typed variants
    # default to the DEFAULTS entry keyed by the same option name.
    def _opt(key, **kwargs):
        return config.get(section, key, **kwargs)

    def _opt_bool(key):
        return config.getboolean(section, key, fallback=self.DEFAULTS.get(key))

    def _opt_int(key):
        return config.getint(section, key, fallback=self.DEFAULTS.get(key))

    # Required options — missing values raise from ConfigParser.
    self.region = _opt("region")
    self.cluster_name = _opt("cluster_name")
    self.dynamodb_table = _opt("dynamodb_table")
    self.head_node_private_ip = _opt("head_node_private_ip")
    self.head_node_hostname = _opt("head_node_hostname")
    self.clustermgtd_heartbeat_file_path = _opt("clustermgtd_heartbeat_file_path")

    # Optional options with DEFAULTS fallbacks.
    self.hosted_zone = _opt("hosted_zone", fallback=self.DEFAULTS.get("hosted_zone"))
    self.dns_domain = _opt("dns_domain", fallback=self.DEFAULTS.get("dns_domain"))
    self.use_private_hostname = _opt_bool("use_private_hostname")
    self.launch_max_batch_size = _opt_int("launch_max_batch_size")
    self.assign_node_max_batch_size = _opt_int("assign_node_max_batch_size")
    self.terminate_max_batch_size = _opt_int("terminate_max_batch_size")
    self.update_node_address = _opt_bool("update_node_address")
    # TODO: Check if it's a valid scaling strategy before calling expensive downstream APIs
    self.scaling_strategy = _opt("scaling_strategy", fallback=self.DEFAULTS.get("scaling_strategy"))
    self.job_level_scaling = _opt_bool("job_level_scaling")
    self.clustermgtd_timeout = _opt_int("clustermgtd_timeout")

    fleet_config_file = _opt("fleet_config_file", fallback=self.DEFAULTS.get("fleet_config_file"))
    self.fleet_config = read_json(fleet_config_file)

    # run_instances_overrides_file and create_fleet_overrides_file contain a json with the following format:
    # {
    #     "queue_name": {
    #         "compute_resource_name": {
    #             <arbitrary-json-with-boto3-api-params-to-override>
    #         },
    #         ...
    #     },
    #     ...
    # }
    run_instances_overrides_file = _opt(
        "run_instances_overrides", fallback=self.DEFAULTS.get("run_instances_overrides")
    )
    self.run_instances_overrides = read_json(run_instances_overrides_file, default={})
    create_fleet_overrides_file = _opt(
        "create_fleet_overrides", fallback=self.DEFAULTS.get("create_fleet_overrides")
    )
    self.create_fleet_overrides = read_json(create_fleet_overrides_file, default={})

    # boto3 retry count is read from option "boto3_retry" but defaults to
    # the DEFAULTS entry named "max_retry" (names intentionally differ).
    self._boto3_retry = config.getint(section, "boto3_retry", fallback=self.DEFAULTS.get("max_retry"))
    self._boto3_config = {"retries": {"max_attempts": self._boto3_retry, "mode": "standard"}}
    proxy = _opt("proxy", fallback=self.DEFAULTS.get("proxy"))
    if proxy != "NONE":
        self._boto3_config["proxies"] = {"https": proxy}
    self.boto3_config = Config(**self._boto3_config)

    self.logging_config = _opt("logging_config", fallback=self.DEFAULTS.get("logging_config"))
    self.head_node_instance_id = _opt("instance_id", fallback="unknown")
    log.debug(repr(self))