cloudformation/external-slurmdbd/external_slurmdbd/external_slurmdbd_stack.py (413 lines of code) (raw):
import json
import pkg_resources
from aws_cdk import Aws, CfnOutput, CfnParameter, Fn, Stack
from aws_cdk import aws_autoscaling as autoscaling
from aws_cdk import aws_ec2 as ec2
from aws_cdk import aws_iam as iam
from aws_cdk import aws_logs as logs
from aws_cdk import aws_route53 as route53
from aws_cdk import aws_s3 as s3
from constructs import Construct
def get_user_data_content(user_data_path: str):
    """
    Read the user data script and return its text.

    :param user_data_path: resource name handed to ``pkg_resources.resource_filename``
        together with this module's ``__name__``
    :return: the whole content of the resolved file, decoded as UTF-8
    """
    resolved_path = pkg_resources.resource_filename(__name__, user_data_path)
    with open(resolved_path, "r", encoding="utf-8") as script:
        return script.read()
# Size used as min, max and desired capacity of the external Slurmdbd Auto Scaling Group.
# Kept as a string because it is passed as-is to autoscaling.CfnAutoScalingGroup.
EXTERNAL_SLURMDBD_ASG_SIZE = "1"
def get_assume_role_policy_document(service: str):
    """
    Build the trust policy that lets a service principal assume a role.

    :param service: service name passed to ``iam.ServicePrincipal``
    :return: an ``iam.PolicyDocument`` holding a single ALLOW statement for ``sts:AssumeRole``
    """
    assume_role_statement = iam.PolicyStatement(
        actions=["sts:AssumeRole"],
        effect=iam.Effect.ALLOW,
        principals=[iam.ServicePrincipal(service=service)],
    )
    return iam.PolicyDocument(statements=[assume_role_statement])
class ExternalSlurmdbdStack(Stack):
"""Create the CloudFormation stack template for External Slurmdbd."""
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
    """
    Declare the stack parameters and create every resource of the External Slurmdbd stack.

    The creation order matters: later steps read attributes set by earlier ones
    (security groups, log group, S3 bucket, IAM role, instance profile).

    :param scope: parent construct, forwarded to ``Stack``
    :param construct_id: id of this stack, forwarded to ``Stack``
    :param kwargs: extra keyword arguments forwarded untouched to ``Stack``
    """
    super().__init__(scope, construct_id, **kwargs)

    # define networking stuff
    self.vpc_id = CfnParameter(
        self, "VPCId", type="String", description="The VPC to be used for the Slurmdbd stack."
    )
    self.subnet_id = CfnParameter(
        self, "SubnetId", type="AWS::EC2::Subnet::Id", description="The Subnet to be used for the Slurmdbd stack."
    )

    # Define additional CloudFormation parameters for dna.json to pass to cookbook
    self.dbms_uri = CfnParameter(self, "DBMSUri", type="String", description="DBMS URI for Slurmdbd.")
    self.dbms_username = CfnParameter(
        self, "DBMSUsername", type="String", description="DBMS Username for Slurmdbd."
    )
    self.dbms_password_secret_arn = CfnParameter(
        self, "DBMSPasswordSecretArn", type="String", description="Secret ARN for DBMS password."
    )
    self.dbms_database_name = CfnParameter(
        self, "DBMSDatabaseName", type="String", description="DBMS Database Name for Slurmdbd."
    )
    self.munge_key_secret_arn = CfnParameter(
        self, "MungeKeySecretArn", type="String", description="Secret ARN for Munge key."
    )
    self.custom_cookbook_url_param = CfnParameter(
        self, "CustomCookbookUrl", type="String", description="URL of the custom Chef Cookbook.", default=""
    )
    self.enable_slurm_dbd_system_service = CfnParameter(
        self,
        "EnableSlurmdbdSystemService",
        type="String",
        allowed_values=["true", "false"],
        # Adjacent literals are concatenated verbatim, so every fragment but the last
        # must end with a space to keep the sentences separated in the rendered text.
        description="[Warning] It is not recommended to enable this if the database was created by a different "
        "version of SlurmDBD. If the database contains a large number of entries, the SlurmDBD daemon may require "
        "tens of minutes to update the database and be unresponsive during this time interval. "
        "Before upgrading SlurmDBD it is strongly recommended to make a backup of the database. "
        "See Slurm documentation for details: https://slurm.schedmd.com/quickstart_admin.html#upgrade",
        default="false",
    )

    # create management security group with SSH access from SSH client SG
    self._ssh_server_sg, self._ssh_client_sg = self._add_management_security_groups()

    # create a pair of security groups for the slurm accounting traffic across
    # between cluster head node and external slurmdbd instance via port 6819
    self._slurmdbd_server_sg, self._slurmdbd_client_sg = self._add_slurmdbd_accounting_security_groups()

    # Create a CloudWatch log group (its ARN is referenced by the IAM policy below)
    self._log_group = self._add_cloudwatch_log_group()

    # add S3 bucket to store the slurmdbd configuration (its ARN is referenced by the IAM policy below)
    self.s3_bucket = self._add_s3_bucket()

    # define IAM role and necessary IAM policies
    self._role = self._add_iam_role()
    self._instance_profile = self._add_instance_profile(self._role.ref, "ExternalSlurmdbdInstanceProfile")

    # create Launch Template
    # This defines the dna.json and so it depends on many of the previous steps.
    # It should be launched just before the ASG that should be the last before the outputs.
    self._launch_template = self._add_external_slurmdbd_launch_template()

    # define EC2 Auto Scaling Group (ASG)
    self._external_slurmdbd_asg = self._add_external_slurmdbd_auto_scaling_group()

    # define Primary Slurmdbd Instance (not via ASG)
    # self._primary_slurmdbd_instance = self._add_slurmdbd_primary_instance()

    # define external slurmdbd hosted zone
    # self._hosted_zone = self._add_hosted_zone()

    # Add DNS record to hosted zone
    # self._add_instance_to_dns(
    #     ip_addr=self._primary_slurmdbd_instance.attr_private_ip,
    #     name="slurmdbd",
    # )

    # The outputs read the parameters and security groups created above, so they come last.
    self._add_outputs()
def _add_cfn_init_config(self):
    """
    Build the ``AWS::CloudFormation::Init`` configuration for the slurmdbd instance.

    The configuration has a single ``setup`` config that writes ``/etc/chef/dna.json``
    and then runs ``cinc-client`` against it.

    :return: a dict with the ``configSets`` and ``setup`` sections
    """
    cluster_attributes = {
        "region": self.region,
        "log_group_name": self._log_group.log_group_name,
        "stack_name": Aws.STACK_NAME,
        "node_type": "ExternalSlurmDbd",
        "cw_logging_enabled": "true",
        "slurmdbd_service_enabled": self.enable_slurm_dbd_system_service.value_as_string,
    }
    # NOTE(review): slurmdbd_port is a CDK number token serialised with json.dumps below;
    # confirm it resolves to the parameter value in the synthesized template
    # (Stack.to_json_string is the token-aware alternative).
    dna_attributes = {
        "slurmdbd_ip": self.slurmdbd_private_ip.value_as_string,
        "slurmdbd_port": self.slurmdbd_port.value_as_number,
        "dbms_uri": self.dbms_uri.value_as_string,
        "dbms_username": self.dbms_username.value_as_string,
        "dbms_database_name": self.dbms_database_name.value_as_string,
        "dbms_password_secret_arn": self.dbms_password_secret_arn.value_as_string,
        "munge_key_secret_arn": self.munge_key_secret_arn.value_as_string,
        "slurmdbd_conf_bucket": self.s3_bucket.ref,
        "cluster": cluster_attributes,
    }

    # Command line arguments, joined with single spaces into one shell command.
    chef_command = " ".join(
        [
            "cinc-client",
            "--local-mode",
            "--config /etc/chef/client.rb",
            "--log_level info",
            "--logfile /var/log/chef-client.log",
            "--force-formatter",
            "--no-color",
            "--chef-zero-port 8889",
            "--json-attributes /etc/chef/dna.json",
            "--override-runlist aws-parallelcluster-entrypoints::external_slurmdbd_config",
        ]
    )

    dna_json_file = {
        "content": json.dumps(dna_attributes),
        "mode": "000644",
        "owner": "root",
        "group": "root",
    }
    setup_config = {
        "files": {"/etc/chef/dna.json": dna_json_file},
        "commands": {"chef": {"command": chef_command, "cwd": "/etc/chef"}},
    }
    return {
        "configSets": {"default": ["setup"]},
        "setup": setup_config,
    }
def _add_management_security_groups(self):
    """
    Create the server/client security group pair used for SSH access.

    An ingress rule on the server group allows TCP port 22 from the client group.

    :return: tuple ``(server security group, client security group)``
    """
    vpc = self.vpc_id.value_as_string

    ssh_server_sg = ec2.CfnSecurityGroup(
        self,
        "SSHServerSecurityGroup",
        group_description="Allow SSH access to slurmdbd instance (server)",
        vpc_id=vpc,
    )
    ssh_client_sg = ec2.CfnSecurityGroup(
        self,
        "SSHClientSecurityGroup",
        group_description="Allow SSH access to slurmdbd instance (client)",
        vpc_id=vpc,
    )

    ssh_port = 22
    ec2.CfnSecurityGroupIngress(
        self,
        "Allow SSH access from client SG",
        ip_protocol="tcp",
        from_port=ssh_port,
        to_port=ssh_port,
        source_security_group_id=ssh_client_sg.ref,
        group_id=ssh_server_sg.ref,
    )

    return ssh_server_sg, ssh_client_sg
# FIXME: make the ingress rules more configurable
def _add_slurmdbd_accounting_security_groups(self):
    """
    Create the server/client security group pair for the Slurm accounting traffic.

    It also declares the ``SlurmdbdPort`` stack parameter (stored in ``self.slurmdbd_port``) and adds:
    - an ingress rule on the server group for that port, from the client group;
    - an ingress rule on the client group for TCP ports 6820-6829, from the server group.

    :return: tuple ``(server security group, client security group)``
    """
    vpc = self.vpc_id.value_as_string

    server_sg = ec2.CfnSecurityGroup(
        self,
        "SlurmdbdServerSecurityGroup",
        group_description="Allow Slurm accounting traffic to the slurmdbd instance (server)",
        vpc_id=vpc,
    )
    client_sg = ec2.CfnSecurityGroup(
        self,
        "SlurmdbdClientSecurityGroup",
        group_description="Allow Slurm accounting traffic from the cluster head node (client)",
        vpc_id=vpc,
    )

    self.slurmdbd_port = CfnParameter(
        self,
        "SlurmdbdPort",
        type="Number",
        description="The port the slurmdbd service listens to.",
        default=6819,
    )

    # client -> server, on the configurable slurmdbd port
    ec2.CfnSecurityGroupIngress(
        self,
        "Allow Slurm accounting traffic from the cluster head node",
        ip_protocol="tcp",
        from_port=self.slurmdbd_port.value_as_number,
        to_port=self.slurmdbd_port.value_as_number,
        source_security_group_id=client_sg.ref,
        group_id=server_sg.ref,
    )

    # server -> client, on a fixed port range
    first_return_port, last_return_port = 6820, 6829
    ec2.CfnSecurityGroupIngress(
        self,
        "Allow traffic coming from slurmdbd instance",
        ip_protocol="tcp",
        from_port=first_return_port,
        to_port=last_return_port,
        source_security_group_id=server_sg.ref,
        group_id=client_sg.ref,
    )

    return server_sg, client_sg
def _add_instance_profile(self, role_ref: str, name: str):
    """
    Create an IAM instance profile wrapping a single role.

    :param role_ref: reference to the role to put in the profile
    :param name: construct id of the instance profile
    :return: the ``ref`` of the created ``iam.CfnInstanceProfile`` (not the construct itself)
    """
    instance_profile = iam.CfnInstanceProfile(self, name, roles=[role_ref])
    return instance_profile.ref
def _add_external_slurmdbd_launch_template(self):
    """
    Declare the instance-related stack parameters and create the launch template.

    The ``PrivateIp`` and ``PrivatePrefix`` parameters are stored on ``self`` because they are
    read elsewhere in the class. The ``AWS::CloudFormation::Init`` configuration is stored in
    ``self._cfn_init_config`` and attached to the launch template as metadata.

    :return: the created ``ec2.CfnLaunchTemplate``
    """
    # Define a CfnParameter for the AMI ID
    # This AMI should be Parallel Cluster AMI, which has installed Slurm and related software
    ami_id = CfnParameter(
        self, "AmiId", type="AWS::EC2::Image::Id", description="The AMI id for the EC2 instance."
    )
    instance_type = CfnParameter(
        self,
        "InstanceType",
        type="String",
        description="The instance type for the EC2 instance",
    )
    key_name = CfnParameter(
        self,
        "KeyName",
        type="AWS::EC2::KeyPair::KeyName",
        description="The SSH key name to access the instance (for management purposes only)",
    )
    self.slurmdbd_private_ip = CfnParameter(
        self,
        "PrivateIp",
        type="String",
        description="Static private IP address + prefix to assign to the slurmdbd instance",
    )
    self.slurmdbd_private_prefix = CfnParameter(
        self,
        "PrivatePrefix",
        type="String",
        description="Subnet prefix to assign with the private IP to the slurmdbd instance",
    )
    dbms_client_sg = CfnParameter(
        self, "DBMSClientSG", type="AWS::EC2::SecurityGroup::Id", description="DBMS Client Security Group Id"
    )

    # Variables substituted by Fn::Sub into the user data script.
    user_data_variables = {
        "CustomCookbookUrl": self.custom_cookbook_url_param.value_as_string,
        "StackName": Aws.STACK_NAME,
        "Region": self.region,
        "PrivateIp": self.slurmdbd_private_ip.value_as_string,
        "SubnetPrefix": self.slurmdbd_private_prefix.value_as_string,
    }
    user_data_script = get_user_data_content("../resources/user_data.sh")
    encoded_user_data = Fn.base64(Fn.sub(user_data_script, user_data_variables))

    primary_interface = ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
        device_index=0,
        groups=[
            self._ssh_server_sg.ref,
            self._slurmdbd_server_sg.ref,
            dbms_client_sg.value_as_string,
        ],
        subnet_id=self.subnet_id.value_as_string,
    )

    template_data = ec2.CfnLaunchTemplate.LaunchTemplateDataProperty(
        key_name=key_name.value_as_string,
        image_id=ami_id.value_as_string,
        instance_type=instance_type.value_as_string,
        user_data=encoded_user_data,
        iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty(name=self._instance_profile),
        network_interfaces=[primary_interface],
        metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty(
            http_tokens="required",
        ),
    )

    template = ec2.CfnLaunchTemplate(self, "LaunchTemplate", launch_template_data=template_data)

    # The cfn-init config reads self.slurmdbd_private_ip, so it is built after the parameters above.
    self._cfn_init_config = self._add_cfn_init_config()
    template.add_metadata("AWS::CloudFormation::Init", self._cfn_init_config)

    return template
def _add_slurmdbd_primary_instance(self):
    """
    Create a standalone EC2 instance from the latest version of the stack launch template.

    :return: the created ``ec2.CfnInstance``
    """
    template_spec = ec2.CfnInstance.LaunchTemplateSpecificationProperty(
        version=self._launch_template.attr_latest_version_number,
        launch_template_id=self._launch_template.ref,
    )
    return ec2.CfnInstance(
        self,
        id="ExternalSlurmdbdPrimaryInstance",
        launch_template=template_spec,
    )
def _add_external_slurmdbd_auto_scaling_group(self):
    """
    Create the Auto Scaling Group that runs the slurmdbd instance.

    Min, max and desired capacity are all set to ``EXTERNAL_SLURMDBD_ASG_SIZE``; the group uses
    the latest version of the stack launch template and the subnet given as stack parameter.

    :return: the created ``autoscaling.CfnAutoScalingGroup``
    """
    template_spec = autoscaling.CfnAutoScalingGroup.LaunchTemplateSpecificationProperty(
        version=self._launch_template.attr_latest_version_number,
        launch_template_id=self._launch_template.ref,
    )
    fixed_size = EXTERNAL_SLURMDBD_ASG_SIZE
    return autoscaling.CfnAutoScalingGroup(
        self,
        "External-Slurmdbd-ASG",
        max_size=fixed_size,
        min_size=fixed_size,
        desired_capacity=fixed_size,
        launch_template=template_spec,
        vpc_zone_identifier=[self.subnet_id.value_as_string],
    )
def _add_iam_role(self):
    """
    Create the IAM role of the slurmdbd instance and attach its policy.

    The ``ExternalSlurmdbdPolicies`` policy grants:
    - read access to the DBMS password and Munge key secrets;
    - creation of log streams and log events in the stack log group;
    - private IP assignment, conditioned on the stack subnet;
    - listing of the stack S3 bucket and read/write/delete of its objects.

    :return: the created ``iam.CfnRole``
    """
    instance_role = iam.CfnRole(
        self,
        "SlurmdbdInstanceRole",
        assume_role_policy_document=get_assume_role_policy_document(f"ec2.{self.url_suffix}"),
        description="Role for Slurmdbd EC2 instance to access necessary AWS resources",
    )

    secrets_statement = iam.PolicyStatement(
        actions=["secretsmanager:GetSecretValue"],
        resources=[
            self.dbms_password_secret_arn.value_as_string,
            self.munge_key_secret_arn.value_as_string,
        ],
        effect=iam.Effect.ALLOW,
        sid="SecretsManagerPolicy",
    )
    logs_statement = iam.PolicyStatement(
        actions=["logs:CreateLogStream", "logs:PutLogEvents"],
        resources=[self._log_group.log_group_arn],
        effect=iam.Effect.ALLOW,
        sid="CloudWatchLogsPolicy",
    )
    ip_assignment_statement = iam.PolicyStatement(
        actions=["ec2:AssignPrivateIpAddresses"],
        resources=["*"],
        effect=iam.Effect.ALLOW,
        # The action is allowed on any resource, but only within the stack subnet.
        conditions={"StringLike": {"ec2:Subnet": f"*{self.subnet_id.value_as_string}"}},
        sid="IPAssignmentPolicy",
    )
    bucket_statement = iam.PolicyStatement(
        actions=[
            "s3:ListBucket",
        ],
        resources=[self.s3_bucket.attr_arn],
        effect=iam.Effect.ALLOW,
        sid="S3BucketPolicy",
    )
    bucket_objects_statement = iam.PolicyStatement(
        actions=[
            "s3:GetObject",
            "s3:PutObject",
            "s3:AbortMultipartUpload",
            "s3:DeleteObject",
        ],
        resources=[self.s3_bucket.attr_arn + "/*"],
        effect=iam.Effect.ALLOW,
        sid="S3BucketObjectsPolicy",
    )
    # Not enabled: statement for the hosted zone management.
    # iam.PolicyStatement(
    #     actions=[
    #         "route53:CreateHostedZone",
    #         "route53:DeleteHostedZone",
    #     ],
    #     resources=[slurmdbd_hosted_zone.value_as_string],
    #     effect=iam.Effect.ALLOW,
    #     sid="IPAssignmentPolicy",
    # ),

    iam.CfnPolicy(
        Stack.of(self),
        "ExternalSlurmdbdPolicies",
        policy_name="ExternalSlurmdbdPolicies",
        roles=[instance_role.ref],
        policy_document=iam.PolicyDocument(
            statements=[
                secrets_statement,
                logs_statement,
                ip_assignment_statement,
                bucket_statement,
                bucket_objects_statement,
            ]
        ),
    )

    return instance_role
def _add_cloudwatch_log_group(self):
    """
    Create the CloudWatch log group of the stack, with one week of retention.

    The log group name is ``/aws/parallelcluster/external-slurmdbd/<stack name>`` joined by ``-``
    with a fragment extracted from the stack id.

    :return: the created ``logs.LogGroup``
    """
    # Third "/"-separated element of the stack id, then its fifth "-"-separated element.
    # NOTE(review): presumably the last segment of the stack UUID - confirm against the stack id format.
    stack_id_tail = Fn.select(2, Fn.split("/", self.stack_id))
    name_suffix = Fn.select(4, Fn.split("-", stack_id_tail))

    name_prefix = f"/aws/parallelcluster/external-slurmdbd/{Aws.STACK_NAME}"
    group_name = Fn.join("-", [name_prefix, name_suffix])

    return logs.LogGroup(
        self,
        "SlurmdbdLogGroup",
        log_group_name=group_name,
        retention=logs.RetentionDays.ONE_WEEK,
    )
def _add_s3_bucket(self):
    """
    Create the S3 bucket of the stack, with versioning enabled and all public access blocked.

    :return: the created ``s3.CfnBucket``
    """
    block_all_public_access = s3.CfnBucket.PublicAccessBlockConfigurationProperty(
        block_public_acls=True,
        block_public_policy=True,
        ignore_public_acls=True,
        restrict_public_buckets=True,
    )
    versioning_enabled = s3.CfnBucket.VersioningConfigurationProperty(status="Enabled")

    return s3.CfnBucket(
        self,
        id="ExternalSlurmdbdS3Bucket",
        public_access_block_configuration=block_all_public_access,
        versioning_configuration=versioning_enabled,
    )
def _add_hosted_zone(self):
    """
    Create a Route53 hosted zone associated with the stack VPC in the stack region.

    :return: the created ``route53.CfnHostedZone``
    """
    stack_vpc = route53.CfnHostedZone.VPCProperty(
        vpc_id=self.vpc_id.value_as_string,
        vpc_region=self.region,
    )
    return route53.CfnHostedZone(
        self,
        id="ExternalSlurmdbdHostedZone",
        name="externalslurmdbdhostedzone",
        vpcs=[stack_vpc],
    )
def _add_instance_to_dns(self, ip_addr, name):
    """
    Add an ``A`` record for an IP address to the stack hosted zone.

    NOTE(review): this reads ``self._hosted_zone``, whose assignment is only present in
    commented-out code in ``__init__`` - confirm it is set before calling this method.

    :param ip_addr: IP address the record points to
    :param name: record name, prepended to the hosted zone name with a ``.`` separator
    """
    hosted_zone = self._hosted_zone
    record_name = ".".join([name, hosted_zone.name])

    route53.CfnRecordSet(
        self,
        "ExternalSlurmdbdRecordSet",
        name=record_name,
        type="A",
        hosted_zone_id=hosted_zone.attr_id,
        region=self.region,
        resource_records=[ip_addr],
        set_identifier="externalslurmdbdsetidentifier",
        ttl="300",
    )
def _add_outputs(self):
    """
    Declare the stack outputs.

    The outputs expose the slurmdbd private IP and port, the client security groups for
    accounting and SSH traffic, and the S3 bucket for the slurmdbd configuration.
    """
    # (construct id, CfnOutput keyword arguments), in declaration order.
    stack_outputs = [
        (
            "SlurmdbdPrivateIp",
            {
                "description": "Secondary Private IP Address of the slurmdbd instance",
                "value": self.slurmdbd_private_ip.value_as_string,
            },
        ),
        (
            "SlurmdbdPortOutput",
            {
                "description": "Port used to connect to slurmdbd service",
                # Explicit output key, different from the construct id.
                "key": "SlurmdbdPort",
                "value": self.slurmdbd_port.value_as_string,
            },
        ),
        (
            "AccountingClientSecurityGroup",
            {
                "description": "Security Group ID that allows traffic from the slurmctld to slurmdbd",
                "value": self._slurmdbd_client_sg.ref,
            },
        ),
        (
            "SshClientSecurityGroup",
            {
                "description": "Security Group ID that allows SSH traffic from the HeadNode to slurmdbd instance",
                "value": self._ssh_client_sg.ref,
            },
        ),
        (
            "SlurmdbdConfigS3BucketName",
            {
                "description": "S3 Bucket where a copy of the slurmdbd configuration files can be stored and "
                "re-used when re-provisioning the slurmdbd instance",
                "value": self.s3_bucket.ref,
            },
        ),
    ]

    for output_id, output_props in stack_outputs:
        CfnOutput(self, output_id, **output_props)