in community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py [0:0]
def _apply_terraform(self):
    """Run ``terraform apply`` and record the resulting nodes on the cluster.

    Steps, in order:

    1. Create the C&C subscription for this cluster.
    2. Invoke ``terraform apply`` in the cluster's Terraform directory with
       ``GOOGLE_APPLICATION_CREDENTIALS`` set in the environment.
    3. Load ``terraform.tfstate`` from that directory.
    4. Apply permissions to the service accounts found in the state. This
       is best effort: any failure is logged as a warning and cluster
       creation continues.
    5. Set ``internal_name``, ``cloud_state`` and ``status`` on the cluster
       and save it.
    6. Save the first controller node found in the state and attach it to
       the cluster as ``controller_node``.
    7. Save every login node found in the state with ``cluster_login``
       pointing at this cluster.
    8. Configure the Spack install location and save the cluster again.

    Raises:
        subprocess.CalledProcessError: Re-raised unchanged after the failure
            (and any captured stdout/stderr) has been logged.
    """

    def _stream_text(stream):
        # Captured process output is ``bytes`` or ``str`` depending on how
        # the subprocess was run. Decode leniently so that logging the
        # output can never raise and mask the original CalledProcessError.
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream

    terraform_dir = self.get_terraform_dir()

    # Create C&C Subscription
    c2.create_cluster_subscription(self.cluster.id)

    extra_env = {
        "GOOGLE_APPLICATION_CREDENTIALS": self._get_credentials_file()
    }

    try:
        logger.info("Invoking Terraform Apply")
        utils.run_terraform(terraform_dir, "apply", extra_env=extra_env)

        # Look for Management and Login Nodes in TF state file
        tf_state_file = terraform_dir / "terraform.tfstate"
        with tf_state_file.open("r") as statefp:
            state = json.load(statefp)

        # Apply Perms to the service accounts
        try:
            service_accounts = self._get_service_accounts(state)
            self._apply_service_account_permissions(service_accounts)
        except Exception as e:  # pylint: disable=broad-except
            # Be nicer to the user and continue creating cluster
            logger.warning(
                "An error occurred while applying permissions to service accounts: %s",  # pylint: disable=line-too-long
                e,
            )

        # Cluster is now being initialized
        self.cluster.internal_name = self.cluster.name
        self.cluster.cloud_state = "m"

        # Cluster initialization is now running.
        self.cluster.status = "i"
        self.cluster.save()

        mgmt_nodes = self._create_model_instances_from_tf_state(
            state,
            {
                "module": "module.slurm_controller.module.slurm_controller_instance",  # pylint: disable=line-too-long
                "name": "slurm_instance",
            },
        )
        if len(mgmt_nodes) != 1:
            logger.warning(
                "Found %d controller nodes, there should be only 1",
                len(mgmt_nodes),
            )
        if mgmt_nodes:
            # Only the first controller found is recorded on the cluster.
            node = mgmt_nodes[0]
            node.save()
            self.cluster.controller_node = node
            logger.info(
                "Created cluster controller node with IP address %s",
                node.public_ip or node.internal_ip,
            )

        login_nodes = self._create_model_instances_from_tf_state(
            state,
            {
                "module": 'module.slurm_controller.module.slurm_login_instance["slurm-login"]',  # pylint: disable=line-too-long
                "name": "slurm_instance",
            },
        )
        if len(login_nodes) != self.cluster.num_login_nodes:
            logger.warning(
                "Found %d login nodes, expected %d from config",
                len(login_nodes),
                self.cluster.num_login_nodes,
            )
        for lnode in login_nodes:
            lnode.cluster_login = self.cluster
            lnode.save()
            logger.info(
                "Created login node with IP address %s",
                lnode.public_ip or lnode.internal_ip,
            )

        # Set up Spack Install location
        self._configure_spack_install_loc()

        # Persists controller_node (set above) along with any other changes.
        self.cluster.save()

    except subprocess.CalledProcessError as err:
        # We can error during provisioning, in which case Terraform
        # doesn't tear things down.
        logger.error("Terraform apply failed", exc_info=err)
        if err.stdout:
            logger.info("TF stdout:\n%s\n", _stream_text(err.stdout))
        if err.stderr:
            logger.info("TF stderr:\n%s\n", _stream_text(err.stderr))
        raise