# frozen_string_literal: true

#
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: config_head_node
#
# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Copy pcluster config generator and templates
include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'

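# Install the script used to check that login nodes have been stopped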
include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'

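# Install the munge key update script; the key is read from the Secrets Manager secret referenced by Scheduling/SlurmSettings/MungeKeySecretArn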
template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
  source 'slurm/head_node/update_munge_key.sh.erb'
  owner 'root'
  group 'root'
  mode '0700'
  variables(
    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :MungeKeySecretArn) }
  )
end

include_recipe 'aws-parallelcluster-slurm::config_munge_key'

# Export the Slurm installation directory via NFS to the cluster VPC when shared storage is on EBS
nfs_export node['cluster']['slurm']['install_dir'] do
  network get_vpc_cidr_list
  writeable true
  options ['no_root_squash']
  only_if { node['cluster']['shared_storage_type'] == 'ebs' }
end unless on_docker?

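# Render the main Slurm configuration file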
template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.conf" do
  source 'slurm/slurm.conf.erb'
  owner 'root'
  group 'root'
  mode '0644'
end

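# Render gres.conf, which declares generic resources (e.g. GPUs) to Slurm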
template "#{node['cluster']['slurm']['install_dir']}/etc/gres.conf" do
  source 'slurm/gres.conf.erb'
  owner 'root'
  group 'root'
  mode '0644'
end

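# Generate the cluster-specific Slurm configuration, the custom Slurm settings include files and the fleet configuration from the cluster config file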
unless on_docker?
  # Generate pcluster specific configs
  no_gpu = nvidia_installed? ? "" : "--no-gpu"
  execute "generate_pcluster_slurm_configs" do
    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py"\
            " --output-directory #{node['cluster']['slurm']['install_dir']}/etc/"\
            " --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
            " --input-file #{node['cluster']['cluster_config_path']}"\
            " --instance-types-data #{node['cluster']['instance_types_data_path']}"\
            " --compute-node-bootstrap-timeout #{node['cluster']['compute_node_bootstrap_timeout']} #{no_gpu}"\
            " --realmemory-to-ec2memory-ratio #{node['cluster']['realmemory_to_ec2memory_ratio']}"\
            " --slurmdbd-user #{node['cluster']['slurm']['user']}"\
            " --cluster-name #{node['cluster']['stack_name']}"
  end

  # Generate custom Slurm settings include files
  execute "generate_pcluster_custom_slurm_settings_include_files" do
    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_custom_slurm_settings_include_file_generator.py"\
            " --output-directory #{node['cluster']['slurm']['install_dir']}/etc/"\
            " --input-file #{node['cluster']['cluster_config_path']}"
  end

  # If a remote custom Slurm settings file is defined in the config, retrieve it and override the locally generated one
  ruby_block "Override Custom Slurm Settings with remote file" do
    block do
      run_context.include_recipe 'aws-parallelcluster-slurm::retrieve_remote_custom_settings_file'
    end
    not_if { node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :CustomSlurmSettingsIncludeFile).nil? }
  end

  # Generate pcluster fleet config
  execute "generate_pcluster_fleet_config" do
    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_fleet_config_generator.py"\
            " --output-file #{node['cluster']['slurm']['fleet_config_path']}"\
            " --input-file #{node['cluster']['cluster_config_path']}"
  end
end

# Render cgroup.conf; all supported OSs use /sys/fs/cgroup, which is the default cgroup mountpoint
template "#{node['cluster']['slurm']['install_dir']}/etc/cgroup.conf" do
  source 'slurm/cgroup.conf.erb'
  owner 'root'
  group 'root'
  mode '0644'
end

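# Render the Slurm environment setup scripts for sh- and csh-compatible shells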
template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.sh" do
  source 'slurm/head_node/slurm.sh.erb'
  owner 'root'
  group 'root'
  mode '0755'
end

template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.csh" do
  source 'slurm/head_node/slurm.csh.erb'
  owner 'root'
  group 'root'
  mode '0755'
end

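# Install the fleet status manager program, owned by the slurm user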
template "#{node['cluster']['scripts_dir']}/slurm/slurm_fleet_status_manager" do
  source 'slurm/fleet_status_manager_program.erb'
  owner node['cluster']['slurm']['user']
  group node['cluster']['slurm']['group']
  mode '0744'
end

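# Pre-create daemon log files with restricted permissions, owned by the cluster admin user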
file "/var/log/parallelcluster/slurm_fleet_status_manager.log" do
  owner node['cluster']['cluster_admin_user']
  group node['cluster']['cluster_admin_group']
  mode '0640'
end

file "/var/log/parallelcluster/clustermgtd.events" do
  owner node['cluster']['cluster_admin_user']
  group node['cluster']['cluster_admin_group']
  mode '0600'
end

file "/var/log/parallelcluster/compute_console_output.log" do
  owner node['cluster']['cluster_admin_user']
  group node['cluster']['cluster_admin_group']
  mode '0600'
end

template "#{node['cluster']['slurm_plugin_dir']}/parallelcluster_slurm_fleet_status_manager.conf" do
  source 'slurm/parallelcluster_slurm_fleet_status_manager.conf.erb'
  owner node['cluster']['cluster_admin_user']
  group node['cluster']['cluster_admin_group']
  mode '0644'
end

include_recipe 'aws-parallelcluster-slurm::config_slurm_resume'

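# Install the Slurm suspend program together with its log file and ParallelCluster configuration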
template "#{node['cluster']['scripts_dir']}/slurm/slurm_suspend" do
  source 'slurm/suspend_program.erb'
  owner node['cluster']['slurm']['user']
  group node['cluster']['slurm']['group']
  mode '0744'
end

file "/var/log/parallelcluster/slurm_suspend.log" do
  owner node['cluster']['cluster_admin_user']
  group node['cluster']['cluster_admin_group']
  mode '0644'
end

template "#{node['cluster']['slurm_plugin_dir']}/parallelcluster_slurm_suspend.conf" do
  source 'slurm/parallelcluster_slurm_suspend.conf.erb'
  owner node['cluster']['cluster_admin_user']
  group node['cluster']['cluster_admin_group']
  mode '0644'
end

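# Render the clustermgtd daemon configuration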
template "#{node['cluster']['slurm_plugin_dir']}/parallelcluster_clustermgtd.conf" do
  source 'slurm/parallelcluster_clustermgtd.conf.erb'
  owner 'root'
  group 'root'
  mode '0644'
end unless on_docker?

# Create shared directory used to store clustermgtd heartbeat and computemgtd config
directory "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/.slurm_plugin" do
  user node['cluster']['cluster_admin_user']
  group node['cluster']['cluster_admin_group']
  mode '0755'
  action :create
  recursive true
end

# Put computemgtd config under /opt/slurm/etc/pcluster/.slurm_plugin so all compute nodes share a config
template "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/.slurm_plugin/parallelcluster_computemgtd.conf" do
  source 'slurm/parallelcluster_computemgtd.conf.erb'
  owner 'root'
  group 'root'
  mode '0644'
end

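# Set up the slurmctld systemd service and the node health check support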
include_recipe 'aws-parallelcluster-slurm::config_slurmctld_systemd_service'
include_recipe 'aws-parallelcluster-slurm::config_health_check'

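# Slurm accounting is configured only when a database is defined under Scheduling/SlurmSettings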
ruby_block "Configure Slurm Accounting" do
  block do
    run_context.include_recipe "aws-parallelcluster-slurm::config_slurm_accounting"
  end
  not_if { node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil? }
end unless on_docker?

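# Enable and start the slurmctld service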
service "slurmctld" do
  supports restart: false
  action %i(enable start)
end unless on_docker?

# The slurmctld service does not return an error code to `systemctl start slurmctld`, so
# we must explicitly check the status of the service to capture failures
chef_sleep 3

execute "check slurmctld status" do
  command "systemctl is-active --quiet slurmctld.service"
  retries 5
  retry_delay 2
end unless redhat_on_docker?
