# frozen_string_literal: true
#
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: update_head_node
#
# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
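
# Stop clustermgtd so that it does not act on compute nodes while the Slurm
# configuration is being regenerated; skipped when neither the queues nor the
# bulk custom Slurm settings changed.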
execute 'stop clustermgtd' do
  command "#{cookbook_virtualenv_path}/bin/supervisorctl stop clustermgtd"
  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
end
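
# Mount or unmount shared storage to match the updated cluster configuration;
# the actual work happens in aws-parallelcluster-environment::update_shared_storages.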
ruby_block "update_shared_storages" do
block do
run_context.include_recipe 'aws-parallelcluster-environment::update_shared_storages'
end
only_if { are_mount_or_unmount_required? }
end
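
# Transition the compute nodes of the queues affected by the update, honoring
# the configured QueueUpdateStrategy:
# - DRAIN: static nodes are set to DRAIN (so clustermgtd keeps managing their
#   lifecycle) and dynamic nodes to POWER_DOWN_ASAP;
# - TERMINATE: all nodes are set to POWER_DOWN_FORCE.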
ruby_block "replace slurm queue nodes" do
SLURM_POWER_SAVING_MAPPING = {
DRAIN: "POWER_DOWN_ASAP",
TERMINATE: "POWER_DOWN_FORCE",
}.freeze
def get_slurm_nodelist(queue)
#
# Example content for a slurm_parallelcluster_#{queue}_partition.conf
#
# NodeName=compute1-st-compute1-i1-[1-1] CPUs=16 State=CLOUD Feature=static,c5.4xlarge,compute1-i1
# NodeName=compute1-dy-compute1-i1-[1-9] CPUs=16 State=CLOUD Feature=dynamic,c5.4xlarge,compute1-i1
#
# NodeSet=compute1_nodes Nodes=compute1-st-compute1-i1-[1-1],compute1-dy-compute1-i1-[1-9]
# PartitionName=compute1 Nodes=compute1_nodes MaxTime=INFINITE State=UP
#
command = "sed -n 's/.*NodeSet.*Nodes=\\(.*\\)/\\1/p' #{node['cluster']['slurm']['install_dir']}/etc/pcluster/slurm_parallelcluster_#{queue}_partition.conf"
Chef::Log.debug("Retrieving nodelist with command (#{command})")
node_list = execute_command(command)
Chef::Log.info("Node list for queue (#{queue}) is (#{node_list})")
node_list
end
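
  # Illustrative example of the scontrol invocation built below, using the node
  # names from the sample partition file above (actual names depend on the cluster):
  #   sudo -i scontrol update state=DRAIN nodename=compute1-st-compute1-i1-1 reason='updating node state during cluster update'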
  def update_slurm_nodes(state, nodelist)
    command = "sudo -i scontrol update state=#{state} nodename=#{nodelist} reason='updating node state during cluster update'"
    Chef::Log.info("Executing node state update with command (#{command})")
    execute_command(command)
  end
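
  # is_static_node? comes from the cookbook helpers; as the sample partition
  # file above shows, static node names carry an "-st-" token while dynamic
  # ones carry "-dy-".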
  def split_static_and_dynamic_nodes(nodelist)
    static_nodes = []
    dynamic_nodes = []
    nodes = nodelist.split(',')
    nodes.each do |node|
      if is_static_node?(node)
        static_nodes.push(node)
      else
        dynamic_nodes.push(node)
      end
    end
    [static_nodes, dynamic_nodes]
  end
  def update_nodes(strategy, nodelist)
    if strategy == "DRAIN"
      static_nodes, dynamic_nodes = split_static_and_dynamic_nodes(nodelist)
      # Set static nodes to DRAIN to keep clustermgtd in charge of the static nodes' lifecycle
      update_slurm_nodes(strategy, static_nodes.join(",")) if static_nodes.any?
      update_slurm_nodes(SLURM_POWER_SAVING_MAPPING[strategy.to_sym], dynamic_nodes.join(",")) if dynamic_nodes.any?
    elsif strategy == "TERMINATE"
      update_slurm_nodes(SLURM_POWER_SAVING_MAPPING[strategy.to_sym], nodelist)
    end
  end
  def get_all_queues(config)
    # Get all queue names from the cluster config
    slurm_queues = config.dig("Scheduling", "SlurmQueues")
    queues_name = Set.new
    slurm_queues.each do |queue|
      queues_name.add(queue["Name"])
    end
    queues_name
  end
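
  # Illustrative change-set.json shape, inferred from the keys read below
  # (the real file may carry additional fields):
  #   {
  #     "changeSet": [
  #       {
  #         "parameter": "Scheduling.SlurmQueues[compute1].ComputeResources[compute1-i1].InstanceType",
  #         "updatePolicy": "QUEUE_UPDATE_STRATEGY"
  #       }
  #     ]
  #   }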
  def get_queues_with_changes(config)
    # Load the change set and find the queues with changes that require their nodes to be updated.
    # If the change set contains only changes that support live updates, an empty set of queues is returned.
    # If the change set is empty, an empty set of queues is returned.
    queues = Set.new
    change_set = JSON.load_file("#{node['cluster']['shared_dir']}/change-set.json")
    changes = change_set["changeSet"]
    # Changes to the shared storage are applied to all queues,
    # but only changes that do not support live updates trigger a queue update.
    if are_mount_or_unmount_required? && !storage_change_supports_live_update?
      queues = get_all_queues(config)
      Chef::Log.info("All queues will be updated in order to update shared storages")
    else
      changes.each do |change|
        next unless change["updatePolicy"] == "QUEUE_UPDATE_STRATEGY"
        queue = change["parameter"][/Scheduling\.SlurmQueues\[([^\]]*)\]/, 1]
        Chef::Log.info("Adding queue (#{queue}) to the list of queues to be updated")
        queues.add(queue)
      end
    end
    queues
  end
  def update_nodes_in_queue(strategy, queues)
    # Update the state of the nodes in the queues with changes
    if queues.empty?
      Chef::Log.info("No queues with nodes to replace found")
    else
      queues.each do |queue|
        node_list = get_slurm_nodelist(queue)
        Chef::Log.info("Updating node state for queue (#{queue})")
        update_nodes(strategy, node_list)
      end
    end
  end
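
  # QueueUpdateStrategy drives how running nodes are handled: COMPUTE_FLEET_STOP
  # requires no action here, DRAIN and TERMINATE update the node states of the
  # affected queues, and any other value is a no-op.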
  block do
    # Load queue update strategy from cluster config
    config = YAML.safe_load(File.read(node['cluster']['cluster_config_path']))
    queue_update_strategy = config.dig("Scheduling", "SlurmSettings", "QueueUpdateStrategy")
    Chef::Log.debug("Found queue update strategy value (#{queue_update_strategy})")
    if !queue_update_strategy.nil? && !queue_update_strategy.empty?
      # Act based on queue update strategy value
      case queue_update_strategy
      when "COMPUTE_FLEET_STOP"
        Chef::Log.info("Queue update strategy is #{queue_update_strategy}, doing nothing")
      when "DRAIN", "TERMINATE"
        Chef::Log.info("Queue update strategy is #{queue_update_strategy}")
        queues = get_queues_with_changes(config)
        update_nodes_in_queue(queue_update_strategy, queues)
      else
        Chef::Log.info("Queue update strategy not managed, no-op")
      end
    end
  end
end
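
# Regenerate the Slurm configuration files from the updated cluster config;
# skipped when the queues did not change.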
execute "generate_pcluster_slurm_configs" do
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
" --output-directory #{node['cluster']['slurm']['install_dir']}/etc/" \
" --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/" \
" --input-file #{node['cluster']['cluster_config_path']}" \
" --instance-types-data #{node['cluster']['instance_types_data_path']}" \
" --compute-node-bootstrap-timeout #{node['cluster']['compute_node_bootstrap_timeout']}" \
" #{nvidia_installed? ? '' : '--no-gpu'}"\
" --realmemory-to-ec2memory-ratio #{node['cluster']['realmemory_to_ec2memory_ratio']}"\
" --slurmdbd-user #{node['cluster']['slurm']['user']}"\
" --cluster-name #{node['cluster']['stack_name']}"
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? }
end
# Generate custom Slurm settings include files
execute "generate_pcluster_custom_slurm_settings_include_files" do
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_custom_slurm_settings_include_file_generator.py" \
" --output-directory #{node['cluster']['slurm']['install_dir']}/etc/"\
" --input-file #{node['cluster']['cluster_config_path']}"
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_bulk_custom_slurm_settings_updated? }
end
# If defined in the config, retrieves a remote Custom Slurm Settings file and overrides the existing one
ruby_block "Override Custom Slurm Settings with remote file" do
block do
run_context.include_recipe 'aws-parallelcluster-slurm::retrieve_remote_custom_settings_file'
end
not_if { node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :CustomSlurmSettingsIncludeFile).nil? }
end
execute "generate_pcluster_fleet_config" do
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_fleet_config_generator.py"\
" --output-file #{node['cluster']['slurm']['fleet_config_path']}"\
" --input-file #{node['cluster']['cluster_config_path']}"
not_if { ::File.exist?(node['cluster']['slurm']['fleet_config_path']) && !are_queues_updated? }
end
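
# Keep clustermgtd's node_replacement_timeout in parallelcluster_clustermgtd.conf
# aligned with the configured compute node bootstrap timeout.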
replace_or_add "update node replacement timeout" do
  path "#{node['cluster']['etc_dir']}/slurm_plugin/parallelcluster_clustermgtd.conf"
  pattern "node_replacement_timeout*"
  line "node_replacement_timeout = #{node['cluster']['compute_node_bootstrap_timeout']}"
  replace_only true
end
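
# Reconfigure Slurm accounting when the Database section changed: clear it when
# the section was removed, (re)configure it otherwise.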
ruby_block "Update Slurm Accounting" do
block do
if node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil?
run_context.include_recipe "aws-parallelcluster-slurm::clear_slurm_accounting"
else
run_context.include_recipe "aws-parallelcluster-slurm::config_slurm_accounting"
end
end
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
end unless on_docker?
# Cover the following two scenarios:
# - a cluster without login nodes is updated to have login nodes;
# - a cluster with login nodes is updated to use another pool name.
if ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_login_nodes_pool_name_updated?
  include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'
end

file "#{node['cluster']['scripts_dir']}/slurm/check_login_nodes_stopped.sh" do
action :delete
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_login_nodes_removed? }
end
# Update munge key rotation script to update secret arn
template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
source 'slurm/head_node/update_munge_key.sh.erb'
owner 'root'
group 'root'
mode '0700'
variables(
munge_key_secret_arn: lazy { node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :MungeKeySecretArn) }
)
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
end
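
# update_munge_key_head_node is applied via update_munge_head_node, a helper
# defined elsewhere in this cookbook that carries out the munge key update on
# the head node.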
update_munge_head_node

# The execute resource "generate_pcluster_slurm_configs" above may have overridden the slurmdbd password in
# slurm_parallelcluster_slurmdbd.conf with a default value, so if it has run and Slurm accounting
# is enabled we must pull the database password from Secrets Manager once again.
execute "update Slurm database password" do
  user 'root'
  group 'root'
  command "#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh"
  # This only_if guard must mirror every condition that triggers "generate_pcluster_slurm_configs",
  # restricted to the case where Slurm accounting is in use.
  only_if { !(::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated?) && !node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :Database).nil? }
end

service 'slurmctld' do
  action :restart
  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
end
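
# Give slurmctld a few seconds to come up before probing its status below.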
chef_sleep '5'
# The slurmctld service does not return an error code to `systemctl start slurmctld`, so
# we must explicitly check the status of the service to capture failures
execute "check slurmctld status" do
command "systemctl is-active --quiet slurmctld.service"
retries 5
retry_delay 2
end
execute 'reload config for running nodes' do
  command "#{node['cluster']['slurm']['install_dir']}/bin/scontrol reconfigure"
  retries 3
  retry_delay 5
  timeout 300
  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
end
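
# Give running nodes time to pick up the reconfiguration before waiting for the
# cluster to report ready and handing control back to clustermgtd.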
chef_sleep '15'
wait_cluster_ready

execute 'start clustermgtd' do
  command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
  not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
end

# The updated cfnconfig will be used by post update custom scripts
template "#{node['cluster']['etc_dir']}/cfnconfig" do
source 'init/cfnconfig.erb'
cookbook 'aws-parallelcluster-environment'
mode '0644'
end
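
# fetch_dna_files is a custom resource defined elsewhere in these cookbooks;
# its :cleanup action removes the files fetched earlier for this update.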
fetch_dna_files 'Cleanup' do
  action :cleanup
end