# frozen_string_literal: true
# Copyright:: 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.
require 'digest'

#
# Retrieve compute nodename from file
#
def slurm_nodename
  slurm_nodename_file = "#{node['cluster']['slurm_plugin_dir']}/slurm_nodename"
  IO.read(slurm_nodename_file).chomp
end
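
# NOTE: illustrative only: slurm_nodename returns the nodename written to
# <slurm_plugin_dir>/slurm_nodename, e.g. "queue1-st-compute2-1".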

#
# Retrieve compute and head node info from dynamo db (Slurm only)
#
def dynamodb_info(aws_connection_timeout_seconds: 30, aws_read_timeout_seconds: 60, shell_timeout_seconds: 300)
  output = Mixlib::ShellOut.new("#{cookbook_virtualenv_path}/bin/aws dynamodb " \
                                "--region #{node['cluster']['region']} query --table-name #{node['cluster']['slurm_ddb_table']} " \
                                "--index-name InstanceId --key-condition-expression 'InstanceId = :instanceid' " \
                                "--expression-attribute-values '{\":instanceid\": {\"S\":\"#{node['ec2']['instance_id']}\"}}' " \
                                "--projection-expression 'Id' " \
                                "--cli-connect-timeout #{aws_connection_timeout_seconds} " \
                                "--cli-read-timeout #{aws_read_timeout_seconds} " \
                                "--output text --query 'Items[0].[Id.S]'",
                                user: 'root',
                                timeout: shell_timeout_seconds).run_command.stdout.strip
  raise "Failed when retrieving Compute info from DynamoDB" if output.nil? || output.empty? || output == "None"

  slurm_nodename = output
  Chef::Log.info("Retrieved Slurm nodename is: #{slurm_nodename}")
  slurm_nodename
end
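
# NOTE: descriptive only: dynamodb_info looks up the current EC2 instance id in the
# cluster's DynamoDB table (via the "InstanceId" index) and returns the "Id" attribute,
# i.e. the Slurm nodename assigned to this compute node.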

#
# Verify if a given node name is a static node or a dynamic one (Slurm only)
#
def is_static_node?(nodename)
  # Match queue1-st-compute2-1 or queue1-st-compute2-[1-1000] format
  match = nodename.match(/^([a-z0-9\-]+)-(st|dy)-([a-z0-9\-]+)-\[?\d+[\-\d+]*\]?$/)
  raise "Failed when parsing Compute nodename: #{nodename}" if match.nil?

  match[2] == "st"
end
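
# NOTE: illustrative usage only:
#   is_static_node?("queue1-st-compute2-1")        #=> true  (static node)
#   is_static_node?("queue1-dy-compute2-[1-100]")  #=> false (dynamic node)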

def enable_munge_service
  service "munge" do
    supports restart: true
    action %i(enable start)
    retries 5
    retry_delay 10
  end unless on_docker?
end

def setup_munge_head_node
  # Generate munge key or get its value from Secrets Manager
  munge_key_manager 'manage_munge_key' do
    munge_key_secret_arn lazy {
      node['munge_key_secret_arn'] || node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :MungeKeySecretArn)
    }
  end
end

def update_munge_head_node
  munge_key_manager 'update_munge_key' do
    munge_key_secret_arn lazy { node['cluster']['config'].dig(:Scheduling, :SlurmSettings, :MungeKeySecretArn) }
    action :update_munge_key
    only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
  end
end

def setup_munge_key(shared_dir)
  bash 'get_munge_key' do
    user 'root'
    group 'root'
    code <<-MUNGE_KEY
      set -e
      # Copy munge key from shared dir
      cp #{shared_dir}/.munge/.munge.key /etc/munge/munge.key
      # Set ownership on the key
      chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
      # Enforce correct permission on the key
      chmod 0600 /etc/munge/munge.key
    MUNGE_KEY
    retries 5
    retry_delay 10
  end
end

def setup_munge_compute_node
  if kitchen_test?
    # FIXME: Mock munge key in shared directory.
    include_recipe 'aws-parallelcluster-slurm::mock_munge_key'
  end
  setup_munge_key(node['cluster']['shared_dir'])
  enable_munge_service
end

def setup_munge_login_node
  setup_munge_key(node['cluster']['shared_dir_login'])
  enable_munge_service
end

def get_primary_ip
  primary_ip = node['ec2']['local_ipv4']

  # TODO: We should use instance info stored in node['ec2'] by Ohai, rather than calling IMDS.
  # We cannot use MAC related data because we noticed a mismatch in the info returned by Ohai and IMDS.
  # In particular, the data returned by Ohai is missing the 'network-card' information.
  token = get_metadata_token
  macs = network_interface_macs(token)

  if macs.length > 1
    macs.each do |mac|
      mac_metadata_uri = "http://169.254.169.254/latest/meta-data/network/interfaces/macs/#{mac}"
      device_number = get_metadata_with_token(token, URI("#{mac_metadata_uri}/device-number"))
      network_card = get_metadata_with_token(token, URI("#{mac_metadata_uri}/network-card"))
      next unless device_number == '0' && network_card == '0'

      primary_ip = get_metadata_with_token(token, URI("#{mac_metadata_uri}/local-ipv4s"))
      break
    end
  end

  primary_ip
end
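
# NOTE: descriptive only: on multi-NIC instances get_primary_ip returns the local-ipv4 of the
# interface reported by IMDS with device-number 0 on network-card 0; otherwise it falls back
# to Ohai's node['ec2']['local_ipv4'].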

def get_target_group_name(cluster_name, pool_name)
  partial_cluster_name = cluster_name[0..6]
  partial_pool_name = pool_name[0..6]
  combined_name = cluster_name + pool_name
  hash_value = Digest::SHA256.hexdigest(combined_name)[0..15]
  "#{partial_cluster_name}-#{partial_pool_name}-#{hash_value}"
end
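
# NOTE: illustrative only: get_target_group_name("mycluster", "loginpool") returns
# "myclust-loginpo-<first 16 hex chars of SHA256("myclusterloginpool")>", i.e. the first
# 7 characters of each name plus a 16-character hash suffix.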

def validate_file_hash(file_path, expected_hash)
  hash_function = yield
  checksum = hash_function.file(file_path).hexdigest
  if checksum != expected_hash
    raise "Downloaded file #{file_path} checksum #{checksum} does not match expected checksum #{expected_hash}"
  end
end

def validate_file_md5_hash(file_path, expected_hash)
  validate_file_hash(file_path, expected_hash) do
    require 'digest'
    Digest::MD5
  end
end
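
# NOTE: illustrative usage only (path and digests are placeholders):
#   validate_file_md5_hash('/tmp/package.tar.gz', '<expected md5 hex digest>')
#   validate_file_hash('/tmp/package.tar.gz', '<expected sha256 hex digest>') { Digest::SHA256 }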

def wait_cluster_ready
  return if on_docker? || (kitchen_test? && !node['interact_with_ddb'])

  execute "Check cluster readiness" do
    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/head_node_checks/check_cluster_ready.py" \
            " --cluster-name #{node['cluster']['stack_name']}" \
            " --table-name parallelcluster-#{node['cluster']['stack_name']}" \
            " --config-version #{node['cluster']['cluster_config_version']}" \
            " --region #{node['cluster']['region']}"
    timeout 30
    retries 10
    retry_delay 90
  end
end
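
# Poll the compute fleet until static capacity is provisioned, aborting if the cluster
# enters PROTECTED mode because static node provisioning failed.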
def wait_static_fleet_running
  ruby_block "wait for static fleet capacity" do
    block do
      require 'chef/mixin/shell_out'
      require 'shellwords'
      require 'json'

      def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/NestedMethodDefinition
        begin
          cluster_state_json = shell_out!("/bin/bash -c #{fleet_status_command}").stdout.strip
          cluster_state = JSON.load(cluster_state_json)
        rescue
          Chef::Log.warn("Unable to get compute fleet status")
          return
        end

        if cluster_state.empty?
          Chef::Log.info("Compute fleet status is empty")
          return
        end

        raise "Cluster has been set to PROTECTED mode due to failures detected in static node provisioning" if cluster_state["status"] == "PROTECTED"
      end

      fleet_status_command = Shellwords.escape(
        "/usr/local/bin/get-compute-fleet-status.sh"
      )

      # Example output for sinfo
      # sinfo -h -o '%N %t'
      # queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~
      # queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle
      until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
        check_for_protected_mode(fleet_status_command)

        Chef::Log.info("Waiting for static fleet capacity provisioning")
        sleep(15)
      end

      Chef::Log.info("Static fleet capacity is ready")
    end
  end
end

def get_login_node_pool_config(config, pool_name)
  config['LoginNodes']['Pools'].find { |pool| pool['Name'] == pool_name }
end
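
# NOTE: illustrative usage only ('loginpool' is a placeholder pool name):
#   pool = get_login_node_pool_config(node['cluster']['config'], 'loginpool')
#   # pool is the matching entry from LoginNodes/Pools, or nil if no pool has that name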