cookbooks/aws-parallelcluster-platform/libraries/write_chef_error_handler.rb (44 lines of code) (raw):
# frozen_string_literal: true
#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
require 'chef/handler'
module WriteChefError
  # Chef report handler that records a human-readable failure summary for the head node.
  #
  # The summary is written to the file configured at node['cluster']['bootstrap_error_path'],
  # but only if that file does not exist yet.
  class WriteHeadNodeChefError < Chef::Handler
    # Message used when no more specific description of the failure is available.
    GENERIC_ERROR = 'Failed to run chef recipe.'

    # Hint telling the user where to find the chef-client logs.
    LOGS_TO_CHECK =
      'Please check /var/log/chef-client.log in the head node, or check the chef-client.log in CloudWatch logs.'

    # Pointer to the troubleshooting documentation.
    TROUBLESHOOTING_LINK = 'Please refer to'\
      ' https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html'\
      ' for more details.'

    # Maps the name of a mount-related resource to the error message displayed for it.
    MOUNT_MESSAGES = {
      'add ebs' => 'Failed to mount EBS volume.',
      'add raid' => 'Failed to mount RAID array.',
      'mount efs' => 'Failed to mount EFS.',
      'mount fsx' => 'Failed to mount FSX.',
    }.freeze

    # Exception text raised when the cluster has been switched to PROTECTED mode.
    PROTECTED_MODE_EXCEPTION =
      'Cluster has been set to PROTECTED mode due to failures detected in static node provisioning'

    # Maps an exact exception message to the error message displayed for it.
    EXCEPTION_MESSAGES = {
      PROTECTED_MODE_EXCEPTION => "#{PROTECTED_MODE_EXCEPTION}.",
    }.freeze

    # Writes the failure summary, the log hint and the troubleshooting link (space separated, newline
    # terminated) to the bootstrap error file.
    #
    # Nothing is done when the file already exists, to avoid overwriting an error message stored by
    # another mechanism, such as the deprecated BYOS handler.
    #
    # @return [void]
    def report
      error_file = node['cluster']['bootstrap_error_path']
      return if File.exist?(error_file)

      # Written directly rather than through `echo '...' > '...'`: a quote character in the message or in
      # the path can neither break the command nor be interpreted by a shell.
      File.write(error_file, "#{failure_summary} #{LOGS_TO_CHECK} #{TROUBLESHOOTING_LINK}\n")
    rescue SystemCallError, IOError => e
      # Best effort: the run has already failed, so only make the write problem visible in the log.
      Chef::Log.warn("Unable to write the bootstrap error file: #{e.message}")
    end

    # Returns the location where the failed resource was declared, formatted for inclusion in a sentence.
    #
    # Relies on the built-in `defined_at` of Chef::Resource.
    # Reference: https://github.com/cinc-project/chef/blob/stable/cinc/lib/chef/resource.rb#L1436
    #
    # @param action_record [#new_resource] record of the failed action
    # @return [String] the location prefixed with a space, or an empty string when `defined_at` reports
    #   "dynamically defined"
    def get_recipe_info(action_record)
      location = action_record.new_resource.defined_at
      location == 'dynamically defined' ? '' : " #{location}"
    end

    private

    # Picks the message describing the failure of the outermost failed resource.
    #
    # Only the first failed action record with nesting_level 0 is considered, on the assumption that there
    # is a single such record. Precedence: known exception message, then known mount-related resource
    # name, then the generic message extended with the recipe location.
    #
    # @return [String] GENERIC_ERROR when no such record is available
    def failure_summary
      # Failed action records are obtained through the chef function filtered_collection.
      # Reference: https://github.com/cinc-project/chef/blob/stable/cinc/lib/chef/action_collection.rb#L107
      # NOTE(review): the safe navigation assumes the collection may be missing when the run aborts
      # early; confirm against the Chef version in use.
      failed_actions = action_collection&.filtered_collection(
        up_to_date: false, skipped: false, updated: false, failed: true, unprocessed: false
      ) || []
      outermost = failed_actions.find { |record| record.nesting_level == 0 }
      return GENERIC_ERROR unless outermost

      EXCEPTION_MESSAGES[outermost.exception&.message] ||
        MOUNT_MESSAGES[outermost.new_resource.name] ||
        "Failed to run chef recipe#{get_recipe_info(outermost)}."
    end
  end
end