in managementnode/lib/VCL/Module/Provisioning/xCAT.pm [181:454]
sub load {
my $self = shift;
if (ref($self) !~ /xCAT/i) {
notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method");
return;
}
# Get the data
my $reservation_id = $self->data->get_reservation_id();
my $image_name = $self->data->get_image_name();
my $image_reload_time_minutes = $self->data->get_image_reload_time() || 10;
my $computer_id = $self->data->get_computer_id();
my $computer_node_name = $self->data->get_computer_node_name();
my $management_node_hostname = $self->data->get_management_node_hostname();
insertloadlog($reservation_id, $computer_id, "startload", "$computer_node_name $image_name");
# Insert a computerloadlog record and edit nodetype table to set the image information for the computer
insertloadlog($reservation_id, $computer_id, "editnodetype", "updating nodetype table");
$self->_edit_nodetype($computer_node_name, $image_name) || return;
# Insert a computerloadlog record and edit nodelist table to set the xCAT groups for the computer
$self->_edit_nodelist($computer_node_name, $image_name) || return;
# Check to see if management node throttle is configured
my $throttle_limit = get_variable("xcat|throttle|$management_node_hostname", 0) || get_variable("$management_node_hostname|xcat|throttle", 0) || get_variable("xcat|throttle", 0);
if (!$throttle_limit || $throttle_limit !~ /^\d+$/) {
$throttle_limit = 10;
notify($ERRORS{'DEBUG'}, 0, "xCAT load throttle limit variable is NOT set in database: 'xcat|throttle', using default value: $throttle_limit");
}
else {
notify($ERRORS{'DEBUG'}, 0, "xCAT load throttle limit variable is set in database: $throttle_limit");
}
my $throttle_limit_wait_seconds = (30 * 60);
if (!$self->code_loop_timeout(sub{!$self->_is_throttle_limit_reached(@_)}, [$throttle_limit], 'checking throttle limit', $throttle_limit_wait_seconds, 1, 10)) {
notify($ERRORS{'WARNING'}, 0, "failed to load image due to throttle limit, waited $throttle_limit_wait_seconds seconds");
return;
}
# Set the computer to install on next boot
$self->_nodeset($computer_node_name, 'install') || return;
# Restart the node
$self->power_reset($computer_node_name) || return;
# Run lsdef to retrieve the node's configuration including its MAC address
my $node_info = $self->_lsdef($computer_node_name);
if (!$node_info) {
notify($ERRORS{'WARNING'}, 0, "unable to monitor loading of $computer_node_name, failed to retrieve node info");
return;
}
my $mac_address = $node_info->{mac};
if ($mac_address) {
notify($ERRORS{'DEBUG'}, 0, "retrieved MAC address of $computer_node_name: $mac_address");
}
else {
notify($ERRORS{'WARNING'}, 0, "unable to monitor loading of $computer_node_name, node info does not contain the MAC address:\n" . format_data($node_info));
return;
}
# nodeset changes xCAT state to 'install'
# node is power cycled or powered on (nodeset/nodestat status: install/noping)
# Wait for node to boot from network (may take from 30 seconds to several minutes if node is using UEFI)
# In /var/log/messages:, node makes DHCP request & requests PXE boot information from DHCP server running on management node:
# Apr 1 09:36:39 vclmgt dhcpd: DHCPDISCOVER from xx:xx:xx:xx:xx:xx via ethX
# Apr 1 09:36:39 vclmgt dhcpd: DHCPOFFER on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
# Apr 1 09:36:43 vclmgt dhcpd: DHCPREQUEST for 10.yy.yy.yy (10.mn.mn.mn) from xx:xx:xx:xx:xx:xx via ethX
# Apr 1 09:36:43 vclmgt dhcpd: DHCPACK on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
#
# Node requests PXE boot files from TFTP server running on management node:
# Apr 1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.0 to 10.yy.yy.yy:2070
# Apr 1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.0 to 10.yy.yy.yy:2071
# Apr 1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.cfg/xx-xx-xx-xx-xx-xx to 10.yy.yy.yy:57089
# Apr 1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.cfg/0A0A0132 to 10.yy.yy.yy:57090
# Apr 1 09:36:43 vclmgt atftpd[27522]: Serving xcat/rhel6/x86_64/vmlinuz to 10.yy.yy.yy:57091
# Apr 1 09:36:43 vclmgt atftpd[27522]: Serving xcat/rhel6/x86_64/initrd.img to 10.yy.yy.yy:57092
#
# Node boots using files downloaded from TFTP/PXE server, makes another DHCP request:
# Apr 1 09:37:15 vclmgt dhcpd: DHCPDISCOVER from xx:xx:xx:xx:xx:xx via ethX
# Apr 1 09:37:15 vclmgt dhcpd: DHCPOFFER on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
# Apr 1 09:37:15 vclmgt dhcpd: DHCPREQUEST for 10.yy.yy.yy (10.mn.mn.mn) from xx:xx:xx:xx:xx:xx via ethX
# Apr 1 09:37:15 vclmgt dhcpd: DHCPACK on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
# OS installation begins (nodeset/nodestat status: install/installing prep)
# If Kickstart, Linux packages are installed (nodestat status: 'installing <package> (x%)')
# If Kickstart, postscripts are installed (nodestat status: 'installing post scripts')
# When installation is complete, xCAT status is changed to 'boot' and node is restarted (nodeset/nodestat status: boot/noping)
# Node boots from hard drive (nodeset/nodestat status: boot/boot)
# Open the /var/log/messages file for reading
my $messages_file_path = '/var/log/messages';
my $log = IO::File->new($messages_file_path, "r");
if (!$log) {
my $error = $! || 'none';
notify($ERRORS{'WARNING'}, 0, "failed to open $messages_file_path for reading, error: $error");
return;
}
# Go to the end of the messages file
if (!$log->seek(0, SEEK_END)) {
my $error = $! || 'none';
notify($ERRORS{'CRITICAL'}, 0, "failed to seek end of $messages_file_path, error: $error");
}
insertloadlog($reservation_id, $computer_id, "xcatstage5", "loading image $image_name");
if ($image_reload_time_minutes < 10) {
$image_reload_time_minutes = 10;
}
my $nochange_timeout_seconds = ($image_reload_time_minutes * 60);
my $monitor_start_time = time;
my $last_change_time = $monitor_start_time;
my $nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
# Sanity check, timeout the load monitoring after a set amount of time
# This is done in case there is an endless loop which causes the node status to change over and over again
# Overall timeout is the lesser of 60 minutes or 2x image reload time
my $overall_timeout_minutes;
if ($image_reload_time_minutes < 30) {
$overall_timeout_minutes = 60;
}
else {
$overall_timeout_minutes = ($image_reload_time_minutes * 2);
}
my $overall_timeout_time = ($monitor_start_time + $overall_timeout_minutes * 60);
# Number of seconds to wait between checks
# Set to a short delay at the beginning of monitoring, this will be increased once installation start is detected
my $monitor_delay_seconds = 5;
# Keep track of when reservation.lastcheck was last updated
my $update_lastcheck_interval_seconds = 60;
my $update_lastcheck_time = time;
update_reservation_lastcheck($reservation_id);
my $previous_nodestat_status;
my $previous_nodeset_status;
my $current_time;
my $install_started = 0;
my $dhcp_ack = 0;
MONITOR_LOADING: while (($current_time = time) < $nochange_timeout_time && $current_time < $overall_timeout_time) {
my $total_elapsed_seconds = ($current_time - $monitor_start_time);
my $nochange_elapsed_seconds = ($current_time - $last_change_time);
my $nochange_remaining_seconds = ($nochange_timeout_time - $current_time);
my $overall_remaining_seconds = ($overall_timeout_time - $current_time);
notify($ERRORS{'DEBUG'}, 0, "monitoring $image_name loading on $computer_node_name\n" .
"seconds since monitor start/until unconditional timeout: $total_elapsed_seconds/$overall_remaining_seconds\n" .
"seconds since last change/until no change timeout: $nochange_elapsed_seconds/$nochange_remaining_seconds"
);
# Flag to set if anything changes
my $reset_timeout = 0;
# Check if any lines have shown in in /var/log/messages for the node
my @lines = $log->getlines;
my @dhcp_lines = grep(/dhcpd:.+DHCP.+\s$mac_address\s/i, @lines);
if (@dhcp_lines) {
if (grep(/DHCPREQUEST/i, @dhcp_lines)) {
insertloadlog($reservation_id, $computer_id, "xcatstage1", "requested DHCP lease");
}
if (my ($dhcpack_line) = grep(/DHCPACK/i, @dhcp_lines)) {
notify($ERRORS{'DEBUG'}, 0, "$computer_node_name acquired DHCP lease: '$dhcpack_line'");
if (!$dhcp_ack) {
insertloadlog($reservation_id, $computer_id, "xcatstage2", "acquired DHCP lease");
insertloadlog($reservation_id, $computer_id, "xcatround2", "waiting for boot flag");
$dhcp_ack=1;
}
}
$reset_timeout = 1;
notify($ERRORS{'DEBUG'}, 0, "DHCP activity detected in $messages_file_path:\n" . join("\n", @dhcp_lines));
}
# Get the current status of the node
# Set previous status to current status if this is the first iteration
my $current_nodestat_status = $self->_nodestat($computer_node_name);
$previous_nodestat_status = $current_nodestat_status if !defined($previous_nodestat_status);
my $current_nodeset_status = $self->_nodeset($computer_node_name, 'stat');
$previous_nodeset_status = $current_nodeset_status if !defined($previous_nodeset_status);
if (!$install_started) {
# Check if the installation has started
if ($current_nodestat_status =~ /(install|partimage)/i) {
# Slow down the monitor looping
$monitor_delay_seconds = 20;
notify($ERRORS{'DEBUG'}, 0, "installation has started, increasing wait between monitoring checks to $monitor_delay_seconds seconds");
$install_started = 1;
}
# If installation start was missed, nodeset will go from install to boot
if ($previous_nodeset_status =~ /install/i && $current_nodeset_status eq 'boot') {
notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is finished loading image, nodeset status changed: $previous_nodeset_status --> $current_nodeset_status");
insertloadlog($reservation_id, $computer_id, "bootstate", "$computer_node_name image load complete: $current_nodestat_status, $current_nodeset_status");
last MONITOR_LOADING;
}
}
else {
# nodestat will return 'sshd' if the computer is responding to SSH while it is being installed instead of the more detailed information
# Try to get the installation status directly using a socket
if ($current_nodestat_status eq 'sshd') {
$current_nodestat_status = $self->_get_install_status($computer_node_name) || 'sshd';
}
# Check if the installation has completed
if ($current_nodestat_status =~ /^(boot|complete)$/i || $current_nodeset_status =~ /^(boot)$/i) {
notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is finished loading image, current nodestat status: $current_nodestat_status, nodeset status: $current_nodeset_status");
insertloadlog($reservation_id, $computer_id, "bootstate", "$computer_node_name image load complete: $current_nodestat_status, $current_nodeset_status");
last MONITOR_LOADING;
}
}
# Check if the nodestat status changed from previous iteration
if ($current_nodestat_status ne $previous_nodestat_status || $current_nodeset_status ne $previous_nodeset_status) {
$reset_timeout = 1;
notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name changed");
# Set previous status to the current status
$previous_nodestat_status = $current_nodestat_status;
$previous_nodeset_status = $current_nodeset_status;
}
else {
notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name has not changed: $current_nodestat_status");
}
# If any changes were detected, reset the nochange timeout
if ($reset_timeout) {
$last_change_time = $current_time;
$nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
# Check how long ago reservation.lastcheck was updated
# Update it occasionally - used by parent reservation in cluster requests to detect that child reservations are still loading
# Updating reservation.lastcheck prevents the parent from timing out while waiting for children to finish loading
my $update_lastcheck_elapsed = ($current_time - $update_lastcheck_time);
if ($update_lastcheck_elapsed >= $update_lastcheck_interval_seconds) {
update_reservation_lastcheck($reservation_id);
$update_lastcheck_time = time;
}
}
#notify($ERRORS{'DEBUG'}, 0, "sleeping for $monitor_delay_seconds seconds");
sleep $monitor_delay_seconds;
}
$log->close;
# Check if timeout was reached
if ($current_time >= $nochange_timeout_time) {
notify($ERRORS{'WARNING'}, 0, "failed to load $image_name on $computer_node_name, timed out because no progress was detected for $nochange_timeout_seconds seconds, start of installation detected: " . ($install_started ? 'yes' : 'no'));
return;
}
elsif ($current_time >= $overall_timeout_time) {
notify($ERRORS{'CRITICAL'}, 0, "failed to load $image_name on $computer_node_name, timed out because loading took longer than $overall_timeout_minutes minutes, start of installation detected: " . ($install_started ? 'yes' : 'no'));
return;
}
# Call the OS module's post_load() subroutine if implemented
insertloadlog($reservation_id, $computer_id, "xcatround3", "initiating OS post-load configuration");
if ($self->os->can("post_load")) {
if ($self->os->post_load()) {
notify($ERRORS{'OK'}, 0, "performed OS post-load tasks on $computer_node_name");
}
else {
notify($ERRORS{'WARNING'}, 0, "failed to perform OS post-load tasks on VM $computer_node_name");
return;
}
}
else {
notify($ERRORS{'OK'}, 0, "OS post-load tasks not necessary on $computer_node_name");
}
return 1;
}