sub load()

in managementnode/lib/VCL/Module/Provisioning/xCAT.pm [181:454]


sub load {
	my $self = shift;
	if (ref($self) !~ /xCAT/i) {
		notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method");
		return;
	}

	# Get the data
	my $reservation_id             = $self->data->get_reservation_id();
	my $image_name                 = $self->data->get_image_name();
	my $image_reload_time_minutes  = $self->data->get_image_reload_time() || 10;
	my $computer_id                = $self->data->get_computer_id();
	my $computer_node_name         = $self->data->get_computer_node_name();
	my $management_node_hostname   = $self->data->get_management_node_hostname();
	
	insertloadlog($reservation_id, $computer_id, "startload", "$computer_node_name $image_name");
	
	# Insert a computerloadlog record and edit nodetype table to set the image information for the computer
	insertloadlog($reservation_id, $computer_id, "editnodetype", "updating nodetype table");
	$self->_edit_nodetype($computer_node_name, $image_name) || return;
	
	# Insert a computerloadlog record and edit nodelist table to set the xCAT groups for the computer
	$self->_edit_nodelist($computer_node_name, $image_name) || return;
	
	# Check to see if management node throttle is configured
	my $throttle_limit = get_variable("xcat|throttle|$management_node_hostname", 0) || get_variable("$management_node_hostname|xcat|throttle", 0) || get_variable("xcat|throttle", 0);
	if (!$throttle_limit || $throttle_limit !~ /^\d+$/) {
		$throttle_limit = 10;
		notify($ERRORS{'DEBUG'}, 0, "xCAT load throttle limit variable is NOT set in database: 'xcat|throttle', using default value: $throttle_limit");
	}
	else {
		notify($ERRORS{'DEBUG'}, 0, "xCAT load throttle limit variable is set in database: $throttle_limit");
	}
	
	my $throttle_limit_wait_seconds = (30 * 60);
	if (!$self->code_loop_timeout(sub{!$self->_is_throttle_limit_reached(@_)}, [$throttle_limit], 'checking throttle limit', $throttle_limit_wait_seconds, 1, 10)) {
		notify($ERRORS{'WARNING'}, 0, "failed to load image due to throttle limit, waited $throttle_limit_wait_seconds seconds");
		return;
	}
	
	# Set the computer to install on next boot
	$self->_nodeset($computer_node_name, 'install') || return;
	
	# Restart the node
	$self->power_reset($computer_node_name) || return;
	
	# Run lsdef to retrieve the node's configuration including its MAC address
	my $node_info = $self->_lsdef($computer_node_name);
	if (!$node_info) {
		notify($ERRORS{'WARNING'}, 0, "unable to monitor loading of $computer_node_name, failed to retrieve node info");
		return;
	}
	my $mac_address = $node_info->{mac};
	if ($mac_address) {
		notify($ERRORS{'DEBUG'}, 0, "retrieved MAC address of $computer_node_name: $mac_address");
	}
	else {
		notify($ERRORS{'WARNING'}, 0, "unable to monitor loading of $computer_node_name, node info does not contain the MAC address:\n" . format_data($node_info));
		return;
	}
	
	# nodeset changes xCAT state to 'install'
	# node is power cycled or powered on (nodeset/nodestat status: install/noping)
	# Wait for node to boot from network (may take from 30 seconds to several minutes if node is using UEFI)
	# In /var/log/messages:, node makes DHCP request & requests PXE boot information from DHCP server running on management node:
	#   Apr  1 09:36:39 vclmgt dhcpd: DHCPDISCOVER from xx:xx:xx:xx:xx:xx via ethX
	#   Apr  1 09:36:39 vclmgt dhcpd: DHCPOFFER on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
	#   Apr  1 09:36:43 vclmgt dhcpd: DHCPREQUEST for 10.yy.yy.yy (10.mn.mn.mn) from xx:xx:xx:xx:xx:xx via ethX
	#   Apr  1 09:36:43 vclmgt dhcpd: DHCPACK on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
	#
	# Node requests PXE boot files from TFTP server running on management node:
	#   Apr  1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.0 to 10.yy.yy.yy:2070
	#   Apr  1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.0 to 10.yy.yy.yy:2071
	#   Apr  1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.cfg/xx-xx-xx-xx-xx-xx to 10.yy.yy.yy:57089
	#   Apr  1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.cfg/0A0A0132 to 10.yy.yy.yy:57090
	#   Apr  1 09:36:43 vclmgt atftpd[27522]: Serving xcat/rhel6/x86_64/vmlinuz to 10.yy.yy.yy:57091
	#   Apr  1 09:36:43 vclmgt atftpd[27522]: Serving xcat/rhel6/x86_64/initrd.img to 10.yy.yy.yy:57092
	#
	# Node boots using files downloaded from TFTP/PXE server, makes another DHCP request:
	#   Apr  1 09:37:15 vclmgt dhcpd: DHCPDISCOVER from xx:xx:xx:xx:xx:xx via ethX
	#   Apr  1 09:37:15 vclmgt dhcpd: DHCPOFFER on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
	#   Apr  1 09:37:15 vclmgt dhcpd: DHCPREQUEST for 10.yy.yy.yy (10.mn.mn.mn) from xx:xx:xx:xx:xx:xx via ethX
	#   Apr  1 09:37:15 vclmgt dhcpd: DHCPACK on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
	# OS installation begins (nodeset/nodestat status: install/installing prep)
	# If Kickstart, Linux packages are installed (nodestat status: 'installing <package> (x%)')
	# If Kickstart, postscripts are installed (nodestat status: 'installing post scripts')
	# When installation is complete, xCAT status is changed to 'boot' and node is restarted (nodeset/nodestat status: boot/noping)
	# Node boots from hard drive (nodeset/nodestat status: boot/boot)
	
	# Open the /var/log/messages file for reading
	my $messages_file_path = '/var/log/messages';
	my $log = IO::File->new($messages_file_path, "r");
	if (!$log) {
		my $error = $! || 'none';
		notify($ERRORS{'WARNING'}, 0, "failed to open $messages_file_path for reading, error: $error");
		return;
	}
	# Go to the end of the messages file
	if (!$log->seek(0, SEEK_END)) {
		my $error = $! || 'none';
		notify($ERRORS{'CRITICAL'}, 0, "failed to seek end of $messages_file_path, error: $error");
	}

	insertloadlog($reservation_id, $computer_id, "xcatstage5", "loading image $image_name");
	
	if ($image_reload_time_minutes < 10) {
		$image_reload_time_minutes = 10;
	}
	my $nochange_timeout_seconds = ($image_reload_time_minutes * 60);
	
	my $monitor_start_time = time;
	my $last_change_time = $monitor_start_time;
	my $nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
	
	# Sanity check, timeout the load monitoring after a set amount of time
	# This is done in case there is an endless loop which causes the node status to change over and over again
	# Overall timeout is the lesser of 60 minutes or 2x image reload time
	my $overall_timeout_minutes;
	if ($image_reload_time_minutes < 30) {
		$overall_timeout_minutes = 60;
	}
	else {
		$overall_timeout_minutes = ($image_reload_time_minutes * 2);
	}
	my $overall_timeout_time = ($monitor_start_time + $overall_timeout_minutes * 60);
	
	# Number of seconds to wait between checks
	# Set to a short delay at the beginning of monitoring, this will be increased once installation start is detected
	my $monitor_delay_seconds = 5;
	
	# Keep track of when reservation.lastcheck was last updated
	my $update_lastcheck_interval_seconds = 60;
	my $update_lastcheck_time = time;
	update_reservation_lastcheck($reservation_id);
	
	my $previous_nodestat_status;
	my $previous_nodeset_status;
	my $current_time;
	my $install_started = 0;
	my $dhcp_ack = 0;
	MONITOR_LOADING: while (($current_time = time) < $nochange_timeout_time && $current_time < $overall_timeout_time) {
		my $total_elapsed_seconds = ($current_time - $monitor_start_time);
		my $nochange_elapsed_seconds = ($current_time - $last_change_time);
		my $nochange_remaining_seconds = ($nochange_timeout_time - $current_time);
		my $overall_remaining_seconds = ($overall_timeout_time - $current_time);
		notify($ERRORS{'DEBUG'}, 0, "monitoring $image_name loading on $computer_node_name\n" .
			"seconds since monitor start/until unconditional timeout: $total_elapsed_seconds/$overall_remaining_seconds\n" .
			"seconds since last change/until no change timeout: $nochange_elapsed_seconds/$nochange_remaining_seconds"
		);
		
		# Flag to set if anything changes
		my $reset_timeout = 0;
		
		# Check if any lines have shown in in /var/log/messages for the node
		my @lines = $log->getlines;
		my @dhcp_lines = grep(/dhcpd:.+DHCP.+\s$mac_address\s/i, @lines);
		if (@dhcp_lines) {
			if (grep(/DHCPREQUEST/i, @dhcp_lines)) {
				insertloadlog($reservation_id, $computer_id, "xcatstage1", "requested DHCP lease");
			}
			
			if (my ($dhcpack_line) = grep(/DHCPACK/i, @dhcp_lines)) {
				notify($ERRORS{'DEBUG'}, 0, "$computer_node_name acquired DHCP lease: '$dhcpack_line'");
				if (!$dhcp_ack) {
					insertloadlog($reservation_id, $computer_id, "xcatstage2", "acquired DHCP lease");
					insertloadlog($reservation_id, $computer_id, "xcatround2", "waiting for boot flag");
					$dhcp_ack=1;
				}
			}
			
			$reset_timeout = 1;
			notify($ERRORS{'DEBUG'}, 0, "DHCP activity detected in $messages_file_path:\n" . join("\n", @dhcp_lines));
		}
		
		# Get the current status of the node
		# Set previous status to current status if this is the first iteration
		my $current_nodestat_status = $self->_nodestat($computer_node_name);
		$previous_nodestat_status = $current_nodestat_status if !defined($previous_nodestat_status);
		
		my $current_nodeset_status = $self->_nodeset($computer_node_name, 'stat');
		$previous_nodeset_status = $current_nodeset_status if !defined($previous_nodeset_status);
		
		if (!$install_started) {
			# Check if the installation has started
			if ($current_nodestat_status =~ /(install|partimage)/i) {
				# Slow down the monitor looping
				$monitor_delay_seconds = 20;
				notify($ERRORS{'DEBUG'}, 0, "installation has started, increasing wait between monitoring checks to $monitor_delay_seconds seconds");
				$install_started = 1;
			}
			
			# If installation start was missed, nodeset will go from install to boot
			if ($previous_nodeset_status =~ /install/i && $current_nodeset_status eq 'boot') {
				notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is finished loading image, nodeset status changed: $previous_nodeset_status --> $current_nodeset_status");
				insertloadlog($reservation_id, $computer_id, "bootstate", "$computer_node_name image load complete: $current_nodestat_status, $current_nodeset_status");
				last MONITOR_LOADING;
			}
		}
		else {
			# nodestat will return 'sshd' if the computer is responding to SSH while it is being installed instead of the more detailed information
			# Try to get the installation status directly using a socket
			if ($current_nodestat_status eq 'sshd') {
				$current_nodestat_status = $self->_get_install_status($computer_node_name) || 'sshd';
			}
			
			# Check if the installation has completed
			if ($current_nodestat_status =~ /^(boot|complete)$/i || $current_nodeset_status =~ /^(boot)$/i) {
				notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is finished loading image, current nodestat status: $current_nodestat_status, nodeset status: $current_nodeset_status");
				insertloadlog($reservation_id, $computer_id, "bootstate", "$computer_node_name image load complete: $current_nodestat_status, $current_nodeset_status");
				last MONITOR_LOADING;
			}
		}
		
		# Check if the nodestat status changed from previous iteration
		if ($current_nodestat_status ne $previous_nodestat_status || $current_nodeset_status ne $previous_nodeset_status) {
			$reset_timeout = 1;
			notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name changed");
			
			# Set previous status to the current status
			$previous_nodestat_status = $current_nodestat_status;
			$previous_nodeset_status = $current_nodeset_status;
		}
		else {
			notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name has not changed: $current_nodestat_status");
		}
		
		# If any changes were detected, reset the nochange timeout
		if ($reset_timeout) {
			$last_change_time = $current_time;
			$nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
			
			# Check how long ago reservation.lastcheck was updated
			# Update it occasionally - used by parent reservation in cluster requests to detect that child reservations are still loading
			# Updating reservation.lastcheck prevents the parent from timing out while waiting for children to finish loading
			my $update_lastcheck_elapsed = ($current_time - $update_lastcheck_time);
			if ($update_lastcheck_elapsed >= $update_lastcheck_interval_seconds) {
				update_reservation_lastcheck($reservation_id);
				$update_lastcheck_time = time;
			}
		}
		
		#notify($ERRORS{'DEBUG'}, 0, "sleeping for $monitor_delay_seconds seconds");
		sleep $monitor_delay_seconds;
	}
	
	$log->close;
	
	# Check if timeout was reached
	if ($current_time >= $nochange_timeout_time) {
		notify($ERRORS{'WARNING'}, 0, "failed to load $image_name on $computer_node_name, timed out because no progress was detected for $nochange_timeout_seconds seconds, start of installation detected: " . ($install_started ? 'yes' : 'no'));
		return;
	}
	elsif ($current_time >= $overall_timeout_time) {
		notify($ERRORS{'CRITICAL'}, 0, "failed to load $image_name on $computer_node_name, timed out because loading took longer than $overall_timeout_minutes minutes, start of installation detected: " . ($install_started ? 'yes' : 'no'));
		return;
	}
	
	# Call the OS module's post_load() subroutine if implemented
	insertloadlog($reservation_id, $computer_id, "xcatround3", "initiating OS post-load configuration");
	if ($self->os->can("post_load")) {
		if ($self->os->post_load()) {
			notify($ERRORS{'OK'}, 0,  "performed OS post-load tasks on $computer_node_name");
		}
		else {
			notify($ERRORS{'WARNING'}, 0, "failed to perform OS post-load tasks on VM $computer_node_name");
			return;
		}
	}
	else {
		notify($ERRORS{'OK'}, 0, "OS post-load tasks not necessary on $computer_node_name");
	}
	
	return 1;
}