managementnode/lib/VCL/healthcheck.pm (404 lines of code) (raw):

#!/usr/bin/perl -w
###############################################################################
# $Id$
###############################################################################
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################

=head1 NAME

VCL::healthcheck

=head1 SYNOPSIS

 use base qw(VCL::healthcheck);

=head1 DESCRIPTION

 Walks every computer controlled by this management node, probes it with
 ping/nmap/ssh, updates the computer's lastcheck time and state through the
 VCL::utils helpers, and mails a summary report to the sysadmin address of
 the management node.

=cut

###############################################################################
package VCL::healthcheck;

# Specify the lib path using FindBin
use FindBin;
use lib "$FindBin::Bin/..";

# Configure inheritance
use base qw();

# Specify the version of this module
our $VERSION = '2.5.1';

# Specify the version of Perl to use
use 5.008000;

use strict;
use warnings;
use diagnostics;
use English qw(-no_match_vars);

use VCL::utils;
use VCL::DataStructure;
#use VCL::Module::Provisioning::xCAT2;
use DBI;

###############################################################################

=head1 OBJECT METHODS

=cut

#//////////////////////////////////////////////////////////////////////////////

#----------GLOBALS--------------
our $LOG = "/var/log/healthcheckvcl.log";
our $MYDBH;
set_logfile_path($LOG);

# A node whose lastcheck time is at most this many minutes old is skipped
my $RECHECK_INTERVAL_MINUTES = 90;

#//////////////////////////////////////////////////////////////////////////////

=head2 new

 Parameters  : %input - optional key/value pairs stored in the object
 Returns     : blessed hash reference
 Description : Constructs the healthcheck object and calls _initialize() to
               load the management node and computer information.

=cut

sub new {
	my ($class, %input) = @_;
	my $obj_ref = {%input};
	bless $obj_ref, $class;
	$obj_ref->_initialize();
	return $obj_ref;
}

#//////////////////////////////////////////////////////////////////////////////

=head2 _initialize

 Parameters  : none (called as a method)
 Returns     : nothing meaningful; exits the process if the management node
               information or its computers cannot be retrieved
 Description : Stores the report header, the log file path, the management
               node information ({managementnode}) and the computers
               controlled by this management node ({computertable}).

=cut

sub _initialize {
	my ($info) = @_;

	my $date_time = convert_to_datetime();
	notify($ERRORS{'OK'}, 0, "########### healthcheck run $date_time #################");

	$info->{"globalmsg"}->{"header"} = "STATUS SUMMARY of VCL nodes:\n\n";
	$info->{"logfile"} = $LOG;

	if ($info->{managementnode} = get_management_node_info()) {
		notify($ERRORS{'OK'}, 0, "retrieved management node information from database");
	}
	else {
		notify($ERRORS{'WARNING'}, 0, "unable to retrieve management node information from database");
		exit;
	}

	#2 Collect hash of computers I can control with data
	if ($info->{computertable} = get_computers_controlled_by_mn(%{$info->{managementnode}})) {
		notify($ERRORS{'OK'}, 0, "retrieved management node resource groups from database");
	}
	else {
		notify($ERRORS{'WARNING'}, 0, "unable to retrieve management node resource groups from database");
		exit;
	}
} ### end sub _initialize

#//////////////////////////////////////////////////////////////////////////////

=head2 _short_hostname

 Parameters  : $hostname - host name string, may be undefined
 Returns     : string - the leading run of [-_a-zA-Z0-9] characters of the
               host name (everything before the first dot for a normal FQDN);
               empty string if the host name is undefined
 Description : Shared helper used by process() and powerdown_event().

=cut

sub _short_hostname {
	my ($hostname) = @_;
	return '' if !defined($hostname);

	# The pattern can match the empty string, so the capture is always defined
	my ($short_name) = $hostname =~ /^([-_a-zA-Z0-9]*)/;
	return $short_name;
}

#//////////////////////////////////////////////////////////////////////////////

=head2 process

 Parameters  : $powerdownstage - optional; "available" or "all" places the
               management node in maintenance and powers nodes down, a value
               beginning with "restore" sets the management node back to
               available, anything else (or nothing) runs the standard checks
 Returns     : 1
 Description : Loops over every computer in {computertable}. Computers that
               are not available/failed, or that were checked within the
               recheck interval, are skipped. The remaining nodes are probed
               with ping, nmap and ssh, and the lastcheck time and state are
               updated according to the result. Nodes found offline are
               appended to {globalmsg}{failedbody} for send_report().

=cut

sub process {
	my ($info, $powerdownstage) = @_;

	# A normal run passes no stage; avoid matching against an undefined value
	$powerdownstage = '' if !defined($powerdownstage);

	$info->{"globalmsg"}->{"body"} = "Summary of VCL node monitoring system:\n\n";

	my $mn_hostname = $info->{managementnode}->{hostname};

	if ($powerdownstage =~ /^(available|all)$/) {
		notify($ERRORS{'WARNING'}, 0, "ALERT: powerdown stage triggered,placing MN $mn_hostname in maintenance");
		if (set_managementnode_state($info->{managementnode}, "maintenance")) {
			notify($ERRORS{'OK'}, 0, "Successfully set $mn_hostname into maintenance");
		}
		else {
			notify($ERRORS{'WARNING'}, 0, "Failed to set $mn_hostname into maintenance");
		}
	}
	elsif ($powerdownstage =~ /^restore/) {
		notify($ERRORS{'WARNING'}, 0, "ALERT: Environment OK: restoring state of MN $mn_hostname in available");
		if (set_managementnode_state($info->{managementnode}, "available")) {
			notify($ERRORS{'OK'}, 0, "Successfully set $mn_hostname into available");
		}
		else {
			notify($ERRORS{'WARNING'}, 0, "Failed to set $mn_hostname into available");
		}
	}

	COMPUTER: foreach my $cid (keys %{$info->{computertable}}) {
		my $computer_id = $cid;

		# Create a DataStructure object containing data for the computer.
		# A failure for one computer must not abort the check of all the others.
		my $data;
		eval {
			$data = VCL::DataStructure->new({computer_identifier => $computer_id});
		};
		if ($EVAL_ERROR) {
			notify($ERRORS{'WARNING'}, 0, "failed to create DataStructure object for computer ID: $computer_id, error: $EVAL_ERROR");
			next COMPUTER;
		}
		elsif (!$data) {
			notify($ERRORS{'WARNING'}, 0, "failed to create DataStructure object for computer ID: $computer_id, DataStructure object is not defined");
			next COMPUTER;
		}

		my $computer_state             = $data->get_computer_state_name();
		my $last_check                 = $data->get_computer_lastcheck_time();
		my $computer_currentimage_name = $data->get_computer_currentimage_name();

		# Only perform actions on available or failed computers
		# skip if is inuse, maintenance, tovmhost, etc.
		if ($computer_state !~ /available|failed/) {
			$info->{computers}->{$cid}->{"skip"} = 1;
			$info->{"computersskipped"} += 1;
			next COMPUTER;
		}

		# Check the lastcheck timestamp; failed nodes are always re-checked
		if (defined($last_check) && $computer_state !~ /failed/) {
			my $lastcheckepoch  = convert_to_epoch_seconds($last_check);
			my $currentimeepoch = convert_to_epoch_seconds();
			my $delta           = $currentimeepoch - $lastcheckepoch;

			# Epoch values are in seconds: divide by 60 to get minutes
			my $delta_minutes = round($delta / 60);

			if ($delta_minutes <= $RECHECK_INTERVAL_MINUTES) {
				# This node was recently checked
				$info->{computers}->{$cid}->{"skip"} = 1;
				$info->{"computersskipped"} += 1;
				next COMPUTER;
			}
			$info->{"computerschecked"} += 1;
		}

		my $computer_hostname   = $data->get_computer_host_name();
		my $computer_short_name = _short_hostname($computer_hostname);
		my $computer_type       = $data->get_computer_type();

		# Lab machines are addressed by their full host name
		if ($computer_type eq "lab") {
			$computer_short_name = $computer_hostname;
		}

		# Pessimistic defaults: the node is assumed down until a probe proves otherwise
		my %node_status = (
			ping             => 0,
			ssh              => 0,
			ssh_status       => "off",
			status           => "reload",
			imagerevision_id => 0,
			currentimage     => 0,
			current_image_id => 0,
			vmstate          => "off",
			rpower           => "off",
		);

		my $datestring;
		my $node_status_string = "reload";

		notify($ERRORS{'OK'}, 0, "pinging node $computer_short_name ");
		if (_pingnode($computer_short_name)) {
			$node_status{ping} = 1;

			# Try nmap to see if any of the ssh ports are open before attempting to run a test command
			my $port_22_status = nmap_port($computer_short_name, 22) ? "open" : "closed";
			my $port_24_status = nmap_port($computer_short_name, 24) ? "open" : "closed";

			# Port 24 is preferred whenever it is open
			my $port = ($port_24_status eq "open") ? 24 : 22;

			my $ssh_user = ($computer_type eq "lab") ? "vclstaff" : "root";

			my (undef, $output) = run_ssh_command({
				node            => $computer_short_name,
				command         => "echo \"testing ssh on $computer_short_name\"",
				max_attempts    => 2,
				output_level    => 0,
				port            => $port,
				user            => $ssh_user,
				timeout_seconds => 30,
			});

			# ssh is considered up when the echoed test string comes back
			my $sshd_status = "off";
			if (defined($output) && grep(/testing/, @$output)) {
				notify($ERRORS{'OK'}, 0, "ssh test: Successful");
				$sshd_status = "on";
			}
			else {
				notify($ERRORS{'OK'}, 0, "ssh test: failed. port 22: $port_22_status, port 24: $port_24_status");
			}

			if ($sshd_status eq "on") {
				$node_status{"ssh"} = 1;

				if ($computer_type eq "lab") {
					$node_status_string = "ready";
					$node_status{status} = "ready";
					# NOTE(review): this skips the lastcheck/state update below for
					# reachable lab nodes; behavior kept as found - confirm it is intended.
					next COMPUTER;
				}

				my @currentimage_txt_contents = get_current_image_contents_no_data_structure($computer_short_name);
				foreach my $line (@currentimage_txt_contents) {
					if ($line =~ /imagerevision_id/i) {
						my (undef, $imagerevision_id) = split(/=/, $line, 2);
						if (defined($imagerevision_id)) {
							# Strip the line ending and any stray whitespace (e.g. CR from CRLF files)
							$imagerevision_id =~ s/^\s+|\s+$//g;
							$node_status{imagerevision_id} = $imagerevision_id;
						}
						$node_status_string = "post_load";
						$node_status{status} = "post_load";
					}
					if ($line =~ /vcld_post_load/) {
						$node_status_string = "ready";
						$node_status{status} = "ready";
					}
				}

				if ($node_status{imagerevision_id}) {
					# Get image info using imagerevision_id as identifier
					my $image_info = get_imagerevision_info($node_status{imagerevision_id}, 0);
					if ($image_info) {
						$node_status{"currentimage"}     = $image_info->{imagename};
						$node_status{"current_image_id"} = $image_info->{imageid};
						$node_status{"imagerevision_id"} = $image_info->{id};
					}
					else {
						notify($ERRORS{'WARNING'}, 0, "unable to retrieve image revision info for imagerevision_id $node_status{imagerevision_id}");
					}
					$node_status{"vmstate"} = "on";
					$node_status{"rpower"}  = "on";
				}
			}
		}

		# Need to pass some of the management node info to the investigator/powerdown routines
		$info->{computertable}->{$cid}->{"managementnode"} = $info->{managementnode};
		$info->{computertable}->{$cid}->{"logfile"}        = $info->{logfile};

		notify($ERRORS{'OK'}, 0, "hostname:$computer_hostname cid:$cid type:$computer_type state:$computer_state");
		notify($ERRORS{'OK'}, 0, "$computer_hostname currentimage:$node_status{currentimage} current_image_id:$node_status{current_image_id}");
		notify($ERRORS{'OK'}, 0, "$computer_hostname imagerevision_id:$node_status{imagerevision_id}");
		notify($ERRORS{'OK'}, 0, "$computer_hostname vmstate:$node_status{vmstate} power:$node_status{rpower} status:$node_status{status}");

		# Collect current state of node - it could have changed since we started
		if (my $comp_current_state = get_computer_current_state_name($cid)) {
			$info->{computertable}->{$cid}->{computer}->{state}->{name} = $comp_current_state;
			$computer_state = $comp_current_state;
		}
		else {
			# Could not get it, use existing data
			notify($ERRORS{'OK'}, 0, "could not retrieve current computer state cid= $cid, using old data");
		}

		# Check for powerdown stages
		if ($powerdownstage =~ /^(available|all)$/) {
			$info->{computertable}->{$cid}->{"powerdownstage"} = $powerdownstage;
			if (powerdown_event($info->{computertable}->{$cid})) {
				notify($ERRORS{'OK'}, 0, "Successfully powered down $computer_hostname");
			}
			next COMPUTER;
		}

		# Count the nodes processed
		$info->{"computercount"} += 1;

		if ($node_status_string =~ /(^ready)|(post_load)/i) {
			notify($ERRORS{'OK'}, 0, "nodestatus reports $node_status_string for $computer_hostname");

			# Update lastcheck datetime
			$datestring = makedatestring();
			if (update_computer_lastcheck($computer_id, $datestring, 0)) {
				notify($ERRORS{'OK'}, 0, "updated lastcheckin for $computer_hostname");
			}

			# Update state to available if old state is failed
			if ($computer_state =~ /failed/i) {
				if (update_computer_state($computer_id, "available", 0)) {
					notify($ERRORS{'OK'}, 0, "updated state to available for $computer_hostname");
				}
			}
		}
		elsif ($node_status_string =~ /^reload/i) {
			$info->{computertable}->{$cid}->{node_status}                  = \%node_status;
			$info->{computertable}->{$cid}->{"computer_currentimage_name"} = $computer_currentimage_name;
			$info->{computertable}->{$cid}->{"computer_hostname"}          = $computer_hostname;

			notify($ERRORS{'OK'}, 0, "nodestatus reports $node_status_string for $computer_hostname");

			# Additional, type specific checks decide whether the node may stay available
			my $node_available = 0;

			if ($computer_type eq "lab") {
				# No additional checks required for lab type
				$node_available = 1;
			}
			elsif ($computer_type eq "virtualmachine") {
				if (_virtualmachine_investigator($info->{computertable}->{$cid})) {
					$node_available = 1;
				}
			}
			elsif ($computer_type eq "blade") {
				if (_blade_investigator($info->{computertable}->{$cid})) {
					$node_available = 1;
				}
			}

			if ($node_available) {
				# Update state to available
				if (update_computer_state($computer_id, "available", 0)) {
					notify($ERRORS{'OK'}, 0, "updated state to available for $computer_hostname");
				}

				# Update lastcheck datetime
				$datestring = makedatestring();
				if (update_computer_lastcheck($computer_id, $datestring, 0)) {
					notify($ERRORS{'OK'}, 0, "updated lastcheckin for $computer_hostname");
				}
			}
			else {
				$info->{globalmsg}->{failedbody} .= "$computer_hostname type= $computer_type offline\n";
			}
		}
		else {
			notify($ERRORS{'OK'}, 0, "node_status reports unknown value for $computer_hostname node_status_string= $node_status_string ");
		}
	}

	return 1;
} ## end sub process

#//////////////////////////////////////////////////////////////////////////////

=head2 _blade_investigator

 Parameters  : hash reference - the computertable entry of the blade, with
               the node_status and computer_currentimage_name keys that
               process() adds
 Returns     : 1,0
 Description : Provides additional checks for blade types. If the blade
               answers ping and ssh, the image recorded for it is compared
               with the image found on the node via _image_revision_check().

=cut

sub _blade_investigator {
	my ($self) = @_;

	my $retval                  = 0;
	my $computer_hostname       = $self->{computer}->{hostname};
	my $comp_imagename          = $self->{computer_currentimage_name};
	my $computer_id             = $self->{computer_id};
	my $nodestatus_status       = $self->{node_status}->{status};
	my $nodestatus_currentimage = $self->{node_status}->{currentimage};
	my $nodestatus_ping         = $self->{node_status}->{ping};
	my $nodestatus_ssh          = $self->{node_status}->{ssh};

	notify($ERRORS{'OK'}, 0, "computer_hostname= $computer_hostname node_status_status= $nodestatus_status");

	# If can ping and can ssh into it, compare loaded image with database imagename
	if ($nodestatus_ping && $nodestatus_ssh) {
		if (_image_revision_check($computer_id, $comp_imagename, $nodestatus_currentimage)) {
			notify($ERRORS{'OK'}, 0, "computer_hostname= $computer_hostname imagename updated");
			$retval = 1;
		}
	}
	else {
		notify($ERRORS{'OK'}, 0, "computer_hostname= $computer_hostname is confirmed down");
	}

	return $retval;
} ## end sub _blade_investigator

#//////////////////////////////////////////////////////////////////////////////

=head2 powerdown_event

 Parameters  : hash reference - the computertable entry of the computer
 Returns     : 1,0
 Description : Powers off a blade that is in the available, failed or
               maintenance state by loading its provisioning module and
               calling that module's power_off routine with the short host
               name. Every other type/state combination is skipped and 0 is
               returned.

=cut

sub powerdown_event {
	my ($self) = @_;

	my $provisioning_perl_package = $self->{computer}->{provisioning}->{module}->{perlpackage};
	my $computer_type             = $self->{computer}->{type};
	my $computer_state            = $self->{computer}->{state}->{name};
	my $computer_short_name       = _short_hostname($self->{computer}->{hostname});

	# If blade and available|failed|maintenance - simply power-off
	if (($computer_type =~ /blade/) && ($computer_state =~ /^(available|failed|maintenance)/)) {

		# The package name is placed in a string eval below; refuse anything
		# that is not a plain Perl package name
		if (!defined($provisioning_perl_package) || $provisioning_perl_package !~ /\A\w+(?:::\w+)*\z/) {
			notify($ERRORS{'WARNING'}, 0, "provisioning module package name is missing or invalid for $computer_short_name");
			notify($ERRORS{'OK'}, 0, "returning 0");
			return 0;
		}

		notify($ERRORS{'OK'}, 0, "calling provision module $provisioning_perl_package power_off routine $computer_short_name");

		eval "use $provisioning_perl_package";
		if ($EVAL_ERROR) {
			notify($ERRORS{'WARNING'}, 0, "$provisioning_perl_package module could not be loaded");
			notify($ERRORS{'OK'}, 0, "returning 0");
			return 0;
		}

		# Call Package::power_off as a plain function. Taking the reference
		# with \&{...} is permitted under strict refs.
		my $power_off        = \&{"${provisioning_perl_package}::power_off"};
		my $power_off_status = eval { $power_off->($computer_short_name) };
		if ($EVAL_ERROR) {
			notify($ERRORS{'WARNING'}, 0, "$provisioning_perl_package power_off routine failed for $computer_short_name, error: $EVAL_ERROR");
			return 0;
		}
		$power_off_status = 0 if !defined($power_off_status);

		notify($ERRORS{'OK'}, 0, "$power_off_status ");
		if ($power_off_status) {
			notify($ERRORS{'OK'}, 0, "SUCCESS powered_off $computer_short_name");
			return 1;
		}
		return 0;
	}
	else {
		notify($ERRORS{'OK'}, 0, "SKIPPING $computer_short_name computer_type= $computer_type in computer_state= $computer_state");
		return 0;
	}
} ## end sub powerdown_event

#//////////////////////////////////////////////////////////////////////////////

=head2 _virtualmachine_investigator

 Parameters  : hash reference - the computertable entry of the virtual
               machine, with the node_status and computer_currentimage_name
               keys that process() adds
 Returns     : 1,0
 Description : Provides additional checks for virtualmachine types. A
               powered off virtual machine is acceptable and returns 1. If
               the current image is known and ssh works, the result of
               _image_revision_check() is returned. Otherwise the node is
               reported down and 0 is returned.

=cut

sub _virtualmachine_investigator {
	my ($self) = @_;

	my $retval                  = 0;
	my $computer_hostname       = $self->{computer}->{hostname};
	my $comp_imagename          = $self->{computer_currentimage_name};
	my $computer_id             = $self->{computer_id};
	my $nodestatus_currentimage = $self->{node_status}->{currentimage};
	my $nodestatus_ssh          = $self->{node_status}->{ssh};
	my $nodestatus_vmstate      = $self->{node_status}->{vmstate};

	if ($nodestatus_vmstate =~ /off/) {
		# Ok for node to be off
		return 1;
	}

	if ($nodestatus_currentimage && $nodestatus_ssh) {
		if (_image_revision_check($computer_id, $comp_imagename, $nodestatus_currentimage)) {
			notify($ERRORS{'OK'}, 0, "computer_hostname= $computer_hostname imagename updated");
			$retval = 1;
		}
	}
	else {
		notify($ERRORS{'OK'}, 0, "computer_hostname= $computer_hostname is confirmed down nodestatus_vmstate= $nodestatus_vmstate nodestatus_ssh= $nodestatus_ssh");
	}

	return $retval;
} ## end sub _virtualmachine_investigator

#//////////////////////////////////////////////////////////////////////////////

=head2 _image_revision_check

 Parameters  : $computer_id, $comp_imagename, $nodestatus_currentimage
 Returns     : 1,0
 Description : Compares the image name recorded for the computer with the
               image name found on the node. Returns 1 if the recorded name
               contains the node's image name, or if the computer entry was
               successfully updated to the node's image name. Returns 0 if
               the node's image name is unknown or the update failed.

=cut

sub _image_revision_check {
	my ($computer_id, $comp_imagename, $nodestatus_currentimage) = @_;

	# Without a known image on the node there is nothing to compare or to
	# write back; never store an empty/0 image name for the computer
	if (!$nodestatus_currentimage) {
		notify($ERRORS{'OK'}, 0, "current image of node is unknown - unable to compare image revisions");
		return 0;
	}
	$comp_imagename = '' if !defined($comp_imagename);

	# retval is set to 0 only if update_computer_imagename fails
	my $retval = 1;

	# Literal substring comparison: image names must not be treated as regex patterns
	if (index($comp_imagename, $nodestatus_currentimage) < 0) {
		# Update computer entry
		if (update_computer_imagename($computer_id, $nodestatus_currentimage, 0)) {
			notify($ERRORS{'OK'}, 0, "updated computer_id currentimage $nodestatus_currentimage");
			$retval = 1;
		}
		else {
			# Failed to update computer image info
			notify($ERRORS{'OK'}, 0, "update_computer_imagename return 0");
			$retval = 0;
		}
	}
	else {
		notify($ERRORS{'OK'}, 0, " image revisions match - no update required");
	}

	return $retval;
} ## end sub _image_revision_check

#//////////////////////////////////////////////////////////////////////////////

=head2 send_report

 Parameters  : none (called as a method)
 Returns     : the return value of mail(); nothing if the management node
               information or its SYSADMIN_EMAIL value is not available
 Description : Appends the counters and the corrected/problem node lists
               collected by process() to the report body and mails the
               report to the sysadmin address of the management node.

=cut

sub send_report {
	my ($hck) = @_;

	my $management_node_info = get_management_node_info();
	if (!$management_node_info) {
		notify($ERRORS{'WARNING'}, 0, "unable to send report, management node information could not be retrieved");
		return;
	}

	my $sysadmin_email = $management_node_info->{SYSADMIN_EMAIL};
	if (!$sysadmin_email) {
		notify($ERRORS{'WARNING'}, 0, "unable to send report, management node information does not contain a SYSADMIN_EMAIL value");
		return;
	}

	# {MN} is only present if the caller stored it in the object; fall back
	# to the host name loaded by _initialize()
	my $mn_name = $hck->{MN};
	$mn_name = $hck->{managementnode}->{hostname} if !defined($mn_name);
	$mn_name = '' if !defined($mn_name);

	if (defined($hck->{computercount})) {
		$hck->{globalmsg}->{body} .= "Number of nodes found for this management node $mn_name: $hck->{computercount}\n";
	}
	if (defined($hck->{"computerschecked"})) {
		$hck->{globalmsg}->{body} .= "Number of nodes checked: $hck->{computerschecked}\n";
	}
	if (defined($hck->{"computersskipped"})) {
		$hck->{globalmsg}->{body} .= "Number of nodes skipped due to recent check: $hck->{computersskipped}\n";
	}
	if (defined($hck->{labnodesfailed})) {
		$hck->{globalmsg}->{body} .= "UNavailable labnodes: $hck->{labnodesfailed}\n";
	}
	if (defined($hck->{labnodesavailable})) {
		$hck->{globalmsg}->{body} .= "Available labnodes: $hck->{labnodesavailable}\n";
	}
	if (defined($hck->{globalmsg}->{correctedbody})) {
		$hck->{globalmsg}->{body} .= "\nCorrected VCL nodes:\n\n$hck->{globalmsg}->{correctedbody}\n";
	}
	if (defined($hck->{globalmsg}->{failedbody})) {
		$hck->{"globalmsg"}->{body} .= "\nProblem VCL nodes:\n\n$hck->{globalmsg}->{failedbody}\n";
	}
	if (!defined($hck->{globalmsg}->{failedbody}) && !defined($hck->{globalmsg}->{correctedbody})) {
		$hck->{globalmsg}->{body} .= "\nAll nodes report ok";
	}

	return mail($sysadmin_email, "VCL node monitoring report", "$hck->{globalmsg}->{body}");
} ## end sub send_report

#//////////////////////////////////////////////////////////////////////////////

1;
__END__

=head1 SEE ALSO

L<http://cwiki.apache.org/VCL/>

=cut