#!/usr/bin/perl

=head1 check_nrpe_multiplexor

Written by Martin Houston

This command runs a specified nrpe service check on a number of hosts.
If ANY host returns OK then this test is OK, with extra status text showing
which hosts are in each result code group for this service.  It is not even
worth a warning, just informational, as the normal state may be that only one
instance should be 'hot' at any one time anyway.

Only if there are no OK examples is the return code of the test as a whole
either WARNING or CRITICAL.  Again, a service in WARNING state wins over
CRITICAL because we may still have a viable service.

Note that this is the exact opposite of the way that escalation usually
works, where one element of a test failing bumps the failure state for it all.

We also have a "conditional and" feature: a second nrpe command/arg set that
is applied to any systems in the OK state.  This allows tests to be
constructed that cause a second test to only be attempted e.g. if a
particular service is running.  When the second command is run, its results
(combined with the same OK > WARNING > CRITICAL precedence) decide the
overall status of this check.

If no host produced an OK, WARNING or CRITICAL result the check as a whole
is UNKNOWN.

=cut

use strict;
use warnings;
use Getopt::Long;

# Nagios plugin exit codes keyed by status name.
my %RETCODES = ('OK' => 0, 'WARNING' => 1, 'CRITICAL' => 2, 'UNKNOWN' => 3);

# Status names indexed by plugin exit code (the inverse of %RETCODES).
my @STATUS_NAMES = ('OK', 'WARNING', 'CRITICAL', 'UNKNOWN');

# Text that introduces each result bucket in the status line, indexed by
# exit code.  The spacing and the historic "UNDEFINED" label for bucket 3
# are kept exactly as this plugin has always printed them.
my @BUCKET_LABELS = ('OK=', ' WARNING=', ' CRITICAL=', ' UNDEFINED=');

# Location of the check_nrpe client used for every remote check.
my $CHECK_NRPE = '/usr/local/nagios/libexec/check_nrpe';

# Print the usage text and exit.  Used both for -h/--help and for invalid
# invocations; exits UNKNOWN (3) so that a misconfigured check is not
# reported to Nagios as a WARNING.
sub help {
    print "Usage:check_nrpe_multiplexor -H host (can be multiple) -c command -t timeout -a argstring [-h]\n\n";
    print "Options :\n";
    print "-H, --host should be repeated -H host1 -H host2 (or this is pointless) - or one -H and a comma separated list of hosts is allowed\n";
    print "-c, --command to send to remote nrpe\n";
    print "-t, --timeout to use\n";
    print "-a, --args to pass to remote nrpe (same for all hosts)\n";
    print "-C, --candcommand to send to remote nrpe for where the main command works\n";
    print "-A, --candargs to pass to remote nrpe with --candcommand\n";
    print "-h, --help - print this help screen.\n";
    print "\nExample : check_nrpe_multiplexor -H host1 -H host2 -c check_procs -t 3 -a 'some args'\n";
    print "Fails if only give one host, returns OK if ANY host is OK\n";
    exit($RETCODES{'UNKNOWN'});
}

# Convert a raw wait status (the value of $? after a child has been reaped)
# into a plugin exit code in the range 0..3.
#   $status - raw wait status; -1 or undef means the child could not be run.
# Returns the child's exit code when it is 0..3, otherwise 3 (UNKNOWN); a
# child killed by a signal is also UNKNOWN.
sub exit_code_from_status {
    my ($status) = @_;
    return $RETCODES{'UNKNOWN'} if !defined $status || $status < 0;
    return $RETCODES{'UNKNOWN'} if $status & 127;    # terminated by a signal
    my $code = $status >> 8;                         # real exit code is the high byte
    return ($code >= 0 && $code <= 3) ? $code : $RETCODES{'UNKNOWN'};
}

# Run one nrpe check against one host.
#   $host, $timeout, $command - passed to check_nrpe as -H, -t and -c.
#   $args - passed as the single -a argument (empty string when undefined).
# Returns (exit code 0..3, output with its trailing newline removed).
# The command is executed without a shell, so option values cannot be
# reinterpreted as shell syntax.
sub run_nrpe {
    my ($host, $timeout, $command, $args) = @_;
    $args = '' unless defined $args;
    my @cmd = ($CHECK_NRPE, '-H', $host, '-n', '-t', $timeout,
               '-c', $command, '-a', $args);

    my $pid = open(my $pipe, '-|', @cmd);
    return ($RETCODES{'UNKNOWN'}, '') unless $pid;    # could not start check_nrpe

    my $output = do { local $/; <$pipe> };
    $output = '' unless defined $output;
    chomp $output;    # keep the final status message on a single line

    # close() on a pipe waits for the child and sets $?; a false return here
    # only reflects a non-zero exit, which is expected for failing checks.
    close $pipe;
    return (exit_code_from_status($?), $output);
}

# Run the same check on every host and bucket the results by exit code.
#   $hosts - array ref of host names.
# Returns an array ref of four buckets (index = exit code 0..3), each a list
# of [host, output] pairs in the order the hosts were checked.
sub collect_results {
    my ($hosts, $timeout, $command, $args) = @_;
    my @matrix = ([], [], [], []);
    for my $host (@{$hosts}) {
        my ($code, $output) = run_nrpe($host, $timeout, $command, $args);
        push @{ $matrix[$code] }, [$host, $output];
    }
    return \@matrix;
}

# Pick the overall status for a result matrix: the best state that any host
# reached wins (OK, then WARNING, then CRITICAL).  With no such result the
# status is UNKNOWN.
sub overall_status {
    my ($matrix) = @_;
    for my $code (0 .. 2) {
        return $STATUS_NAMES[$code] if @{ $matrix->[$code] };
    }
    return 'UNKNOWN';
}

# Render the non-empty buckets of a result matrix as status text.
#   $field - which element of each [host, output] pair to list
#            (0 = host name, 1 = check output).
sub format_buckets {
    my ($matrix, $field) = @_;
    my $text = '';
    for my $code (0 .. 3) {
        my @entries = @{ $matrix->[$code] };
        next unless @entries;
        $text .= $BUCKET_LABELS[$code]
               . join(',', map { $_->[$field] } @entries) . ' ';
    }
    return $text;
}

# Parse the command line, run the check(s), print the status line and return
# the plugin exit code.
sub main {
    my (@hosts, $command, $args, $candcommand, $candargs);
    my $timeout = 5;    # default if not set

    # Option names differ only by case (-c/-C, -a/-A, -h/-H).  Getopt::Long
    # ignores case by default, which would make each pair collide.
    Getopt::Long::Configure('no_ignore_case');
    GetOptions("help|h"        => \&help,
               "host|H=s"      => \@hosts,
               "command|c=s"   => \$command,
               "timeout|t=i"   => \$timeout,
               "args|a=s"      => \$args,
               "candcommand|C=s" => \$candcommand,
               "candargs|A=s"  => \$candargs) or help();

    # Expand any comma separated host lists, dropping empty entries.
    @hosts = grep { length } map { split(/,/, $_) } @hosts;
    help() unless scalar(@hosts) > 1 && $command;

    $args     = '' unless defined $args;
    $candargs = '' unless defined $candargs;

    my $primary     = collect_results(\@hosts, $timeout, $command, $args);
    my $main_result = overall_status($primary);
    my @good        = map { $_->[0] } @{ $primary->[0] };
    my $goodhosts   = join(',', @good);
    my $resstring   = "$command $args " . format_buckets($primary, 0);

    if (defined $candcommand && @good) {
        # Second layer: only hosts that passed the main check are tried, and
        # the outcome of this pass becomes the overall result.
        my $secondary = collect_results(\@good, $timeout, $candcommand, $candargs);
        $main_result = overall_status($secondary);
        $resstring   = "$candcommand $candargs " . format_buckets($secondary, 1);
    }

    # for an extra point could provide stats info in the form
    # hostname=0|1|2|3 so we can trend which host is the working one, but
    # that would get very difficult for a 2 stage conditional!
    print "$main_result - $goodhosts $resstring\n";
    return $RETCODES{$main_result};
}

# Run as a script; stay inert when loaded with require (e.g. by tests).
exit(main()) unless caller;

1;