#!/usr/bin/env python
#
# Copyright Toby Sears 2012
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
"""
Nagios plugin to check the overall health of a system.

Uses SSH to connect to the remote server; requires that passwordless login
is enabled (SSH key authentication).

Checks the following:
 - 15 minute load (warning or critical)
 - Swap usage (warning or critical)
 - Usage % of all locally mounted partitions (warning or critical)
 - RAM usage % (warning only)
 - CPU usage % (just a "nice to know", no warnings)

Tested on Ubuntu 11.10, 10.04 and Debian 6 running Python 2.7.2+

Note: if the remote host cannot be reached (for example because the
hostname is wrong) or returns unusable data, the plugin reports UNKNOWN.
"""

# sys is imported on its own so that the failure handler below can always
# call sys.exit(), whichever of the other imports is the one that fails.
import sys

try:
    import getopt
    import subprocess
except ImportError:
    print("UNKNOWN: import failed.")
    print("Tested on Python 2.7.2, please make sure you have this version or newer")
    sys.exit(3)

# Nagios plugin exit states.
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

# Module-level state shared between main() and the get_*/check_* functions.
exit_message = ""
username = "root"
hostname = None


class RemoteCommandError(ValueError):
    """Raised when a remote command fails or returns unusable output.

    Subclasses ValueError because that is what a failed remote read used to
    surface as (a failed float()/int() conversion), so existing handlers
    that catch ValueError keep working.
    """


#
# Main function
# - Set default variables
# - Retrieve user input
# - Run through functions in correct order
#
def main():
    """Parse the command line, run every check and exit with a Nagios state."""
    global exit_message
    global username
    global hostname

    exit_message = ""

    # Default variables
    #----------------------------------
    username = "root"
    load_warning = 8.00
    load_critical = 10.00
    disk_warning = float(80)
    disk_critical = float(95)
    ram_warning = float(95)
    swap_warning = 102400
    swap_critical = 204800
    #----------------------------------

    # Check user input
    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   "hH:U:l:L:d:D:r:s:S:",
                                   ["help",
                                    "hostname=",
                                    "username=",
                                    "loadwarn=",
                                    "loadcrit=",
                                    "diskwarn=",
                                    "diskcrit=",
                                    "ramwarn=",
                                    "swapwarn=",
                                    "swapcrit="])
    except getopt.GetoptError as err:
        print(str(err))
        # A bad invocation must not be reported to Nagios as OK.
        useage(STATE_UNKNOWN)

    try:
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                useage()
            elif opt in ("-H", "--hostname"):
                hostname = arg
            elif opt in ("-U", "--username"):
                username = arg
            elif opt in ("-l", "--loadwarn"):
                load_warning = float(arg)
            elif opt in ("-L", "--loadcrit"):
                load_critical = float(arg)
            elif opt in ("-d", "--diskwarn"):
                disk_warning = float(arg)
            elif opt in ("-D", "--diskcrit"):
                disk_critical = float(arg)
            elif opt in ("-r", "--ramwarn"):
                ram_warning = float(arg)
            elif opt in ("-s", "--swapwarn"):
                swap_warning = int(arg)
            elif opt in ("-S", "--swapcrit"):
                swap_critical = int(arg)
            else:
                print("Unexpected option")
                useage(STATE_UNKNOWN)
    except ValueError:
        # A non-numeric threshold used to end in a traceback (exit code 1,
        # which Nagios reads as WARNING).
        print("Invalid value for option " + opt + ": " + arg)
        useage(STATE_UNKNOWN)

    # Check for hostname variable
    if not hostname:
        print("============================\nHostname is required\n============================")
        useage(STATE_UNKNOWN)

    # Check all the required resource levels,
    # assign their error codes to a variable
    try:
        load_code = check_load(load_warning, load_critical)
        ram_code = check_ram(ram_warning)
        check_cpu()
        swap_code = check_swap(swap_warning, swap_critical)
        disk_code = check_disk(disk_warning, disk_critical)
    except RemoteCommandError as err:
        # Without data from the remote host nothing can be said about it.
        end("UNKNOWN: " + str(err), STATE_UNKNOWN)

    # Get those values checked
    exit_code = code_check(load_code, ram_code, disk_code, swap_code)

    # And finish the script
    end(exit_message, exit_code)


def _run_remote(remote_command):
    """Run remote_command on the configured host over SSH and return stdout.

    The command is passed to ssh as an argument list, so the user and host
    names are never interpreted by a local shell.

    Raises RemoteCommandError if ssh cannot be started or exits non-zero.
    """
    target = "%s@%s" % (username, hostname)
    if target.startswith("-"):
        # Would otherwise be parsed by ssh as a command-line option.
        raise RemoteCommandError("invalid SSH target: " + target)
    try:
        proc = subprocess.Popen(["ssh", "-q", target, remote_command],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                universal_newlines=True)
        output, errors = proc.communicate()
    except OSError as err:
        raise RemoteCommandError("unable to run ssh: " + str(err))
    if proc.returncode != 0:
        detail = errors.strip()
        message = "'%s' on %s failed with exit status %d" % (
            remote_command, target, proc.returncode)
        if detail:
            message += " (" + detail + ")"
        raise RemoteCommandError(message)
    return output


def _sum_ps_column(ps_output, index):
    """Return the sum of one numeric column (0-based index) of `ps aux` output."""
    total = 0.0
    for line in ps_output.splitlines():
        fields = line.split()
        if len(fields) <= index:
            continue
        try:
            total += float(fields[index])
        except ValueError:
            # Header row ("%CPU" / "%MEM"): contributes nothing.
            continue
    return total


#
# GET Functions - SSH into the remote server and run the command, returns the result
#
def get_load():
    """Return the remote 15 minute load average as a float.

    Raises RemoteCommandError if /proc/loadavg cannot be read or parsed.
    """
    fields = _run_remote("cat /proc/loadavg").split()
    try:
        # The third field of /proc/loadavg is the 15 minute average.
        return float(fields[2])
    except (IndexError, ValueError):
        raise RemoteCommandError("unexpected /proc/loadavg contents from "
                                 + str(hostname))


def get_ram():
    """Return the summed %MEM column of the remote `ps aux` as a float."""
    total = _sum_ps_column(_run_remote("ps aux"), 3)
    # Six significant digits, matching the awk output this used to parse.
    return float("%.6g" % total)


def get_cpu():
    """Return the summed %CPU column of the remote `ps aux` as a string."""
    total = _sum_ps_column(_run_remote("ps aux"), 2)
    return "%.6g" % total


def get_swap():
    """Return the swap in use on the remote host, in kB, as an int.

    Sums the "Used" column over every swap area listed in /proc/swaps and
    returns 0 when the host has no swap configured (header line only).

    Raises RemoteCommandError if /proc/swaps cannot be read or parsed.
    """
    lines = _run_remote("cat /proc/swaps").splitlines()
    used = 0
    for line in lines[1:]:  # the first line is the column header
        fields = line.split()
        if not fields:
            continue
        try:
            used += int(fields[3])
        except (IndexError, ValueError):
            raise RemoteCommandError("unexpected /proc/swaps contents from "
                                     + str(hostname))
    return used


#
# CHECK functions
# - Check the returned values against either default or user inputted values
# - Return the appropriate error code
# - Edit the exit_message string appropriately
#

#CPU Check (Note, this is only a nice to know, so returns no error code)
def check_cpu():
    """Append the remote CPU usage to exit_message; never raises an alert."""
    global exit_message
    cpu = get_cpu()
    exit_message += " CPU: " + str(cpu) + "% \t"


#Ram check
def check_ram(ram_warn):
    """Compare remote RAM usage % with ram_warn; return 1 (warning) or 0."""
    global exit_message
    ram = get_ram()
    if ram >= ram_warn:
        exit_message += "WARNING: RAM: " + str(ram) + "% \t"
        return STATE_WARNING
    exit_message += "RAM: " + str(ram) + "% \t"
    return STATE_OK


#Load check
def check_load(load_warn, load_crit):
    """Compare the remote 15 minute load with the thresholds.

    Returns 2 when the load is at or above load_crit, 1 when it is at or
    above load_warn, otherwise 0. The result is appended to exit_message.
    """
    global exit_message
    load = get_load()
    if load >= load_crit:
        exit_message += "CRITICAL: Load: " + str(load) + " \t"
        return STATE_CRITICAL
    if load >= load_warn:
        exit_message += "WARNING: Load: " + str(load) + " \t"
        return STATE_WARNING
    exit_message += "Load: " + str(load) + " \t"
    return STATE_OK


#Swap check
def check_swap(swap_warn, swap_crit):
    """Compare remote swap usage (kB) with the thresholds.

    Returns 2 when usage is at or above swap_crit, 1 when it is at or above
    swap_warn, otherwise 0. Nothing is appended to exit_message when swap
    usage is below both thresholds.
    """
    global exit_message
    swap_used = get_swap()
    if swap_used >= swap_crit:
        exit_message += ("CRITICAL: SWAP over threshold ("
                         + str(swap_used) + "kb) \t")
        return STATE_CRITICAL
    if swap_used >= swap_warn:
        exit_message += ("WARNING: SWAP being used but under critical threshold ("
                         + str(swap_used) + "kb) \t")
        return STATE_WARNING
    return STATE_OK


def _parse_df(df_output):
    """Turn `df -Pl` output into a list of (percent_used, mount_point) pairs.

    Raises ValueError if the output is empty or a row cannot be parsed.
    """
    lines = df_output.splitlines()
    if not lines:
        raise ValueError("empty df output")
    usage = []
    for line in lines[1:]:  # the first line is the column header
        if not line.strip():
            continue
        # Split into at most six fields so that a mount point containing
        # spaces stays in one piece as the last field.
        fields = line.split(None, 5)
        if len(fields) < 6:
            raise ValueError("unexpected df line: " + line)
        usage.append((int(fields[4].rstrip("%")), fields[5]))
    return usage


#
# Drive check function
# - SSH into remote system and retrieve the list of current filesystems
# - Split returned data into appropriate values and check
#
def check_disk(disk_warn, disk_crit):
    """Compare the usage % of every local remote filesystem with the thresholds.

    Returns 2 if any mount point is at or above disk_crit, 1 if any is at or
    above disk_warn, 0 if all are below, and 3 if the filesystem list could
    not be retrieved or parsed. Offending mount points are appended to
    exit_message.
    """
    global exit_message
    try:
        usage = _parse_df(_run_remote("df -Pl"))
    except ValueError:
        # RemoteCommandError is a ValueError too, so ssh failures land here.
        exit_message += " UNKNOWN: Drive Check Failed \t"
        return STATE_UNKNOWN

    disk_code = STATE_OK
    for percent, mount_point in usage:
        if percent >= disk_crit:
            exit_message += ("CRITICAL: Mount point " + str(mount_point)
                             + " is at " + str(percent) + "% \t")
            disk_code = STATE_CRITICAL
        elif percent >= disk_warn:
            exit_message += ("WARNING: Mount point " + str(mount_point)
                             + " is at " + str(percent) + "% \t")
            disk_code = max(disk_code, STATE_WARNING)
    return disk_code


# Function to check error codes and return a single value
def code_check(load, ram, disk, swap):
    """Combine the individual check states into one Nagios exit state.

    Precedence is CRITICAL (2), then UNKNOWN (3), then WARNING (1), then
    OK (0). UNKNOWN outranks WARNING because a check that could not run may
    be hiding a worse condition; previously an UNKNOWN state was ignored
    and could be reported as OK.
    """
    codes = (load, ram, disk, swap)
    for state in (STATE_CRITICAL, STATE_UNKNOWN, STATE_WARNING):
        if state in codes:
            return state
    return STATE_OK


def useage(exit_code=0):
    """Print the help text and exit.

    exit_code is the process exit status: 0 for an explicit help request
    (the default), 3 (UNKNOWN) when the plugin was invoked incorrectly.
    """
    text = """
Nagios system check script.

Options:
  -h or --help     : print this message and exit.
  -H or --hostname : the hostname of the remote system
  -U or --username : the username to use
  -l or --loadwarn : the load warning level (x.xx)
  -L or --loadcrit : the load critical level (x.xx)
  -d or --diskwarn : the disk warning level % (xx.x)
  -D or --diskcrit : the disk critical level % (x.x)
  -r or --ramwarn  : the ram warning level % (x.x)
  -s or --swapwarn : the swap warning level (in kb)
  -S or --swapcrit : the swap critical level (in kb)

Hostname MUST be defined. Other defaults are:
  username       root
  load warning   8.00
  load critical  10.00
  disk warning   80%
  disk critical  95%
  ram warning    95%
  swap warning   102400kb
  swap critical  204800kb

cpu % monitoring is just a nice to know and will not trigger alarms.
"""
    print(text)
    sys.exit(exit_code)


# End the script and return both message and error code
def end(message, exit_code):
    """Print message and terminate the process with exit_code."""
    print(message)
    sys.exit(exit_code)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("Control-C pressed...exiting")
        sys.exit(2)