#!/usr/bin/python
# check_ipmi.py for ipmi queries in Nagios
# Copyright (C) 2010 Garrett McGrath
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
# This script will allow you to query the status of one or more sensors connected to
# an IPMI board. It utilizes and requires the FreeIPMI package as it utilizes the 'ipmi-sensors' command.
#
# To utilize this command you will need to have a sensor map for each type of hardware you wish to monitor.
# This sensor map can be retrieved by running the command:
#     "ipmi-sensors -h <hostname> -u <username> -p <password>"
# This will dump a list similar to the following into your console:
# (this is a partial sensor list for a Sunfire X2200)
#   640: CPU 0 Temp (Temperature): 40.00 C (NA/95.00): [OK]
#   704: CPU 1 Temp (Temperature): 41.00 C (NA/95.00): [OK]
#   768: Ambient Temp0 (Temperature): 36.00 C (NA/75.00): [OK]
#   832: Ambient Temp1 (Temperature): 41.00 C (NA/75.00): [OK]
#   1632: CPU0 DIMM0 (Memory): [OK]
#   1680: CPU0 DIMM1 (Memory): [OK]
#   1728: CPU0 DIMM2 (Memory): [OK]
#   2400: POST Error (System Firmware): [Unknown]
#   2448: Eventlog (Event Logging Disabled): [OK]
#   2496: System Event (System Event): [OK]
#   2544: Critical INT (Critical Interrupt): [OK]
#   2592: Watchdog (Watchdog 2): [OK]
#
# Usage:
#     check_ipmi -n <hostname> -u <username> -p <password> -s <sensor ids>
#
# You can include the optional argument '-e' with a comma separated list of expected values.
# (blanks default to looking for OK)
# This is useful for instances where 'OK' is not the typical return statement,
# as nagios has no way to translate but will happily take error codes.
#
import os
import sys
from optparse import OptionParser

try:
    # Python 2: the modules this script was originally written against.
    from commands import getoutput
    from pipes import quote
except ImportError:
    # Python 3: the same helpers, relocated within the standard library.
    from subprocess import getoutput
    from shlex import quote

# Module-wide debug switch. main() overwrites it from -d/--debug; the default
# keeps the helper functions usable when they are called without main().
debug = False


### Main Procedure ###
# This is the main execution path, this is a procedural program, it runs, it quits, no loop.
def main(argv):
    """Parse the command line, run 'ipmi-sensors' and print one line per sensor.

    argv is the argument list without the program name (sys.argv[1:]).

    Returns the value used as the process exit status:
      0 - every requested sensor matched its expected status
      1 - no sensor list was given, or process_output() raised a warning
      2 - bad arguments, 'ipmi-sensors' reported a problem, or
          process_output() raised an error
    """
    # build usage/help pane
    usage = ("usage: %prog -n <hostname> -u <username> -p <password> "
             "-s <sensor ids> [options]")
    version = "1.0"
    prog = "check_ipmi"
    parser = OptionParser(prog=prog, usage=usage, version=version)

    # can't use -h for hostname, optparse detects it as the help command.
    parser.add_option('-n', '--hostname', dest='hostname', action="store",
                      default='127.0.0.1', type='string',
                      help='IP or Hostname of target machine, assumes %default if none is provided.')
    # IPMI username and password
    parser.add_option('-u', '--username', dest='username', action="store",
                      default='ADMIN', type='string',
                      help='Username of IPMI Interface.')
    parser.add_option('-p', '--password', dest='password', action="store",
                      default='ADMIN', type='string',
                      help='Password of IPMI Interface.')
    # list of comma separated sensor ID numbers
    parser.add_option('-s', '--sensors', dest='sensor_ids', action="store",
                      default='', type='string',
                      help='Comma seperated list of sensor ID values')
    # list of comma separated strings; the default is an immutable '' so that
    # nothing ever appends to an object owned by the parser.
    parser.add_option('-e', '--expected', dest='expected_output', action="store",
                      default='', type='string',
                      help='(optional) Comma seperated list of expected results for none standard sensor returns\n'
                           ' ie. ([Processor Presence detected],[Processor Presence detected],,[Unknown]). \n'
                           ' Empty spaces will default to "[OK]". '
                           'Must be one entry for each sensor if any are provided.')
    # turn on debug mode
    parser.add_option('-d', '--debug', dest='debug', action='store_true',
                      default=False, help='Turn on debug output')

    # parse the arguments of argv
    options, _ = parser.parse_args(argv)

    global debug
    debug = options.debug

    if options.sensor_ids == '':
        print("Must include a list of sensors you'd like to use")
        return 1

    # convert the comma separated string into a list of integers.
    try:
        sensor_ids = [int(raw_id) for raw_id in options.sensor_ids.split(',')]
    except ValueError:
        print("Sensor IDs must be a comma separated list of integers, got: "
              + options.sensor_ids)
        return 2

    # capture expected_output, or generate if not provided.
    if options.expected_output:
        expected_output = options.expected_output.split(',')
    else:
        # assume all comparisons against '[OK]': one '' entry per sensor.
        expected_output = [''] * len(sensor_ids)

    if len(sensor_ids) != len(expected_output):
        # mismatch between the number of sensor id's and expected results.
        print("You must provide a value or blank for each sensor requested. Alternatively leave -e out.")
        return 2

    # build ipmi-sensors command. The command goes through a shell, so every
    # user supplied value is quoted; the sensor list is rebuilt from the
    # validated integers rather than passed through verbatim.
    sensor_command = ' '.join([
        'ipmi-sensors',
        '-h', quote(options.hostname),
        '-u', quote(options.username),
        '-p', quote(options.password),
        '-s', ','.join([str(sensor_id) for sensor_id in sensor_ids]),
    ])
    if debug:
        print("command run: " + sensor_command)

    # run ipmi-sensors request; splitlines() gives [] if there is no output.
    sensor_output = getoutput(sensor_command).splitlines()
    if debug:
        print("sensor_ouput: ")
        print(sensor_output)

    # if this check fails, error out
    sensor_output_error, message = check_sensor_output(sensor_output)
    if sensor_output_error:
        if debug:
            print("output_error: " + message)
        print(message)
        return 2

    # pair every id with its expected value and sort by id, because
    # process_output() walks the command output in ascending id order.
    sensor_struct = sorted(zip(sensor_ids, expected_output))

    return_value, nagios_feed = process_output(sensor_output, sensor_struct)

    for feed_line in nagios_feed:
        print(feed_line)

    if debug:
        print("return: ")
        print(return_value)
    return return_value


def check_sensor_output(sensor_output):
    """Run quick sanity checks on the raw 'ipmi-sensors' output lines.

    Returns (failed, message): failed is True when there is no output at all
    or when the first line carries an 'ipmi-sensors:' or
    'ipmi_open_outofband:' marker, in which case message describes the
    problem. Otherwise returns (False, '').
    """
    if not sensor_output:
        return True, "no sensor output recieved"
    first_line = sensor_output[0]
    if 'ipmi-sensors:' in first_line or 'ipmi_open_outofband:' in first_line:
        return True, first_line
    return False, ''


def _leading_sensor_id(line):
    """Return the integer before the first ':' of line, or None if it is not a number."""
    try:
        return int(line.split(':', 1)[0].strip())
    except ValueError:
        return None


#### Process_output ####
# This function processes the sensor output, and confirms that for each sensor requested, one was returned.
def process_output(sensor_output, sensor_struct):
    """Match each requested sensor against the output and build the report.

    sensor_output is the list of 'ipmi-sensors' output lines; matched lines
    are popped from the front, so the list is modified in place.
    sensor_struct is a list of (sensor_id, expected_status) tuples sorted by
    id; an expected_status of '' means '[OK]'.

    Returns (return_val, nagios_feed): return_val is 2 if any sensor was in
    error, else 1 if any sensor raised a warning or had no output line, else
    0. nagios_feed holds one report string per requested sensor.
    """
    warning = False
    error = False
    nagios_feed = []

    for sensor_id, expected_output in sensor_struct:
        # The requested id is always compared against the line at position 0.
        # Check for an exhausted list first so it cannot raise IndexError.
        if not sensor_output or _leading_sensor_id(sensor_output[0]) != sensor_id:
            # no performance info for a missing sensor
            nagios_feed.append("sensor " + str(sensor_id) + " has no return value")
            warning = True
            continue

        # the ids match: consume the line so the next sensor sees its own.
        sensor = sensor_output.pop(0).split(':')
        if len(sensor) < 3:
            # the id is present but the line carries no status field.
            nagios_feed.append("sensor " + str(sensor_id) + " has no return value")
            warning = True
            continue

        # strip junk info like the '(Temperature)' sensor type
        sensor_name = remove_IPMI_units(sensor[1])

        # check for the availability of performance data.
        if len(sensor) == 4:
            sensor_performance = generate_nagios_perfdata(sensor[2])
            sensor_status = check_sensor_status(sensor[3], expected_output)
            if debug:
                print('Perfdata available for ' + sensor_name + ' (' + sensor_performance + ')')
        else:
            # if no perf data available, just grab the sensor status.
            sensor_performance = ""
            sensor_status = check_sensor_status(sensor[2], expected_output)
            if debug:
                print('Perfdata not available for ' + sensor_name)

        # set warning or error flags if they are relevant.
        if sensor_status == 1:
            warning = True
        elif sensor_status == 2:
            error = True

        nagios_feed.append(build_feed_string(sensor_name, sensor_status, sensor_performance))

    # an error outranks a warning
    if error:
        return_val = 2
    elif warning:
        return_val = 1
    else:
        return_val = 0
    return return_val, nagios_feed


#### build feed string ####
# Simple builds a string based on the sensor name, status and available performance data.
def build_feed_string(sensor_name, sensor_status, sensor_performance):
    """Return the report line for one sensor.

    sensor_status 0 reads as OK, 1 as WARNING and anything else as ERROR.
    A non-empty sensor_performance is appended after ' | '.
    """
    if sensor_status == 0:
        feed = sensor_name + " is OK"
    elif sensor_status == 1:
        feed = sensor_name + " has generated a WARNING"
    else:
        feed = sensor_name + " has generated an ERROR"

    if sensor_performance != "":
        feed += " | " + sensor_performance
    return feed


#### remove ipmi units ####
# strips out junk 'units' in the device name string.
def remove_IPMI_units(device_string):
    """Return the device name: the text before the first '(', stripped.

    If there is no '(' the whole string is returned, stripped of surrounding
    whitespace.
    """
    return device_string.split('(', 1)[0].strip()


def _debug_enabled():
    """Return the module-level 'debug' flag, or False if it was never set.

    'debug' is a global assigned by main(); looking it up defensively keeps
    the helpers below usable when they are called without main() having run.
    """
    return bool(globals().get('debug', False))


#### check sensor status against expected ####
# adjust for empty's by testing for 'OK'
def check_sensor_status(sensor_status, expected_status):
    """Compare a reported sensor status with the expected one.

    With an empty expected_status the sensor is compared against '[OK]':
    '[OK]' returns 0, '[NA]' returns 1 (warning, unknown status) and anything
    else returns 2 (error). With an explicit expected_status the result is 0
    on an exact match (after stripping whitespace) and 2 otherwise.
    """
    # multiple paths lead to this point, so strip here.
    sensor_status = sensor_status.strip()
    show_debug = _debug_enabled()

    if expected_status == '':
        if show_debug:
            print(sensor_status + " : [OK]")
        if sensor_status == '[OK]':
            return 0  # sensor status ok
        if sensor_status == '[NA]':
            return 1  # generate warning, unknown status
        return 2  # sensor status error

    expected_status = expected_status.strip()
    if show_debug:
        print(sensor_status + " : " + expected_status)
    if sensor_status == expected_status:
        return 0  # sensor status ok
    return 2  # sensor status error


#### generate nagios performance data ####
# turn any available IPMI performance information into a format better suited to nagios
# layout as: cur val;min val;max val from: 41.00 C (NA/75.00) type format
def generate_nagios_perfdata(raw_perf_data):
    """Convert a reading such as '41.00 C (NA/75.00)' into '41.00;NA;75.00'.

    The first whitespace separated token is the current value; the last
    token, once its surrounding '(', ')' and ',' are removed, is split on '/'
    into the two range values. A reading without such a range yields empty
    range fields ('41.00;;') and an empty reading yields ''.
    """
    raw_perf_data = raw_perf_data.strip()
    show_debug = _debug_enabled()
    if show_debug:
        print('raw perfdat: ' + raw_perf_data)

    tokens = raw_perf_data.split()
    if not tokens:
        # nothing to report; callers treat '' as "no performance data".
        return ''

    perf_value = tokens[0]
    range_low = ''
    range_high = ''
    perf_range = tokens[-1].strip('(,)').split('/')
    if len(perf_range) >= 2:
        # only a token of the form low/high carries a range.
        range_low = perf_range[0]
        range_high = perf_range[1]

    performance = perf_value + ';' + range_low + ';' + range_high
    if show_debug:
        print('performance: ' + performance)
    return performance


# program 'main' primer.
if __name__ == "__main__":
    # this causes the return of the main function to be the return value of the script.
    sys.exit(main(sys.argv[1:]))