#!/usr/bin/python
# Daniel Helgenberger, m box bewegtbild GmbH, 2013
#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
#
# NOTE: every print below is a single-argument print(...) call on purpose, so
# the file parses and prints identically under Python 2.6+ and Python 3.

__author__ = 'Daniel Helgenberger '
__version__ = '0.6'
__plugin_name__ = 'check_iferror.py'

import binascii
import os
import sys
import time

# Nagios plugin exit codes. Kept at module level (not under __main__) because
# the helper functions below reference them.
exit_ok = 0
exit_warn = 1
exit_crit = 2
exit_unkn = 3

# Seconds per hour; --time is given in hours.
epoch_multipl = 3600

# OID presets: the error counter / interface state OIDs are assembled as
# prefix + (optional --oid infix) + suffix + interface number.
presets = {
    'qlogicfc': {
        'eprefix': '.1.3.6.1.3.94.4.5.1.3.16.0.0.192.221',
        'esuffix': '.0.0.0.0.0.0.0.0',
        'ifprefix': '.1.3.6.1.3.94.1.10.1.23.16.0.0.192.221',
        'ifsuffix': '.0.0.0.0.0.0.0.0',
    },
}


def _say(*parts):
    """Print all parts separated by single spaces (like a py2 print statement)."""
    print(' '.join(str(part) for part in parts))


def parse_args(argv=None):
    """Build the command line parser and parse the arguments.

    argv: optional list of argument strings; None (the default) makes argparse
          read sys.argv[1:], which is what the script entry point relies on.
    Returns the argparse namespace. Exits with the UNKNOWN code if the
    argparse library cannot be imported.
    """
    try:
        import argparse
    except ImportError:
        print('Error importing library python-argparse')
        gen_exitcode(exit_unkn)

    parser = argparse.ArgumentParser(
        prog=__plugin_name__,
        description='Nagios plugin, intended to check error count OIDs via SNMP. '
                    'A non-zero exit code is generated, if the counter delta exceeds the '
                    'warning / critical values in the evaluation time period. '
                    'Additionally the plugin checks for the interface state if arguments '
                    '--ifoid and --ifupstate are supplied. The plugin will exit '
                    'critical/warning if the interface is down. '
                    'Recommendation: Use longer check_intervals with a low recheck count.',
        epilog='Currently only SNMPv1 and plain OIDs are supported. '
               'Needs python > 2.6, pysnmp > 4.2.5, python-argparse. '
               'This program is free software: you can redistribute it and/or modify '
               'it under the terms of the GNU General Public License as published by '
               'the Free Software Foundation, either version 3 of the License, or '
               '(at your option) any later version. Author: ' + __author__)
    parser.add_argument('-H', '--host', help='SNMP Agent IP / hostname',
                        default='localhost', required=True)
    parser.add_argument('-C', '--community', help='Read community name (default: "public")',
                        default='public')
    parser.add_argument('-o', '--oid', help='Error count base OID', type=str, default='')
    parser.add_argument('-O', '--ifoid', help='Interface base OID', type=str, default='')
    parser.add_argument('-S', '--ifupstate',
                        help='Expected interface operational state (integer)', type=int)
    parser.add_argument('--warnifdown',
                        help='Exit with warning instead of critical if interface is down',
                        action='store_true')
    parser.add_argument('--okifdown',
                        help='Exit with ok instead of critical if interface is down',
                        action='store_true')
    parser.add_argument('-i', '--interface',
                        help='Interface number. Appended to the base OID', type=int, default=1)
    parser.add_argument('--preset', help='Use OID preset', type=str)
    parser.add_argument('-P', '--port', help='SNMP port', type=int, default=161)
    parser.add_argument('-w', '--warning', help='Warning threshold (Count)', type=int, default=1)
    parser.add_argument('-c', '--critical', help='Critical threshold (Count)', type=int,
                        default=2)
    parser.add_argument('-t', '--time',
                        help='Evaluation time period. (in hours, default 24h)',
                        type=int, default=24)
    parser.add_argument('-s', '--scratch',
                        help='Scratch / temp base directory. Must exist. (default: /tmp)',
                        type=str, default='/tmp')
    parser.add_argument('-p', '--perfdata', help='Print performance data, (default: off)',
                        action='store_true')
    parser.add_argument('-d', '--debug', help='Verbose mode', action='store_true')
    parser.add_argument('-T', '--test', help='Run test case; needs WAN connection',
                        action='store_true')
    parser.add_argument('-V', '--version', action='version',
                        version='%(prog)s ' + __version__)
    return parser.parse_args(argv)


def get_time_threshold(offset_time):
    """Return the current epoch time plus offset_time hours, as an int."""
    return int(time.time() + offset_time * epoch_multipl)


def join_oid(*oids):
    """Join OID fragments into one OID string with a single leading dot.

    Exactly one dot is placed between neighbouring fragments, regardless of
    whether the fragments carry leading/trailing dots themselves; one trailing
    dot is removed from the result. Empty fragments are skipped (the original
    indexing of fragment[0] crashed on them, and --oid defaults to '').
    Prints an error and exits UNKNOWN if a fragment is not a string.
    """
    joinedoid = '.'
    for fragment in oids:
        if not isinstance(fragment, str):
            print('join_oid: only sting arguments are supported!')
            gen_exitcode(exit_unkn)
        if not fragment:
            continue
        leading_dot = fragment.startswith('.')
        trailing_dot = joinedoid.endswith('.')
        if leading_dot and trailing_dot:
            # Both sides bring a dot: drop one of them.
            joinedoid = joinedoid[:-1] + fragment
        elif leading_dot or trailing_dot:
            joinedoid += fragment
        else:
            joinedoid += '.' + fragment
    if joinedoid.endswith('.'):
        return joinedoid[:-1]
    return joinedoid


def _snmp_abort(message, *details):
    """Print an SNMP failure message plus details and exit UNKNOWN."""
    _say(message, *details)
    sys.exit(exit_unkn)


def read_snmp(agent, community, port, oid):
    """Perform one SNMP GET and return the value as an int.

    agent/port: SNMP agent address and UDP port.
    community:  read community string.
    oid:        OID to query.

    Every failure (pysnmp missing, transport/engine error, SNMP error status,
    empty or unsupported value) prints a message and exits with the UNKNOWN
    code; the function never returns None.
    """
    try:
        from pysnmp.entity.rfc3413.oneliner import cmdgen
    except ImportError:
        print('Library import error. Install pysnmp.')
        sys.exit(exit_unkn)

    # The value type is detected from the textual representation of varbinds.
    # NOTE(review): this depends on pysnmp's repr format - confirm on upgrade.
    snmperrs = ["Null('')", "NoSuchObject('')", "NoSuchInstance('')", "OctetString('')"]
    integer_str = ["Integer", "Counter32"]
    hex_str = ["hexValue"]
    target = (agent, community, port, oid)

    try:
        errorind, errorstatus, errorindex, varbinds = cmdgen.CommandGenerator().getCmd(
            cmdgen.CommunityData(community),
            cmdgen.UdpTransportTarget((agent, port)),
            oid)
    except Exception as err:
        # varbinds is not bound here, so report the exception itself.
        _snmp_abort('SNMP exit error. Check agent and community.', str(err), *target)

    if errorind:
        # Engine-level failure (e.g. request timeout): varbinds carries no data.
        _snmp_abort('SNMP exit error. Check agent and community.', str(errorind), *target)

    rendered = str(varbinds)
    if errorstatus or any(word in rendered for word in snmperrs):
        _snmp_abort('SNMP read error. Check agent and community.', rendered, *target)

    # Only Integer, Counter32 and hexValue are supported at the moment.
    for _name, val in varbinds:
        if any(word in rendered for word in integer_str):
            return int(val)
        if any(word in rendered for word in hex_str):
            # hexlify works on both py2 str and py3 bytes (str.encode('hex') is py2 only).
            return int(binascii.hexlify(val.asOctets()), 16)
        _snmp_abort('SNMP value error, expected ',
                    str(integer_str) + ', ' + str(hex_str) + '; got: ' + rendered, *target)

    # Empty response: fail explicitly instead of returning None.
    _snmp_abort('SNMP read error. Check agent and community.', rendered, *target)


def get_agent_file(_path_base, _agent, _interface):
    """Return the cache file path for the given agent and interface number."""
    return os.path.join(_path_base,
                        'nagios-ifheath-' + _agent + '_if_' + str(_interface) + '.cache')


def read_offset(_file):
    """Read the stored counter baseline from the cache file.

    Returns (offset, mtime): the integer on the first line of the file and the
    file's modification time as an int, used for the evaluation period.
    Returns (0, -1) - the "no baseline yet" marker - when the file is missing,
    unreadable or does not start with an integer.
    """
    try:
        with open(_file, 'r') as cache:
            offset = int(cache.readline())
        return offset, int(os.stat(_file).st_mtime)
    except (IOError, OSError, ValueError):
        return 0, -1


def write_offset(_file, _offset):
    """Write the counter baseline to the cache file.

    Returns True on success; prints an error and exits UNKNOWN on failure.
    """
    try:
        with open(_file, 'w') as cache:
            cache.write(str(_offset))
        return True
    except (IOError, OSError):
        print('Error writing stat file')
        sys.exit(exit_unkn)


def print_nagios(_level, _errorcount, _warn, _crit, _deltatime, _time, _ifstate, _perfdata):
    """Print the Nagios status line, optionally with performance data.

    _level:      index into the status message table (0 OK, 1 WARNING,
                 2 CRITICAL, 3 UNKNOWN, 4 CRITICAL if down, 5 WARNING if down).
    _errorcount: counter delta within the evaluation period.
    _warn/_crit: thresholds, echoed in the perfdata.
    _deltatime:  hours since the baseline was written.
    _time:       configured evaluation period in hours.
    _ifstate:    interface state read via SNMP, or False/None if it was not
                 checked; a state of 0 is a real value and is reported.
    _perfdata:   append performance data when true.
    Always returns True.
    """
    nagdict = ('OK: Interface error rate normal',
               'WARNING: Interface errors exceeding warning threshold',
               'CRITICAL: Interface errors exceeding critical threshold',
               'UNKNOWN: Something went wrong',
               'CRITICAL: Interface down',
               'WARNING: Interface down')
    status = '%s: %s errors in the last %2.1f/%02d hour(s)' % (
        nagdict[_level], _errorcount, _deltatime, _time)
    if _perfdata:
        status += '|errors=%dc;%d;%d;' % (_errorcount, _warn, _crit)
        if _ifstate is not False and _ifstate is not None:
            status += ' state=%d' % _ifstate
    print(status)
    return True


def gen_exitcode(_exitcode):
    """Terminate the plugin with the given Nagios exit code."""
    sys.exit(_exitcode)


def main(argv=None):
    """Run the check: read the counter, compare with the baseline, report, exit."""
    masterstopwatch = time.time()
    ifstate = False
    excode = False

    # prepare debug / test case
    args = parse_args(argv)
    if args.test:
        print('')
        print('Running test case on demo.snmplabs.com; using interface RX packet counter:')
        args.host = 'demo.snmplabs.com'
        args.oid = '.1.3.6.1.2.1.2.2.1.16'
        args.interface = 1
        args.community = 'public'
        args.port = 161
        args.debug = True
        args.time = 1

    if args.preset:
        # Set up preset OIDs; the same lower-cased key is used for the
        # membership test and for every lookup.
        preset_key = args.preset.lower()
        if preset_key in presets:
            preset = presets[preset_key]
            # ifoid first: it uses the --oid infix before args.oid is replaced.
            args.ifoid = join_oid(preset['ifprefix'], args.oid, preset['ifsuffix'],
                                  str(args.interface))
            args.oid = join_oid(preset['eprefix'], args.oid, preset['esuffix'],
                                str(args.interface))
        else:
            print('Invalid preset. Valid presets are:')
            for key in presets:
                print(' ' + key)
            gen_exitcode(exit_unkn)
    else:
        # ... or argument OIDs
        if not args.oid:
            print('No error count OID given. Use --oid or --preset.')
            gen_exitcode(exit_unkn)
        if args.ifoid:
            args.ifoid = join_oid(args.ifoid, str(args.interface))
        args.oid = join_oid(args.oid, str(args.interface))

    cache_file = get_agent_file(args.scratch, args.host, args.interface)

    if args.debug:
        print('')
        _say('preset host: ', args.host)
        _say('preset OID: ', args.oid)
        _say('interface: ', args.interface)
        _say('ifoid: ', args.ifoid)
        _say('ifstate: ', args.ifupstate)
        _say('using filename: ', cache_file)
        _say('community ', args.community)

    # Only shown in debug output.
    ts_threshold = get_time_threshold(args.time)

    # read data from cache file, use filesystem mtime for unix epoch timestamp
    snmp_offset, file_timestamp = read_offset(cache_file)

    # run error count snmp check
    if args.debug:
        print('Checking Interface error count...')
    stopwatch = time.time()
    snmp_errs = read_snmp(args.host, args.community, args.port, args.oid)
    if args.debug:
        stopwatch = float(time.time()) - float(stopwatch)
        _say('Got Interface error count: ', snmp_errs)
        _say(' Time: ', stopwatch, 's.')

    # Calculate deltas, counter and time. delta_hours is only used for print outs.
    delta_errs = snmp_errs - snmp_offset
    delta_time = int(time.time() - file_timestamp)
    delta_hours = float(delta_time) / float(epoch_multipl)

    if args.debug:
        _say('ts_threshold: ', ts_threshold)
        _say('file_timestamp: ', file_timestamp)
        _say('snmp_offset: ', snmp_offset)
        _say('delta_errs: ', delta_errs)
        _say('delta_time: ', delta_time)
        _say('delta_hours: ', delta_hours)
        _say('warning arg: ', args.warning)
        _say('critical arg: ', args.critical)

    def report(level):
        # All status lines share the same arguments except for the level.
        print_nagios(level, delta_errs, args.warning, args.critical, delta_hours,
                     args.time, ifstate, args.perfdata)

    # Interface status check, only if requested. "is not None" so that an
    # expected state of 0 still triggers the check.
    if args.ifoid and args.ifupstate is not None:
        if args.debug:
            print('Running ifstate test...')
        stopwatch = time.time()
        ifstate = read_snmp(args.host, args.community, args.port, args.ifoid)
        if args.debug:
            stopwatch = float(time.time()) - float(stopwatch)
            _say('Got interface state: ', ifstate)
            _say(' Time: ', stopwatch, 's.')
        if ifstate != args.ifupstate:
            if args.debug:
                _say('Interface down, got ifstate: ', ifstate)
            if args.warnifdown:
                report(5)
                excode = exit_warn
            elif args.okifdown:
                # Nothing printed here: the error count evaluation below still runs.
                excode = exit_ok
            else:
                report(4)
                excode = exit_crit

    # Error count evaluation. Not run after an interface state warning/critical
    # (excode is then non-zero); False and exit_ok are both falsy.
    if args.community and args.host and args.oid and not excode:
        first_run = file_timestamp == -1
        if first_run or delta_errs <= args.warning:
            if first_run:
                print('Initial baseline, %i Errors.' % delta_errs)
            else:
                report(0)
            excode = exit_ok
        elif delta_errs >= args.critical:
            report(2)
            excode = exit_crit
        elif delta_errs >= args.warning and delta_errs < args.critical:
            report(1)
            excode = exit_warn
        else:
            # This should never be the case.
            report(3)
            excode = exit_unkn

    if (delta_time > args.time * epoch_multipl) or file_timestamp == -1:
        # Always (re)write the cache when the evaluation period has elapsed
        # or when there is no baseline yet.
        if args.debug:
            print('Initialing / rotating tmp file')
        write_offset(cache_file, snmp_errs)

    # Finally exit with the desired exit code for Nagios.
    if args.debug:
        _say('Finished. Total execution time: ',
             float(time.time()) - float(masterstopwatch), 's')
    gen_exitcode(int(excode))


if __name__ == '__main__':
    main()