#!/usr/bin/python -tt
"""
Overview

    This is a rewrite, in python, of a shell script I found on the Nagios
    Exchange. I was looking for simple way to get the two most important disk
    statistics, IO and IOPS. IO is the amount of data being read and written
    to disk and IOPS is the number of reads and writes per second.

    The other checks that I have found had several drawbacks, 1) they are
    written in perl, require perl modules and (cover your ears...perl is
    antiquated and impossible to read language), 2) they provided too much
    disk information and 3) they used SNMP (not simple at all). While this
    check turned out to be much more complicated than the original shell
    script, I think it is still maintainable, does more error checking and is
    a bit more reliable parsing than shell.

    Here is the basic flow of the check.

    - Get the user arguments and put them into the user_args dict.
    - Check for the stats file containing the historical information, if it
      doesn't exist create a stubbed out one so the check can run.
    - Check that the device file exists, someone might have entered the name
      incorrectly.
    - Init the data array and add the contents of the stats file and the
      contents of the device file to it.
    - Calculate the differences between the last set of data and the current
      one, add into the data array.
    - Update the stats file with the current data.
    - Send the status and performance data to nagios so it can alarm and
      create graphs.

    In terms of IOPS there are some benchmarks for random io on a single disk,
    it might be a starting place for thresholds.

        7,200 rpm disk  ==> 100 tps (sata)
        15,000 rpm disk ==> 200 tps (sas)

    For the io thresholds you might want to look at the data from the checks
    to see what is acceptable.

Sanity Testing

    After I had tested the check using known data sets and checking the math,
    I did another sanity check against iostat. In two shells do the following
    and compare 10 second samples of data. You can do this for both io and
    iops.

        watch -n 10 "./check_disk_io_stats -d drbd0 -s io -f drbd0_io_stats.txt -w 500 -c 1000 -v"
        iostat -m drbd0 10

Device File

    The source of data for this check is the /sys/block/<device>/stat file.
    It contains counters that are incremented and are reset on a reboot. If
    you have the difference between two samples of the stat data and an
    interval of time, you can calculate IO and IOPS.

    The first 11 integer fields in the /sys/block/<device>/stat file are
    listed below (newer kernels append more fields after them, which this
    check ignores). They are statistics about a block device (not
    filesystems).

    Index  Name            Units         Description
    -----  ----            -----         -----------
    00.    read I/Os       requests      number of read I/Os processed       <== USED IN IOPS CALC
    01.    read merges     requests      number of read I/Os merged with in-queue I/O
    02.    read sectors    sectors       number of sectors read              <== USED IN IO CALC
    03.    read ticks      milliseconds  total wait time for read requests
    04.    write I/Os      requests      number of write I/Os processed      <== USED IN IOPS CALC
    05.    write merges    requests      number of write I/Os merged with in-queue I/O
    06.    write sectors   sectors       number of sectors written           <== USED IN IO CALC
    07.    write ticks     milliseconds  total wait time for write requests
    08.    in_flight       requests      number of I/Os currently in flight
    09.    io_ticks        milliseconds  total time this block device has been active
    10.    time_in_queue   milliseconds  total wait time for all requests

Data Structures

    1. There is a file that holds the data set from the last time the check
       ran. This historical data has the following format and contains only
       one line of either io or iops data.

           device|iops|tstamp|epoch|delta|read|write|total

       An actual file might look like this.

           drbd0|iops|Sep-17-2013 03:12 PM|1379455964|30|7349032|510927683|518276715

    2. There are optional arguments for the script and are kept in a dict
       called user_args.

           user_args = {"device"     : options.device,
                        "stats_type" : options.stats_type,
                        "stats_file" : options.stats_file,
                        "warn"       : options.warn,
                        "critical"   : options.critical,
                        "verbose"    : options.verbose }

    3. There are three sets of data that are held in list called data. Each
       of the dicts contain the same keys, they just hold data from different
       points in time.

           data = [last, cur, delta]

           last  - the data from the last time the check was run, this is the
                   contents of the stats file described in 1.
           cur   - the current set of data from /sys/block/<device>/stat file
                   that will be used to update the stats file.
           delta - the difference between last and the current counters, this
                   is the data that gets reported to nagios.

       Below is an example of the last{} in data[].

           last = {"device"     : "",   # block device name
                   "stats_type" : "",   # either io or iops
                   "tstamp"     : "",   # human readable timestamp
                   "epoch"      : "",   # epoch timestamp
                   "offset"     : "",   # diff between the current and last epoch
                   "read"       : "",   # read counter
                   "write"      : "",   # write counter
                   "total"      : ""    # total counter, just read plus write
                  }
"""

import sys
import time
import datetime
import os.path
from optparse import OptionParser


###
### Functions
###

def _timestamp():
    """
    Return the current local time as the human readable stats file timestamp.
    """
    # %Y is the calendar year. %G (the ISO-8601 week-based year) gives the
    # wrong year for a few days around New Year.
    return datetime.datetime.now().strftime('%b-%d-%Y %I:%M %p')


def _print_section(title, mapping):
    """
    Print one verbose section: a bracketed title, the sorted key/value pairs
    of the dict and a trailing blank line.
    """
    print("[%s]" % (title,))
    for key, value in sorted(mapping.items()):
        print("%-25s\t%s" % (key, value))
    print("")


def parse_user_args(args):
    """
    This function handles the parsing and validation of arguments passed from
    the command line, it uses optparse to do all the work.

    Arguments:
        args -- the array of command line arguments held in sys.argv

    Returns:
        user_args -- a dict of all the user args

    Exits with status 3 (UNKNOWN) when a required option is missing.
    """
    # Create an instance of OptionParser.
    #
    usage = "usage: %prog [options]"
    version = "%prog [ version 1.01 ]"
    summary = "Calculate disk io statistics based on /sys/block//stat."
    parser = OptionParser(usage=usage, version=version, description=summary)

    # Define options that you will see at the command line.
    #
    parser.add_option("-d", dest="device", action="store",
                      help="device name listed under /sys/block")
    parser.add_option("-s", dest="stats_type", action="store", choices=["io", "iops"],
                      help="type of disk statistic to check, valid values are io and iops")
    parser.add_option("-f", dest="stats_file", action="store",
                      help="path to the file containing historical statistics")
    parser.add_option("-w", dest="warn", action="store", type="int",
                      help="warn threshold")
    parser.add_option("-c", dest="critical", action="store", type="int",
                      help="critical threshold")
    parser.add_option("-v", dest="verbose", action="store_true",
                      help="print verbose output")

    # Parse the command line options, it will return two values. The leftover
    # positional arguments (including the program name) are not used.
    #
    (options, _leftover) = parser.parse_args(args)

    # Check for required options. The test is against None / empty string
    # rather than truthiness so that a threshold of 0 is accepted.
    #
    required = (("-d", options.device),
                ("-s", options.stats_type),
                ("-f", options.stats_file),
                ("-w", options.warn),
                ("-c", options.critical))
    for flag, value in required:
        if value is None or value == "":
            print("ERROR: %s is a required option" % (flag,))
            sys.exit(3)

    # Build the dict that will get returned by this function.
    #
    user_args = {"device"     : options.device,
                 "stats_type" : options.stats_type,
                 "stats_file" : options.stats_file,
                 "warn"       : options.warn,
                 "critical"   : options.critical,
                 "verbose"    : options.verbose}

    # Print verbose output if asked.
    #
    if options.verbose:
        _print_section("User Arguments", user_args)

    # Finished, return the user args dict.
    #
    return user_args


def print_to_nagios(status, output, perfdata="NA"):
    """
    This function formats the nagios data and submits the final check.

    Arguments:
        status   -- string containing one of the 4 valid nagios statuses
        output   -- string containing the nagios description
        perfdata -- an optional string containing the formatted performance data

    Returns:
        Never returns. Prints the nagios status to stdout and exits with the
        matching exit status (3 when the status itself is invalid).
    """
    # Check that the status that was passed in is a valid value.
    #
    status_codes = {'OK': 0, 'WARN': 1, 'CRITICAL': 2, 'UNKNOWN': 3}
    try:
        exit_status = status_codes[status]
    except KeyError:
        # Report the offending status; exit_status is not bound at this point.
        print("ERROR: passed an invalid status to the print_to_nagios function [ status: %s ]" % (status,))
        sys.exit(3)

    # Format the final string and print to stdout with the appropriate exit
    # status. Notice that perf data is optional.
    #
    if perfdata == "NA":
        print("%s - %s" % (status, output))
    else:
        print("%s - %s|%s" % (status, output, perfdata))
    sys.exit(exit_status)


def open_file(file, mode):
    """
    This is a generic function to open a file.

    Arguments:
        file -- the path of the file to open
        mode -- the mode to open the file with, typically r (read) or w (write)

    Returns:
        fh -- a file handle

    Exits through print_to_nagios with UNKNOWN when the file can't be opened.
    """
    try:
        return open(file, mode)
    except IOError as err:
        # This helper opens both the stats file and the device file, so the
        # message names the path rather than assuming which one failed.
        error = "ERROR: could not open file [ file: %s ] [ reason: %s ]" % (file, err.strerror)
        print_to_nagios("UNKNOWN", error, "NA")


def close_file(fh):
    """
    This is a generic function to close a file.

    Arguments:
        fh - a file handle

    Returns:
    """
    fh.close()


def read_stats_file(fh):
    """
    This function will take a string of data from the stats file and put it
    into an array.

    Arguments:
        fh - a file handle for the stats file

    Returns:
        fields -- an array containing all the fields in the stats file
    """
    # Only the first line matters. readline() returns "" for an empty file,
    # which yields a single empty field and is rejected by the field count
    # check in parse_stats_file_line instead of raising IndexError here.
    return fh.readline().strip().split("|")


def parse_stats_file_line(line):
    """
    This function will take an array containing the stats file data and put it
    into a dict. It will also check for the correct number of fields.

    Arguments:
        line - an array containing the fields of the stats file line

    Returns:
        last -- a dict containing the stats file data
    """
    expected_fields = 8
    actual_fields = len(line)
    if expected_fields != actual_fields:
        error = "ERROR: wrong number of fields in the stats file [ expected: %s ] [ actual: %s ]" % (
            expected_fields, actual_fields)
        print_to_nagios("UNKNOWN", error, "NA")

    last = {"device"     : line[0],
            "stats_type" : line[1],
            "tstamp"     : line[2],
            "epoch"      : line[3],
            "offset"     : line[4],
            "read"       : line[5],
            "write"      : line[6],
            "total"      : line[7]}
    return last


def check_stats_file(args, file):
    """
    Verify that the stats file exists, if it doesn't it might be because the
    script is being run for the first time, in that case create one with
    stubbed out stats.

    Arguments:
        args -- the dict containing the user args
        file -- the path to the stats file

    Returns:
        0 = the stat file exists, 1 = the stats file doesn't exist and it
        will get created
    """
    if os.path.exists(file):
        return 0

    stub_data = {"device"     : args["device"],
                 "stats_type" : args["stats_type"],
                 "tstamp"     : _timestamp(),
                 "epoch"      : int(time.time()),
                 "offset"     : "0",
                 "read"       : "0",
                 "write"      : "0",
                 "total"      : "0"}
    update_stats_file(file, stub_data)
    return 1


def update_stats_file(file, data):
    """
    This function will update the stats file with the latest data. The file
    gets truncated and the new line is written.

    Arguments:
        file -- the path to the stats file
        data -- a dict containing the new data

    Returns:
    """
    line = "%s|%s|%s|%s|%s|%s|%s|%s\n" % (str(data['device']),
                                         str(data['stats_type']),
                                         str(data['tstamp']),
                                         str(data['epoch']),
                                         str(data['offset']),
                                         str(data['read']),
                                         str(data['write']),
                                         str(data['total']))
    # Opening with "w" already truncates the file, no seek/truncate needed.
    fh = open_file(file, "w")
    try:
        fh.write(line)
    finally:
        close_file(fh)


def check_device_file(file):
    """
    This function will check if the device file exists.

    Arguments:
        file -- the path to the device file

    Returns:
        0 = the file exists, 1 = the file does not exist
    """
    if os.path.exists(file):
        return 0
    return 1


def read_device_file(fh):
    """
    This function will take a string of data from the device file and put it
    into an array.

    Arguments:
        fh - a file handle for the device file

    Returns:
        fields -- an array containing all the fields in the device file
    """
    # An empty file gives an empty list, which parse_device_file_line rejects
    # with a proper UNKNOWN instead of an IndexError here.
    return fh.readline().strip().split()


def parse_device_file_line(args, line):
    """
    This function will take an array containing the device file data and put
    it into a dict. It will also check for the minimum number of fields.

    Arguments:
        args -- the dict containing the user args
        line - an array containing the device file data

    Returns:
        cur -- a dict containing the device file data
    """
    # The first 11 fields are the ones described in the module docstring.
    # Newer kernels append extra counters after them, so only fewer than 11
    # fields is treated as an error.
    expected_fields = 11
    actual_fields = len(line)
    if actual_fields < expected_fields:
        error = "ERROR: wrong number of fields in the device file [ expected: %s ] [ actual: %s ]" % (
            expected_fields, actual_fields)
        print_to_nagios("UNKNOWN", error, "NA")

    if args["stats_type"] == "io":
        read = line[2]
        write = line[6]
    elif args["stats_type"] == "iops":
        read = line[0]
        write = line[4]

    cur = {"device"     : args["device"],
           "stats_type" : args["stats_type"],
           "tstamp"     : _timestamp(),
           "epoch"      : int(time.time()),
           "offset"     : "na",
           "read"       : read,
           "write"      : write,
           "total"      : "na"}
    return cur


def validate_data(args, data):
    """
    This function will compare the device name and the stats type in all the
    data sets with what the user is asking for. It could be that the user
    specified the wrong stats file.

    Arguments:
        args -- the dict containing the user args
        data -- an array containing the last and current data dicts

    Returns:
    """
    expected_device = args["device"]
    last_device = data[0]["device"]
    cur_device = data[1]["device"]
    if expected_device != last_device:
        error = "ERROR: device name mismatch between user args and stats file data [ expected: %s ] [ actual: %s ]" % (expected_device, last_device)
        print_to_nagios("UNKNOWN", error, "NA")
    if expected_device != cur_device:
        error = "ERROR: device name mismatch between user args and device file data [ expected: %s ] [ actual: %s ]" % (expected_device, cur_device)
        print_to_nagios("UNKNOWN", error, "NA")

    expected_type = args["stats_type"]
    last_type = data[0]["stats_type"]
    cur_type = data[1]["stats_type"]
    if expected_type != last_type:
        error = "ERROR: stats type mismatch between user args and stats file data [ expected: %s ] [ actual: %s ]" % (expected_type, last_type)
        print_to_nagios("UNKNOWN", error, "NA")
    if expected_type != cur_type:
        error = "ERROR: stats type mismatch between user args and device file data [ expected: %s ] [ actual: %s ]" % (expected_type, cur_type)
        print_to_nagios("UNKNOWN", error, "NA")


def calculate_delta(args, data):
    """
    This function will find the delta between the last and current data.

    Arguments:
        args -- the dict containing the user args
        data -- an array containing the last and current data dicts

    Returns:
        delta -- a dict with the differences between the last and current data
    """
    # Find the time offset between the last stats and the current stats.
    #
    offset = int(data[1]["epoch"]) - int(data[0]["epoch"])
    if offset < 0:
        error = "ERROR: time offset between current time and last time is less than zero, please check"
        print_to_nagios("UNKNOWN", error, "NA")

    # Calculate the new read and write data. Read and write can't be a
    # negative number, if they are, the machine was probably rebooted so set
    # the value to zero; the new counters get written to the stats file and it
    # should be OK next time the script runs.
    #
    read = max(int(data[1]["read"]) - int(data[0]["read"]), 0)
    write = max(int(data[1]["write"]) - int(data[0]["write"]), 0)

    # The total is taken after clamping so it can never be negative either.
    #
    total = read + write

    delta = {"device"     : args["device"],
             "stats_type" : args["stats_type"],
             "tstamp"     : _timestamp(),
             "epoch"      : int(time.time()),
             "offset"     : offset,
             "read"       : read,
             "write"      : write,
             "total"      : total}
    return delta


def _report(total, warn, critical, output, perf_data):
    """
    Compare the total against the thresholds and submit the result to nagios.
    """
    if total > critical:
        print_to_nagios("CRITICAL", output, perf_data)
    elif total > warn:
        print_to_nagios("WARN", output, perf_data)
    else:
        print_to_nagios("OK", output, perf_data)


###
### MAIN
###

def main():
    """
    Run the check: collect the last and current counters, store the current
    ones and report the difference to nagios. Always ends in sys.exit().
    """
    ### STEP ZERO: user args

    # Parse the user arguments and get back a dict with all the key/value pairs.
    #
    user_args = parse_user_args(sys.argv)

    ### STEP ONE: make sure the stats file and device file are available

    # If the stats file doesn't exist, then it will get created and populated
    # with some bogus data.
    #
    stats_file = user_args["stats_file"]
    check_stats_file(user_args, stats_file)

    # If the device file doesn't exist, that is a show stopper.
    #
    device_file = "/sys/block/" + user_args["device"] + "/stat"
    if check_device_file(device_file) == 1:
        error = "ERROR: could not find device file [ file: %s ]" % (device_file)
        print_to_nagios("UNKNOWN", error, "NA")

    ### STEP TWO: collect the last and current data

    # This is the array of dicts that will hold the last, current and delta data.
    #
    data = []

    # Create a dict containing the data from the stats file and put it into
    # the data array.
    #
    fh = open_file(stats_file, "r")
    try:
        stats_file_line = read_stats_file(fh)
    finally:
        close_file(fh)
    last = parse_stats_file_line(stats_file_line)
    data.append(last)
    if user_args["verbose"]:
        _print_section("Stats File", last)

    # Get the current stats from the device file.
    #
    fh = open_file(device_file, "r")
    try:
        device_file_line = read_device_file(fh)
    finally:
        close_file(fh)
    current = parse_device_file_line(user_args, device_file_line)
    data.append(current)
    if user_args["verbose"]:
        _print_section("Device File", current)

    ### STEP THREE: calculate the difference between the last stats and the current stats

    validate_data(user_args, data)
    delta = calculate_delta(user_args, data)
    data.append(delta)
    if user_args["verbose"]:
        _print_section("Delta", delta)

    ### STEP FOUR: update the stats file

    # Update stats file with the new data.
    #
    update_stats_file(stats_file, data[1])

    ### STEP FIVE: send alarm to nagios

    # stats
    read = int(delta['read'])
    write = int(delta['write'])
    total = int(delta['total'])

    # thresholds
    warn = int(user_args["warn"])
    critical = int(user_args["critical"])

    # misc
    device = user_args["device"]
    offset = int(delta['offset'])

    # NOTE(review): the perf data below separates the label=value pairs with
    # ", "; confirm that the graphing tool in use accepts this before relying
    # on it. The strings are left as they were to avoid breaking consumers.

    if user_args["stats_type"] == "iops":
        # IOPS are transactions per second, so divide them by the time offset.
        # Floor division keeps the integer results on every python version.
        #
        try:
            read_tps = read // offset
            write_tps = write // offset
            total_tps = total // offset
        except ZeroDivisionError:
            error = "ERROR: time offset is zero, can't calculate the iops. Maybe you are running this script for the first time."
            print_to_nagios("UNKNOWN", error, "NA")

        perf_data = "readiops=%s, writeiops=%s, totaliops=%s" % (read_tps, write_tps, total_tps)
        output = "device: %s, interval: %ssec, read(tps)/write(tps)/total(tps): %s/%s/%s warn/critical: %s/%s" % (device, offset, read_tps, write_tps, total_tps, str(warn), str(critical))
        _report(total_tps, warn, critical, output, perf_data)

    elif user_args["stats_type"] == "io":
        # IO is in 512 byte sectors, they should be converted to mb so that
        # it is more human friendly.
        #
        read_mb = ((read * 512) // 1024) // 1024
        write_mb = ((write * 512) // 1024) // 1024
        total_mb = ((total * 512) // 1024) // 1024

        perf_data = "readio=%s, writeio=%s, totalio=%s" % (str(read_mb), str(write_mb), str(total_mb))
        output = "device: %s, interval: %ssec, read(mb)/write(mb)/total(mb): %s/%s/%s warn/critical: %s/%s" % (device, offset, str(read_mb), str(write_mb), str(total_mb), str(warn), str(critical))
        _report(total_mb, warn, critical, output, perf_data)


if __name__ == "__main__":
    main()