#!/bin/bash
#====================================================================================
#:TITLE        : check_megasasctl
#:TYPE         : Nagios Plugin Script
#:AUTHOR       : r0dr1gu3z (jrod@automattic.com)
#:COMPANY      : Automattic, Inc.
#:VERSION      : 0.0.1
#:CREATED      : May 13, 2010
#:DESCRIPTION  : Captures the output of "megasasctl -v" in a private temporary file
#                and checks it for the status of each RAID array, each drive (online
#                status and Media Error Counts), and each adapter's BBU.
#:PARMS        : N/A
#:OPTIONS      : N/A
#:NOTES        : Assoc. w/: command[check_megasasctl]=/usr/local/nagios/plugins/check_megasasctl
#:EXIT CODES   : 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
#:ROLE ASSOC.  : base
#====================================================================================

# NOTE: "set -e" is deliberately not enabled. An unexpected non-zero exit would be
# read by Nagios as WARNING/CRITICAL, so every failure path below exits explicitly.

#====================================================================================
# Variable Defs:
#====================================================================================
SCRIPT=${0##*/}

# Values that count as healthy for each kind of component.
readonly NORMALRAIDSTATUS="optimal"
readonly NORMALDISKSTATUS="online"
readonly NORMALBBUSTATUS="good"

# Media Error Count thresholds: above MEDIATHRESHOLD is a WARNING,
# above MAXMEDIATHRESHOLD is CRITICAL.
readonly MEDIATHRESHOLD=20
readonly MAXMEDIATHRESHOLD=30

readonly MEGASASCTL="/usr/local/bin/megasasctl"

# Path of the captured megasasctl output; assigned by main() via mktemp.
TEMPRAIDSTATUS=""

# Components found to be unhealthy, filled in by the check_* functions.
BADRAIDARRAYS=()
BADDISKARRAY=()
BADBBUARRAY=()
HIGH_MEDIAERRORLIST=()
MED_MEDIAERRORLIST=()

# Overall plugin state and the message printed next to it.
STATUS="OK"
MSG=""

#====================================================================================
# Function Defs:
#====================================================================================

#######################################
# Remove the temporary capture file. Installed as the EXIT trap so the file is
# removed on every exit path, including the early exits in check_status.
# Globals: TEMPRAIDSTATUS (read)
#######################################
cleanup() {
  if [[ -n "$TEMPRAIDSTATUS" ]]; then
    rm -f -- "$TEMPRAIDSTATUS"
  fi
}

#######################################
# Check every RAID array line for a status other than NORMALRAIDSTATUS.
# A RAID array line is one that contains "RAID", does not contain "bios" and
# starts in column 0; the status is taken from its 6th whitespace-separated
# field (NOTE(review): field position inherited from the original parsing --
# confirm against the megasasctl version in use).
# Globals: TEMPRAIDSTATUS (read), BADRAIDARRAYS, STATUS, MSG (written)
#######################################
check_raid_array_status() {
  local array_id array_state

  while read -r array_id array_state; do
    # A line without a 6th field cannot be evaluated; it is skipped rather
    # than reported, which matches the outcome of the original check.
    [[ -n "$array_state" ]] || continue
    if [[ "$array_state" != "$NORMALRAIDSTATUS" ]]; then
      BADRAIDARRAYS+=("$array_id")
    fi
  done < <(awk '!/bios/ && /RAID/ && /^[^ \t]/ { print $1, $6 }' "$TEMPRAIDSTATUS")

  if (( ${#BADRAIDARRAYS[@]} != 0 )); then
    STATUS="CRITICAL"
    MSG="Degraded RAID ARRAY(s) found: ${BADRAIDARRAYS[*]}"
  else
    MSG="RAID ARRAYS OPTIMAL, "
  fi
}

#######################################
# Check every disk line for a status other than NORMALDISKSTATUS.
# A disk line is any non-empty line starting in column 0 that contains none of
# "bios", "RAID" or "row"; the status is taken from its 6th
# whitespace-separated field (NOTE(review): inherited field position --
# confirm against the megasasctl version in use).
# Globals: TEMPRAIDSTATUS (read), BADDISKARRAY, STATUS, MSG (written)
#######################################
check_all_disk_online_status() {
  local disk_id disk_state

  while read -r disk_id disk_state; do
    # Lines without a 6th field are skipped, as in check_raid_array_status.
    [[ -n "$disk_state" ]] || continue
    if [[ "$disk_state" != "$NORMALDISKSTATUS" ]]; then
      BADDISKARRAY+=("$disk_id")
    fi
  done < <(awk '!/bios|RAID|row/ && /^[^ \t]/ { print $1, $6 }' "$TEMPRAIDSTATUS")

  if (( ${#BADDISKARRAY[@]} != 0 )); then
    STATUS="CRITICAL"
    MSG="Degraded Disks found: ${BADDISKARRAY[*]}"
  else
    MSG="${MSG} ALL DISKS ONLINE, "
  fi
}

#######################################
# Group disks by Media Error Count (M.E.C.).
# Only disk lines containing "errs:" are considered. The count is the first
# word of the 3rd colon-separated field of the line (NOTE(review): inherited
# field position -- confirm against the megasasctl version in use).
# Counts above MAXMEDIATHRESHOLD are CRITICAL; counts above MEDIATHRESHOLD,
# up to and including MAXMEDIATHRESHOLD, are a WARNING.
# Globals: TEMPRAIDSTATUS (read), HIGH_MEDIAERRORLIST, MED_MEDIAERRORLIST,
#          STATUS, MSG (written)
#######################################
check_all_media_error_status() {
  local disk_id error_count

  while read -r disk_id error_count; do
    # Skip anything that is not a plain non-negative integer instead of
    # letting the arithmetic comparison fail on it.
    [[ "$error_count" =~ ^[0-9]+$ ]] || continue

    # 10# forces base 10 so a count with leading zeros is not read as octal.
    if (( 10#$error_count > MAXMEDIATHRESHOLD )); then
      HIGH_MEDIAERRORLIST+=("${disk_id}:${error_count}")
    elif (( 10#$error_count > MEDIATHRESHOLD )); then
      # Includes a count exactly equal to MAXMEDIATHRESHOLD, which previously
      # matched neither branch and went unreported.
      MED_MEDIAERRORLIST+=("${disk_id}:${error_count}")
    fi
  done < <(
    awk '!/row|bios|RAID/ && /errs:/ && /^[^ \t]/ {
           split($0, colon_fields, ":")
           split(colon_fields[3], words, " ")
           print $1, words[1]
         }' "$TEMPRAIDSTATUS"
  )

  if (( ${#HIGH_MEDIAERRORLIST[@]} != 0 )); then
    STATUS="CRITICAL"
    MSG="High M.E.C. on: ${HIGH_MEDIAERRORLIST[*]}"
  elif (( ${#MED_MEDIAERRORLIST[@]} != 0 )); then
    STATUS="WARNING"
    MSG="M.E.C. Warning on the following disks: ${MED_MEDIAERRORLIST[*]}"
  else
    MSG="${MSG} ALL DISKS OPTIMAL, "
  fi
}

#######################################
# Check the BBU status of every adapter.
# Adapter lines are the ones containing "bios". The BBU status is the part of
# the 7th colon-separated field that precedes the first "/" (NOTE(review):
# inherited field position -- confirm against the megasasctl version in use).
# An adapter whose status is missing or differs from NORMALBBUSTATUS is
# reported.
# Globals: TEMPRAIDSTATUS (read), BADBBUARRAY, STATUS, MSG (written)
#######################################
check_adapt_bbu_status() {
  local adapter_id bbu_state

  while read -r adapter_id bbu_state; do
    if [[ "$bbu_state" != "$NORMALBBUSTATUS" ]]; then
      BADBBUARRAY+=("$adapter_id")
    fi
  done < <(
    awk '/bios/ {
           split($0, colon_fields, ":")
           split(colon_fields[7], parts, "/")
           print $1, parts[1]
         }' "$TEMPRAIDSTATUS"
  )

  if (( ${#BADBBUARRAY[@]} != 0 )); then
    STATUS="WARNING"
    MSG="Bad BBU(s) found: ${BADBBUARRAY[*]}"
  else
    MSG="${MSG} ALL BBU(s) OK"
  fi
}

#######################################
# Report and exit as soon as a check has raised a problem.
# Globals: STATUS, MSG (read)
# Exits:   2 when STATUS is CRITICAL, 1 when it is WARNING; returns otherwise.
#######################################
check_status() {
  case "$STATUS" in
    CRITICAL)
      printf '%s: %s\n' "$STATUS" "$MSG"
      exit 2
      ;;
    WARNING)
      printf '%s: %s\n' "$STATUS" "$MSG"
      exit 1
      ;;
  esac
}

#====================================================================================
# MAIN:
#====================================================================================
main() {
  # Use a private, unpredictable file so concurrent runs cannot clobber each
  # other's capture.
  TEMPRAIDSTATUS=$(mktemp "${TMPDIR:-/tmp}/raidstatus.XXXXXX") || {
    printf 'UNKNOWN: %s could not create a temporary file\n' "$SCRIPT"
    exit 3
  }
  trap cleanup EXIT

  # The redirection creates the file even when megasasctl fails, so the exit
  # status and an empty capture are what signal a failed run.
  if ! "$MEGASASCTL" -v > "$TEMPRAIDSTATUS" || [[ ! -s "$TEMPRAIDSTATUS" ]]; then
    printf 'UNKNOWN: Error running %s. Please check\n' "$SCRIPT"
    exit 3
  fi

  # Check the status of the RAID arrays
  check_raid_array_status
  check_status

  # Check the status of each disk on every array
  check_all_disk_online_status
  check_status

  # Check for Media Errors
  check_all_media_error_status
  check_status

  # Check BBU status across all adapters
  check_adapt_bbu_status
  check_status

  # Every check passed; the EXIT trap removes the temporary file.
  printf '%s: %s\n' "$STATUS" "$MSG"
  exit 0
}

main "$@"