#!/bin/bash
#########################################################
#                                                       #
#     HP/Compaq Insight Management Agents Checker       #
#                                                       #
# check_insight_agents                                  #
#    Version 2.5 (June 9th, 2016)                       #
#                                                       #
# Authored by Jason Leonard                             #
#    E-mail: jason_leonard@yahoo.com                    #
#                                                       #
#########################################################
#
# Overview
# -----------------------------------------------------
# This plugin started out as individual rewrites of the check_hp
# plugins - 6 scripts that each provide information on different HP
# component statuses. I found myself quite a fan of the plugin's
# output, and was using them to learn SNMP and Nagios scripting. Yet
# in the plugins, I saw where the code used many if/then conditions,
# where case/select statements could provide a more efficient analysis
# and reduce redundancy in the code.
#
# As things developed, I had a strong desire to find/use 1 plugin that
# could report all information. A few were already developed, but as I
# tested them, they each seemed to lack output that was meaningful. In
# addition, a more amateur programmer myself, I found myself struggling
# with the logic of other code. So I set to try and combine the 6
# individual plugins into 1. This is the result of that work.
#
# The basic flow of the plugin is that it checks global health first,
# and reports on that condition most of the time. This leaves the
# program to only make 1 check on most iterations. Individual
# components are checked only when the global status might indicate
# something wrong. At this point, although chances are only 1 item has
# failed, the plugin makes its best attempt to assume nothing and give
# info. on ANY component that has failed. This also differs from the
# check_hp plugin, which tended to output each component's status as
# checked, whether failed or not.
#
# I readily admit that the plugins have had limited testing in our own
# environment, but we did have a horrible old Compaq server that I was
# able to use to test a number of scenarios. Its multiple hardware
# failures actually encouraged me to make sure the plugin could report
# more than 1 failed component.
#
# Hopefully, as this plugin is used, I can get more feedback on where I
# may have missed something in my logic - I took what seems to be a
# very unique approach in checking "global" health before checking any
# other conditions.
#
# As with every piece of code I write, I made a strong effort for the
# code to be easy to follow, and employed meaningful variable names to
# help clarify whatever might be going on in the code.
#
# This plugin is distributed under the GNU GPL license.
# You may redistribute only according to the terms of
# the GNU GPL v2.
#
#########################################################

#########################################################
##                 GLOBAL VARIABLES                    ##
#########################################################
# Name of the script as invoked; quoted so paths containing spaces work.
APPNAME=$(basename -- "$0")
AUTHOR="Jason Leonard"
VERSION="2.5"
readonly APPNAME AUTHOR VERSION

# Default settings for connection (overridable from the command line)
COMMUNITY='public'
HOST_NAME='localhost'
SNMPVERSION='2c'

# State Variables - the exit codes Nagios expects from a plugin
readonly STATE_OK=0
readonly STATE_WARN=1
readonly STATE_CRIT=2
readonly STATE_UNK=3

# Default Outputs
STATE=$STATE_OK
STATE_STRING=""
PERFDATA=""

# Change this to modify the script's handling of how it
# separates each item when multiple checks are output.
# NOTE(review): SEPARATOR and ARG_TOKEN do not appear to be referenced by
# any check in this plugin; they are kept for backward compatibility.
# SEPARATOR="\n"
SEPARATOR=','

# This is the character that tokenizes multiple arguments.
# I have this here so it's easy to change if I find the
# current character I have is buggy or a bad choice.
ARG_TOKEN=','

#########################################################
#               Universal SNMP OIDS                     #
#########################################################
# System information
OID_cpqSiSysServiceNum="1.3.6.1.4.1.232.2.2.2.5"
OID_cpqSiProductName="1.3.6.1.4.1.232.2.2.4.2"

# Drive array controllers, logical drives and physical drives
OID_cpqDaCntlrHwLocation="1.3.6.1.4.1.232.3.2.2.1.1.20"
OID_cpqDaCntlrCondition="1.3.6.1.4.1.232.3.2.2.1.1.6"
OID_cpqDaLogDrvCondition="1.3.6.1.4.1.232.3.2.3.1.1.11"
OID_cpqDaLogDrvOsName="1.3.6.1.4.1.232.3.2.3.1.1.14"
OID_cpqDaLogDrvStatus="1.3.6.1.4.1.232.3.2.3.1.1.4"
OID_cpqDaPhyDrvModel="1.3.6.1.4.1.232.3.2.5.1.1.3"
OID_cpqDaPhyDrvCondition="1.3.6.1.4.1.232.3.2.5.1.1.37"
OID_cpqDaPhyDrvStatus="1.3.6.1.4.1.232.3.2.5.1.1.6"
OID_cpqDaPhyDrvLocationString="1.3.6.1.4.1.232.3.2.5.1.1.64"

# Global health and correctable memory
OID_cpqHeMibCondition="1.3.6.1.4.1.232.6.1.3"
OID_cpqHeCorrMemLogCondition="1.3.6.1.4.1.232.6.2.3.2"
OID_cpqHeCorrMemTotalErrs="1.3.6.1.4.1.232.6.2.3.3"
OID_cpqHeCorrMemErrorCntThresh="1.3.6.1.4.1.232.6.2.3.5"

# Thermal: global condition, fans and temperature sensors
OID_cpqHeThermalCondition="1.3.6.1.4.1.232.6.2.6.1"
OID_cpqHeThermalDegradedAction="1.3.6.1.4.1.232.6.2.6.2"
OID_cpqHeThermalTempStatus="1.3.6.1.4.1.232.6.2.6.3"
OID_cpqHeThermalSystemFanStatus="1.3.6.1.4.1.232.6.2.6.4"
OID_cpqHeFltTolFanHotPlug="1.3.6.1.4.1.232.6.2.6.7.1.10"
OID_cpqHeFltTolFanLocale="1.3.6.1.4.1.232.6.2.6.7.1.3"
OID_cpqHeFltTolFanRedundant="1.3.6.1.4.1.232.6.2.6.7.1.7"
OID_cpqHeFltTolFanCondition="1.3.6.1.4.1.232.6.2.6.7.1.9"
OID_cpqHeTemperatureLocale="1.3.6.1.4.1.232.6.2.6.8.1.3"
OID_cpqHeTemperatureCelsius="1.3.6.1.4.1.232.6.2.6.8.1.4"
OID_cpqHeTemperatureThreshold="1.3.6.1.4.1.232.6.2.6.8.1.5"
OID_cpqHeTemperatureCondition="1.3.6.1.4.1.232.6.2.6.8.1.6"

# Power supplies
OID_cpqHeFltTolPwrSupplyCondition="1.3.6.1.4.1.232.6.2.9.1"
OID_cpqHeFltTolPowerSupplyCondition="1.3.6.1.4.1.232.6.2.9.3.1.4"
OID_cpqHeFltTolPowerSupplyStatus="1.3.6.1.4.1.232.6.2.9.3.1.5"
OID_cpqHeFltTolPowerSupplyRedundant="1.3.6.1.4.1.232.6.2.9.3.1.9"
#########################################################
#                    print_version                      #
#########################################################
# Prints the plugin name, version and author to stdout.
print_version() {
  echo "$APPNAME $VERSION"
  echo "$AUTHOR"
  echo ""
}

#########################################################
#                     print_usage                       #
#########################################################
# Prints a short synopsis for SNMP v1/2c and SNMP v3 invocations.
# SNMP v3 needs an explicit "-v 3" because the default version is 2c.
print_usage() {
  echo ""
  echo "Usage for SNMP v1/2c:"
  echo "  $APPNAME -H <host> [-v <1|2c>] [-C <community>]"
  echo ""
  echo "Usage for SNMP v3:"
  echo "  $APPNAME -H <host> -v 3 -u <user> -x <privprotocol> -X <privpassword> -a <authprotocol> -A <authpassword> -l <privlevel>"
  echo ""
}

#########################################################
##                 print_help Function                 ##
#########################################################
# Prints out user help and gives examples of proper     #
# plugin usage, then exits with the UNKNOWN state.      #
#########################################################
print_help() {
  print_version
  echo "Description:"
  echo "$APPNAME is a Nagios plugin to check the status of various components of HP/Compaq servers running the Insight Management Agents."
  echo ""
  echo "This plugin is not developped by the Nagios Plugin group."
  echo "Please do not e-mail them for support on this plugin."
  echo ""
  echo "For contact info, please read the plugin script file."
  print_usage
  echo "---------------------------------------------------------------------"
  echo ""
  echo "OPTIONS:"
  echo "  -H|--host"
  echo "      Host name or IP address to check. Default is: localhost. REQUIRED OPTION"
  echo "  -v|--snmpversion { 1 | 2c | 3 }"
  echo "      Specifies the SNMP version to use. Default is '2c'. REQUIRED OPTION"
  echo "  -C|--community"
  echo "      SNMP v2 community string with Read access. Default is 'public'. REQUIRED OPTION"
  echo "  -u|--user"
  echo "      SNMP v3 username"
  echo "  -l|--privlevel { noAuthNoPriv | authNoPriv | authPriv }"
  echo "      SNMP v3 privilege level"
  echo "  -x|--privprotocol { DES | AES }"
  echo "      SNMP v3 privacy protocol"
  echo "  -X|--privpassword"
  echo "      SNMP v3 privacy password"
  echo "  -a|--authprotocol { MD5 | SHA }"
  echo "      SNMP v3 authentication protocol"
  echo "  -A|--authpassword"
  echo "      SNMP v3 authentication password"
  echo "  -h|--help"
  echo "      Show this help screen"
  echo "  -V|--version"
  echo "      Show the current version of the plugin"
  echo ""
  echo "Example:"
  echo "  $APPNAME -H 10.0.1.10 -C public"
  echo ""
  echo "---------------------------------------------------------------------"
  exit "$STATE_UNK"
}

#########################################################
##                 Private helpers                     ##
#########################################################

# Prints the table index of every row in snmpwalk output that matches a
# condition pattern, one index per line.
#   $1 - raw (non values-only) snmpwalk output
#   $2 - dot-separated field number of the OID that holds the wanted index
#   $3 - extended regex selecting the rows of interest
# The field numbers assume the OID layout produced by `snmpwalk -Oa` without
# vendor MIBs loaded (e.g. "SNMPv2-SMI::enterprises.232.6...").
_failed_indexes() {
  printf '%s\n' "$1" \
    | grep -E -- "$3" \
    | awk -F. -v field="$2" '{ split($field, parts, /[ =]/); print parts[1] }'
}

# Translates the numeric locale code shared by the fan and temperature
# tables into its descriptive name; unrecognised codes map to "unknown".
_locale_name() {
  case "$1" in
    1)  echo "other" ;;
    2)  echo "unknown" ;;
    3)  echo "system" ;;
    4)  echo "systemBoard" ;;
    5)  echo "ioBoard" ;;
    6)  echo "cpu" ;;
    7)  echo "memory" ;;
    8)  echo "storage" ;;
    9)  echo "removableMedia" ;;
    10) echo "powerSupply" ;;
    11) echo "ambient" ;;
    12) echo "chassis" ;;
    13) echo "bridgeCard" ;;
    *)  echo "unknown" ;;
  esac
}

# Succeeds only when $1 is a non-empty string of decimal digits.
_is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Fills the global array SNMP_SESSION_ARGS with the version and credential
# options for the net-snmp tools. SNMP v3 options are only added when the
# corresponding value was supplied, so e.g. noAuthNoPriv works without
# passing empty -a/-A/-x/-X arguments.
_snmp_session_args() {
  SNMP_SESSION_ARGS=(-v "$SNMPVERSION")
  if [[ "$SNMPVERSION" == "3" ]]; then
    if [[ -n "${SNMPUSER:-}" ]]; then SNMP_SESSION_ARGS+=(-u "$SNMPUSER"); fi
    if [[ -n "${PRIVILEGELEVEL:-}" ]]; then SNMP_SESSION_ARGS+=(-l "$PRIVILEGELEVEL"); fi
    if [[ -n "${PRIVACYPROTOCOL:-}" ]]; then SNMP_SESSION_ARGS+=(-x "$PRIVACYPROTOCOL"); fi
    if [[ -n "${PRIVACYPASSWORD:-}" ]]; then SNMP_SESSION_ARGS+=(-X "$PRIVACYPASSWORD"); fi
    if [[ -n "${AUTHPROTOCOL:-}" ]]; then SNMP_SESSION_ARGS+=(-a "$AUTHPROTOCOL"); fi
    if [[ -n "${AUTHPASSWORD:-}" ]]; then SNMP_SESSION_ARGS+=(-A "$AUTHPASSWORD"); fi
  else
    SNMP_SESSION_ARGS+=(-c "$COMMUNITY")
  fi
}

# Aborts argument parsing with a usage message when option $1 has no value.
# Call as: _require_value "$@"
_require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Missing value for argument: $1"
    print_usage
    exit "$STATE_UNK"
  fi
}

#########################################################
##              ThermalChecks function                 ##
#########################################################
# Checks all thermal components - fans, temperatures,   #
# etc. Appends details of all failed components to      #
# STATE_STRING and returns the resulting Nagios state.  #
#########################################################
ThermalChecks() {
  local THERMAL_STATE=$STATE_OK
  local THERMAL_STATUS_TEXT=""
  # Both sub-states default to OK so the final comparison is always defined,
  # even when the agent reports a value this plugin does not recognise.
  local TEMP_STATE=$STATE_OK
  local FAN_STATE=$STATE_OK
  local GLOBAL_THERMAL_STATUS GLOBAL_TEMP_STATUS GLOBAL_FAN_STATUS
  local SENSORS_DEGRADED CURRENT_SENSOR SENSOR_TYPE SENSOR_TEMP SENSOR_THRESHOLD SENSOR_DESCR
  local FANS_AFFECTED CURRENT_FAN FAN_LOCATION FAN_DESCR FAN_STATUS FAN_STATUS_TEXT
  local FAN_REDUNDANCY FAN_REDUNDANCY_TEXT IS_FAN_HOT_SWAPPABLE FAN_SWAPPABLE_TEXT

  # check fans / temps
  GLOBAL_THERMAL_STATUS=$(walk_snmp "$OID_cpqHeThermalCondition" true)
  check_snmp_error "$?" "$GLOBAL_THERMAL_STATUS"

  case "$GLOBAL_THERMAL_STATUS" in
    1)
      THERMAL_STATUS_TEXT="UNKNOWN: Checking thermal conditions for $HOST_NAME not supported! Make sure SNMP is properly configured and all Insight Management Agents are installed."
      THERMAL_STATE=$STATE_UNK
      ;;
    2)
      # Thermal status is okay - we only report what is degraded/failed
      THERMAL_STATE=$STATE_OK
      ;;
    3|4)
      # Degraded/failed - check individual components: temperatures AND fans

      # ----------------------------------------
      # ----- Check our temperatures first -----
      # ----------------------------------------
      GLOBAL_TEMP_STATUS=$(walk_snmp "$OID_cpqHeThermalTempStatus" true)
      check_snmp_error "$?" "$GLOBAL_TEMP_STATUS"

      case "$GLOBAL_TEMP_STATUS" in
        1)
          # "unknown" - should be implicit if GLOBAL_THERMAL_STATUS is 1
          TEMP_STATE=$STATE_UNK
          ;;
        2)
          # okay, so we don't need to check anything
          TEMP_STATE=$STATE_OK
          ;;
        3)
          THERMAL_STATUS_TEXT="CRITICAL:"
          # Keep only sensors in degraded state and grab their sensor index
          SENSORS_DEGRADED=$(walk_snmp "$OID_cpqHeTemperatureCondition" false)
          check_snmp_error "$?" "$SENSORS_DEGRADED"
          SENSORS_DEGRADED=$(_failed_indexes "$SENSORS_DEGRADED" 10 "INTEGER: 3")

          # Intentionally unquoted: one iteration per numeric sensor index.
          for CURRENT_SENSOR in $SENSORS_DEGRADED; do
            SENSOR_TYPE=$(walk_snmp "${OID_cpqHeTemperatureLocale}.0.${CURRENT_SENSOR}" true)
            check_snmp_error "$?" "$SENSOR_TYPE"
            SENSOR_TEMP=$(walk_snmp "${OID_cpqHeTemperatureCelsius}.0.${CURRENT_SENSOR}" true)
            check_snmp_error "$?" "$SENSOR_TEMP"
            SENSOR_THRESHOLD=$(walk_snmp "${OID_cpqHeTemperatureThreshold}.0.${CURRENT_SENSOR}" true)
            check_snmp_error "$?" "$SENSOR_THRESHOLD"
            SENSOR_DESCR=$(_locale_name "$SENSOR_TYPE")

            THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT $SENSOR_DESCR temperature is $SENSOR_TEMP degrees C. System will shut down at $SENSOR_THRESHOLD degrees C!"
          done
          TEMP_STATE=$STATE_CRIT
          ;;
        4)
          # "failed" normally shuts the system down, so no per-sensor detail
          # is gathered; still record it so the overall state is critical.
          THERMAL_STATUS_TEXT="CRITICAL: Temperature status has failed!"
          TEMP_STATE=$STATE_CRIT
          ;;
      esac

      # -------------------------------
      # ----- Check our fans next -----
      # -------------------------------
      GLOBAL_FAN_STATUS=$(walk_snmp "$OID_cpqHeThermalSystemFanStatus" true)
      check_snmp_error "$?" "$GLOBAL_FAN_STATUS"

      case "$GLOBAL_FAN_STATUS" in
        1)
          # Implicit by the global thermal status - we don't need details
          FAN_STATE=$STATE_UNK
          ;;
        2)
          # Implicit by the global thermal status - we don't need details
          FAN_STATE=$STATE_OK
          ;;
        3|4)
          # The prefix is appended (not assigned) so temperature details
          # gathered above are not lost.
          if [[ "$GLOBAL_FAN_STATUS" == "3" ]]; then
            THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT WARNING:"
            FAN_STATE=$STATE_WARN
          else
            # Theoretically never seen - the system will shut down UNLESS
            # cpqHeThermalDegradedAction (1.3.6.1.4.1.232.6.2.6.2) = 2
            THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT CRITICAL:"
            FAN_STATE=$STATE_CRIT
          fi

          # Keep only fans in degraded/failed state
          FANS_AFFECTED=$(walk_snmp "$OID_cpqHeFltTolFanCondition" false)
          check_snmp_error "$?" "$FANS_AFFECTED"
          FANS_AFFECTED=$(_failed_indexes "$FANS_AFFECTED" 10 "INTEGER: 3|INTEGER: 4")

          # Intentionally unquoted: one iteration per numeric fan index.
          for CURRENT_FAN in $FANS_AFFECTED; do
            # Location code and its descriptive text
            FAN_LOCATION=$(walk_snmp "${OID_cpqHeFltTolFanLocale}.0.${CURRENT_FAN}" true)
            check_snmp_error "$?" "$FAN_LOCATION"
            FAN_DESCR=$(_locale_name "$FAN_LOCATION")

            # Differentiate degraded from failed
            FAN_STATUS=$(walk_snmp "${OID_cpqHeFltTolFanCondition}.0.${CURRENT_FAN}" true)
            check_snmp_error "$?" "$FAN_STATUS"
            if [[ "$FAN_STATUS" == "3" ]]; then
              FAN_STATUS_TEXT="is degraded"
            else
              FAN_STATUS_TEXT="has failed"
            fi

            # The MIBs state degraded means redundant and failed means not,
            # but there is no reason to NOT double-check that information.
            FAN_REDUNDANCY=$(walk_snmp "${OID_cpqHeFltTolFanRedundant}.0.${CURRENT_FAN}" true)
            check_snmp_error "$?" "$FAN_REDUNDANCY"
            case "$FAN_REDUNDANCY" in
              2) FAN_REDUNDANCY_TEXT="NOT redundant" ;;
              3) FAN_REDUNDANCY_TEXT="redundant" ;;
              *) FAN_REDUNDANCY_TEXT="of unknown redundancy" ;;
            esac

            # Tell the reader whether the system must be taken down
            IS_FAN_HOT_SWAPPABLE=$(walk_snmp "${OID_cpqHeFltTolFanHotPlug}.0.${CURRENT_FAN}" true)
            check_snmp_error "$?" "$IS_FAN_HOT_SWAPPABLE"
            case "$IS_FAN_HOT_SWAPPABLE" in
              2) FAN_SWAPPABLE_TEXT="NOT hot-swappable" ;;
              3) FAN_SWAPPABLE_TEXT="hot-swappable" ;;
              *) FAN_SWAPPABLE_TEXT="of unknown hot-swap capability" ;;
            esac

            THERMAL_STATUS_TEXT="$THERMAL_STATUS_TEXT System Fan # $CURRENT_FAN ($FAN_DESCR) $FAN_STATUS_TEXT (fan is $FAN_REDUNDANCY_TEXT, $FAN_SWAPPABLE_TEXT)."
          done
          ;;
      esac

      # If either component was critical, we want Nagios to return critical;
      # otherwise nothing was worse than warning.
      if (( FAN_STATE == STATE_CRIT || TEMP_STATE == STATE_CRIT )); then
        THERMAL_STATE=$STATE_CRIT
      else
        THERMAL_STATE=$STATE_WARN
      fi
      ;;
    *)
      THERMAL_STATUS_TEXT="UNKNOWN: Unexpected thermal condition '$GLOBAL_THERMAL_STATUS' reported by $HOST_NAME."
      THERMAL_STATE=$STATE_UNK
      ;;
  esac

  # NOTE: CPU fan status is not checked; the agent for it appears uncommon.

  STATE_STRING="$STATE_STRING $THERMAL_STATUS_TEXT"
  return "$THERMAL_STATE"
}

#########################################################
##            PowerSupplyCheck function                ##
#########################################################
# Checks all power supplies. Appends the status of all  #
# failed components to STATE_STRING and returns the     #
# resulting Nagios state.                               #
#########################################################
PowerSupplyCheck() {
  local PSU_STATE=$STATE_OK
  local GLOBAL_PSU_STATUS PSUS_AFFECTED CURRENT_PSU
  local PSU_ERROR_CODE PSU_ERROR_DESCR PSU_STATUS PSU_STATUS_TEXT
  local PSU_REDUNDANCY PSU_REDUNDANCY_TEXT

  GLOBAL_PSU_STATUS=$(walk_snmp "$OID_cpqHeFltTolPwrSupplyCondition" true)
  check_snmp_error "$?" "$GLOBAL_PSU_STATUS"

  case "$GLOBAL_PSU_STATUS" in
    1)
      # Appended so text gathered by earlier checks is preserved
      STATE_STRING="$STATE_STRING UNKNOWN: Checking power supplies for $HOST_NAME not supported! Make sure SNMP is properly configured and all Insight Management Agents are installed."
      PSU_STATE=$STATE_UNK
      ;;
    2)
      # Power supplies are okay - we only report what is degraded/failed
      PSU_STATE=$STATE_OK
      ;;
    3|4)
      if [[ "$GLOBAL_PSU_STATUS" == "3" ]]; then
        STATE_STRING="$STATE_STRING WARNING:"
        PSU_STATE=$STATE_WARN
      else
        STATE_STRING="$STATE_STRING CRITICAL:"
        PSU_STATE=$STATE_CRIT
      fi

      # Keep only PSUs in degraded/failed state
      PSUS_AFFECTED=$(walk_snmp "$OID_cpqHeFltTolPowerSupplyCondition" false)
      check_snmp_error "$?" "$PSUS_AFFECTED"
      PSUS_AFFECTED=$(_failed_indexes "$PSUS_AFFECTED" 10 "INTEGER: 3|INTEGER: 4")

      # Intentionally unquoted: one iteration per numeric PSU index.
      for CURRENT_PSU in $PSUS_AFFECTED; do
        # Error code for the current power supply
        PSU_ERROR_CODE=$(walk_snmp "${OID_cpqHeFltTolPowerSupplyStatus}.0.${CURRENT_PSU}" true)
        check_snmp_error "$?" "$PSU_ERROR_CODE"
        case "$PSU_ERROR_CODE" in
          1)  PSU_ERROR_DESCR="No Error" ;;
          2)  PSU_ERROR_DESCR="2 (generalFailure)" ;;
          3)  PSU_ERROR_DESCR="3 (bistFailure)" ;;
          4)  PSU_ERROR_DESCR="4 (fanFailure)" ;;
          5)  PSU_ERROR_DESCR="5 (tempFailure)" ;;
          6)  PSU_ERROR_DESCR="6 (interlockOpen)" ;;
          7)  PSU_ERROR_DESCR="7 (epromFailed)" ;;
          8)  PSU_ERROR_DESCR="8 (vrefFailed)" ;;
          9)  PSU_ERROR_DESCR="9 (dacFailed)" ;;
          10) PSU_ERROR_DESCR="10 (ramTestFailed)" ;;
          11) PSU_ERROR_DESCR="11 (voltageChannelFailed)" ;;
          12) PSU_ERROR_DESCR="12 (orringdiodeFailed)" ;;
          13) PSU_ERROR_DESCR="13 (brownOut)" ;;
          14) PSU_ERROR_DESCR="14 (giveupOnStartup)" ;;
          15) PSU_ERROR_DESCR="15 (nvramInvalid)" ;;
          16) PSU_ERROR_DESCR="16 (calibrationTableInvalid)" ;;
          *)  PSU_ERROR_DESCR="$PSU_ERROR_CODE (unknown)" ;;
        esac

        # Report whether the status was failed/degraded
        PSU_STATUS=$(walk_snmp "${OID_cpqHeFltTolPowerSupplyCondition}.0.${CURRENT_PSU}" true)
        check_snmp_error "$?" "$PSU_STATUS"
        if [[ "$PSU_STATUS" == "3" ]]; then
          PSU_STATUS_TEXT="is degraded"
        else
          PSU_STATUS_TEXT="has failed"
        fi

        # Determine redundant status of current psu
        PSU_REDUNDANCY=$(walk_snmp "${OID_cpqHeFltTolPowerSupplyRedundant}.0.${CURRENT_PSU}" true)
        check_snmp_error "$?" "$PSU_REDUNDANCY"
        case "$PSU_REDUNDANCY" in
          2) PSU_REDUNDANCY_TEXT="Non-redundant" ;;
          3) PSU_REDUNDANCY_TEXT="Redundant" ;;
          *) PSU_REDUNDANCY_TEXT="" ;;
        esac

        # Make sure we list all PSUs affected
        STATE_STRING="$STATE_STRING $PSU_REDUNDANCY_TEXT Power Supply # $CURRENT_PSU $PSU_STATUS_TEXT with error $PSU_ERROR_DESCR."
      done
      ;;
    *)
      STATE_STRING="$STATE_STRING UNKNOWN: Unexpected power supply condition '$GLOBAL_PSU_STATUS' reported by $HOST_NAME."
      PSU_STATE=$STATE_UNK
      ;;
  esac

  return "$PSU_STATE"
}

#########################################################
##              MemoryCheck function                   ##
#########################################################
# Checks the correctable memory error log. Warns at 80% #
# of the agent-supplied error threshold and goes        #
# critical at the threshold itself.                     #
#########################################################
MemoryCheck() {
  local MEMORY_STATE=$STATE_OK
  local MEMORY_STATUS NUM_MEM_ERRORS MEM_CRITICAL MEM_WARNING

  MEMORY_STATUS=$(walk_snmp "${OID_cpqHeCorrMemLogCondition}.0" true)
  check_snmp_error "$?" "$MEMORY_STATUS"

  case "$MEMORY_STATUS" in
    1)
      MEMORY_STATE=$STATE_UNK
      STATE_STRING="$STATE_STRING UNKNOWN: Condition of correctable memory is undetermined for this machine."
      ;;
    3|4)
      # How many memory errors have been logged
      NUM_MEM_ERRORS=$(walk_snmp "$OID_cpqHeCorrMemTotalErrs" true)
      check_snmp_error "$?" "$NUM_MEM_ERRORS"
      # How many errors the HP Agents consider critical
      MEM_CRITICAL=$(walk_snmp "$OID_cpqHeCorrMemErrorCntThresh" true)
      check_snmp_error "$?" "$MEM_CRITICAL"

      if ! _is_uint "$NUM_MEM_ERRORS" || ! _is_uint "$MEM_CRITICAL"; then
        # The agent answered with something other than a counter
        STATE_STRING="$STATE_STRING UNKNOWN: Could not read the correctable memory error counters from $HOST_NAME."
        MEMORY_STATE=$STATE_UNK
      else
        # Warning at 80% of the critical value, truncated to an integer
        MEM_WARNING=$(( MEM_CRITICAL * 8 / 10 ))
        if (( NUM_MEM_ERRORS >= MEM_CRITICAL )); then
          STATE_STRING="$STATE_STRING CRITICAL: $NUM_MEM_ERRORS memory errors occured! Please replace the faulty memory!"
          MEMORY_STATE=$STATE_CRIT
        elif (( NUM_MEM_ERRORS >= MEM_WARNING )); then
          STATE_STRING="$STATE_STRING WARNING: $NUM_MEM_ERRORS memory errors occured! Memory will need to be replaced soon. (CRITICAL = $MEM_CRITICAL)"
          MEMORY_STATE=$STATE_WARN
        fi
      fi
      ;;
  esac

  return "$MEMORY_STATE"
}

#########################################################
##              DriveChecks function                   ##
#########################################################
# Checks all hard drives AND drive controllers.         #
# Appends details on any failed components to           #
# STATE_STRING and returns the worst state found        #
# (OK when no controller is degraded or failed).        #
#########################################################
DriveChecks() {
  local DRIVE_STATE=$STATE_OK
  local DRIVE_STATUS_TEXT=""
  local CONTROLLERS_AFFECTED CURRENT_CONTROLLER CONTROLLER_LOCATION CONTROLLER_STATUS
  local CONTROLLER_TEXT CONTROLLER_STATE ANY_FAILED ANY_DEGRADED
  local LOGICAL_DRIVES_AFFECTED CURRENT_LOGICAL_DRIVE LOGICAL_DRIVE_TEXT
  local LOGICAL_DRIVE_ERROR LOGICAL_DRIVE_ERROR_DESCR LOGICAL_DRIVE_STATUS LOGICAL_DRIVE_STATUS_TEXT
  local PHYSICAL_DRIVES_AFFECTED CURRENT_PHYSICAL_DRIVE PHYSICAL_DRIVE_STATUS PHYSICAL_DRIVE_BAY
  local PHYSICAL_DRIVE_ERROR PHYSICAL_DRIVE_ERROR_DESCR PHYSICAL_DRIVE_STATUS_TEXT

  CONTROLLERS_AFFECTED=$(walk_snmp "$OID_cpqDaCntlrCondition" false)
  check_snmp_error "$?" "$CONTROLLERS_AFFECTED"
  CONTROLLERS_AFFECTED=$(_failed_indexes "$CONTROLLERS_AFFECTED" 9 "INTEGER: 3|INTEGER: 4")

  # Intentionally unquoted: one iteration per numeric controller index.
  for CURRENT_CONTROLLER in $CONTROLLERS_AFFECTED; do
    CONTROLLER_TEXT=""
    CONTROLLER_STATE=$STATE_OK

    CONTROLLER_LOCATION=$(walk_snmp "${OID_cpqDaCntlrHwLocation}.${CURRENT_CONTROLLER}" true)
    check_snmp_error "$?" "$CONTROLLER_LOCATION"
    CONTROLLER_STATUS=$(walk_snmp "${OID_cpqDaCntlrCondition}.${CURRENT_CONTROLLER}" true)
    check_snmp_error "$?" "$CONTROLLER_STATUS"

    case "$CONTROLLER_STATUS" in
      1)
        # Unknown
        CONTROLLER_TEXT="UNKNOWN: Checking drive conditions is not supported for $HOST_NAME! Make sure SNMP is properly configured and that all Insight Management Agents are installed."
        CONTROLLER_STATE=$STATE_UNK
        ;;
      2)
        # Everything is okay, no need to report anything
        CONTROLLER_STATE=$STATE_OK
        ;;
      3)
        # One or more components may be failed, but not all. The array
        # controller is still usable but degraded.
        CONTROLLER_TEXT="Array Controller at $CONTROLLER_LOCATION is degraded. REASON - "
        # Track the worst condition over ALL drives of this controller
        # rather than only the last one examined.
        ANY_FAILED=0
        ANY_DEGRADED=0

        #------------------------------------
        #----- Check out logical drives -----
        #------------------------------------
        # Walk only this controller's rows so indexes belong to it
        LOGICAL_DRIVES_AFFECTED=$(walk_snmp "${OID_cpqDaLogDrvCondition}.${CURRENT_CONTROLLER}" false)
        check_snmp_error "$?" "$LOGICAL_DRIVES_AFFECTED"
        LOGICAL_DRIVES_AFFECTED=$(_failed_indexes "$LOGICAL_DRIVES_AFFECTED" 10 "INTEGER: 3|INTEGER: 4")

        for CURRENT_LOGICAL_DRIVE in $LOGICAL_DRIVES_AFFECTED; do
          # Determine some basic identities/properties of this drive
          LOGICAL_DRIVE_TEXT=$(walk_snmp "${OID_cpqDaLogDrvOsName}.${CURRENT_CONTROLLER}.${CURRENT_LOGICAL_DRIVE}" true)
          check_snmp_error "$?" "$LOGICAL_DRIVE_TEXT"
          LOGICAL_DRIVE_ERROR=$(walk_snmp "${OID_cpqDaLogDrvStatus}.${CURRENT_CONTROLLER}.${CURRENT_LOGICAL_DRIVE}" true)
          check_snmp_error "$?" "$LOGICAL_DRIVE_ERROR"
          case "$LOGICAL_DRIVE_ERROR" in
            1)  LOGICAL_DRIVE_ERROR_DESCR="1 (other)" ;;
            2)  LOGICAL_DRIVE_ERROR_DESCR="2 (ok)" ;;
            3)  LOGICAL_DRIVE_ERROR_DESCR="3 (failed)" ;;
            4)  LOGICAL_DRIVE_ERROR_DESCR="4 (unconfigured)" ;;
            5)  LOGICAL_DRIVE_ERROR_DESCR="5 (recovering)" ;;
            6)  LOGICAL_DRIVE_ERROR_DESCR="6 (readyForRebuild)" ;;
            7)  LOGICAL_DRIVE_ERROR_DESCR="7 (rebuilding)" ;;
            8)  LOGICAL_DRIVE_ERROR_DESCR="8 (wrongDrive)" ;;
            9)  LOGICAL_DRIVE_ERROR_DESCR="9 (badConnect)" ;;
            10) LOGICAL_DRIVE_ERROR_DESCR="10 (overheating)" ;;
            11) LOGICAL_DRIVE_ERROR_DESCR="11 (shutdown)" ;;
            12) LOGICAL_DRIVE_ERROR_DESCR="12 (expanding)" ;;
            13) LOGICAL_DRIVE_ERROR_DESCR="13 (notAvailable)" ;;
            14) LOGICAL_DRIVE_ERROR_DESCR="14 (queuedForExpansion)" ;;
            *)  LOGICAL_DRIVE_ERROR_DESCR="$LOGICAL_DRIVE_ERROR (unknown)" ;;
          esac

          LOGICAL_DRIVE_STATUS=$(walk_snmp "${OID_cpqDaLogDrvCondition}.${CURRENT_CONTROLLER}.${CURRENT_LOGICAL_DRIVE}" true)
          check_snmp_error "$?" "$LOGICAL_DRIVE_STATUS"
          # Report whether the status was failed/degraded
          if [[ "$LOGICAL_DRIVE_STATUS" == "3" ]]; then
            LOGICAL_DRIVE_STATUS_TEXT="is degraded"
            ANY_DEGRADED=1
          else
            LOGICAL_DRIVE_STATUS_TEXT="has failed"
            ANY_FAILED=1
          fi

          CONTROLLER_TEXT="$CONTROLLER_TEXT Logical Drive $LOGICAL_DRIVE_TEXT $LOGICAL_DRIVE_STATUS_TEXT with error $LOGICAL_DRIVE_ERROR_DESCR."
        done

        #-------------------------------------
        #----- Check out physical drives -----
        #-------------------------------------
        PHYSICAL_DRIVES_AFFECTED=$(walk_snmp "${OID_cpqDaPhyDrvCondition}.${CURRENT_CONTROLLER}" false)
        check_snmp_error "$?" "$PHYSICAL_DRIVES_AFFECTED"
        PHYSICAL_DRIVES_AFFECTED=$(_failed_indexes "$PHYSICAL_DRIVES_AFFECTED" 10 "INTEGER: 3|INTEGER: 4")

        for CURRENT_PHYSICAL_DRIVE in $PHYSICAL_DRIVES_AFFECTED; do
          # Determine whether or not this drive is degraded/failed
          PHYSICAL_DRIVE_STATUS=$(walk_snmp "${OID_cpqDaPhyDrvCondition}.${CURRENT_CONTROLLER}.${CURRENT_PHYSICAL_DRIVE}" true)
          check_snmp_error "$?" "$PHYSICAL_DRIVE_STATUS"
          # Determine some basic identities/properties of this drive
          PHYSICAL_DRIVE_BAY=$(walk_snmp "${OID_cpqDaPhyDrvLocationString}.${CURRENT_CONTROLLER}.${CURRENT_PHYSICAL_DRIVE}" true)
          check_snmp_error "$?" "$PHYSICAL_DRIVE_BAY"
          PHYSICAL_DRIVE_ERROR=$(walk_snmp "${OID_cpqDaPhyDrvStatus}.${CURRENT_CONTROLLER}.${CURRENT_PHYSICAL_DRIVE}" true)
          check_snmp_error "$?" "$PHYSICAL_DRIVE_ERROR"
          case "$PHYSICAL_DRIVE_ERROR" in
            1) PHYSICAL_DRIVE_ERROR_DESCR="1 (other)" ;;
            2) PHYSICAL_DRIVE_ERROR_DESCR="2 (ok)" ;;
            3) PHYSICAL_DRIVE_ERROR_DESCR="3 (failed)" ;;
            4) PHYSICAL_DRIVE_ERROR_DESCR="4 (predictiveFailure)" ;;
            *) PHYSICAL_DRIVE_ERROR_DESCR="$PHYSICAL_DRIVE_ERROR (unknown)" ;;
          esac

          if [[ "$PHYSICAL_DRIVE_STATUS" == "3" ]]; then
            PHYSICAL_DRIVE_STATUS_TEXT="degraded"
            ANY_DEGRADED=1
          else
            PHYSICAL_DRIVE_STATUS_TEXT="failed"
            ANY_FAILED=1
          fi

          CONTROLLER_TEXT="$CONTROLLER_TEXT $PHYSICAL_DRIVE_BAY is $PHYSICAL_DRIVE_STATUS_TEXT with error $PHYSICAL_DRIVE_ERROR_DESCR."
        done

        # Systems Management derives controller condition from logical
        # drives, and logical drive condition from physical drives, so a
        # failed physical drive only degrades the controller. We want Nagios
        # to alert on that, so report the worst condition of any
        # subcomponent instead of assuming a warning.
        if (( ANY_FAILED )); then
          CONTROLLER_TEXT="CRITICAL: $CONTROLLER_TEXT"
          CONTROLLER_STATE=$STATE_CRIT
        else
          if (( ANY_DEGRADED )); then
            # We know all affected components are at warning
            CONTROLLER_TEXT="WARNING: $CONTROLLER_TEXT"
          else
            CONTROLLER_TEXT="WARNING: There appears to be a problem on the controller at $CONTROLLER_LOCATION, but no drives are effected. Possible causes are outdated firmware, a tape drive needs cleaned, etc. Check Systems Management homepage for more details.\n"
          fi
          CONTROLLER_STATE=$STATE_WARN
        fi
        ;;
      4)
        # "failed" means all individual components have failed - if any one
        # component is only degraded the agent reports degraded instead
        CONTROLLER_TEXT="CRITICAL: Array Controller at $CONTROLLER_LOCATION has failed!"
        CONTROLLER_STATE=$STATE_CRIT
        ;;
    esac

    # Accumulate over controllers instead of keeping only the last one
    if [[ -n "$CONTROLLER_TEXT" ]]; then
      DRIVE_STATUS_TEXT="${DRIVE_STATUS_TEXT:+$DRIVE_STATUS_TEXT }$CONTROLLER_TEXT"
    fi
    if (( CONTROLLER_STATE > DRIVE_STATE )); then
      DRIVE_STATE=$CONTROLLER_STATE
    fi
  done

  STATE_STRING="$STATE_STRING $DRIVE_STATUS_TEXT"
  return "$DRIVE_STATE"
}

#########################################################
#              Subroutine: walk_snmp                    #
#########################################################
# Runs snmpwalk against HOST_NAME and prints the result.
#   $1 - OID (or OID prefix) to walk
#   $2 - "true" to print bare values only (-Oavq, leading whitespace and
#        double quotes stripped); anything else prints full "OID = value"
#        rows (-Oa)
# On an snmpwalk failure an error message plus the tool's output is printed
# and the function exits with STATE_UNK. It is meant to be called inside a
# command substitution, so that exit only ends the subshell; the caller then
# passes "$?" and the captured text to check_snmp_error.
walk_snmp() {
  local oid=$1
  local values_only=$2
  local output_options result_text result_code

  if [[ "$values_only" == "true" ]]; then
    output_options="-Oavq"
  else
    output_options="-Oa"
  fi

  _snmp_session_args
  result_text=$(snmpwalk "${SNMP_SESSION_ARGS[@]}" "$output_options" "$HOST_NAME" "$oid")
  result_code=$?

  if (( result_code != 0 )); then
    echo "Plugin $APPNAME failure - snmpwalk command error."
    echo "$result_text"
    exit "$STATE_UNK"
  fi

  if [[ "$values_only" == "true" ]]; then
    printf '%s\n' "$result_text" | sed -e "s/^[[:space:]]*//" | tr -d "\""
  else
    printf '%s\n' "$result_text"
  fi
}

#########################################################
#               Subroutine: get_snmp                    #
#########################################################
# Runs snmpget for the single OID $1 against HOST_NAME and prints the bare
# value with double quotes removed. Failure handling mirrors walk_snmp.
get_snmp() {
  local oid=$1
  local result_text result_code

  _snmp_session_args
  result_text=$(snmpget "${SNMP_SESSION_ARGS[@]}" -Oavq "$HOST_NAME" "$oid")
  result_code=$?

  if (( result_code != 0 )); then
    echo "Plugin $APPNAME failure - snmpget command error."
    printf '%s\n' "$result_text" | tr -d "\""
    exit "$STATE_UNK"
  fi

  printf '%s\n' "$result_text" | tr -d "\""
}

#########################################################
#            Subroutine: check_snmp_error               #
#########################################################
# Tests errors returned by function operations.         #
#   $1 - exit status of the preceding SNMP helper       #
#   $2 - text captured from that helper                 #
# On a non-zero status the text is printed on a single  #
# line and the plugin exits with STATE_UNK.             #
#########################################################
check_snmp_error() {
  if [[ "$1" -ne 0 ]]; then
    # Fold newlines so the whole message stays on one status line
    printf '%s\n' "${2//$'\n'/ }"
    exit "$STATE_UNK"
  fi
}

#########################################################
##                    MAIN CODE                        ##
#########################################################
# Parses the command line, queries the global health condition and drills
# down into component checks only when something is reported wrong.
main() {
  local binaries="snmpwalk awk grep sed tr"
  local required_binary
  local COMPONENT_SYSTEM_STATUS DRIVE_STATUS
  local SERVER_SERIAL_NUMBER SERVER_MODEL_NUMBER

  # Check that all required binaries for the script are available
  # EXIT with an UNKNOWN status if not
  for required_binary in $binaries; do
    if ! command -v "$required_binary" > /dev/null 2>&1; then
      echo "UNKNOWN: $APPNAME: No usable $required_binary binary in $PATH"
      exit "$STATE_UNK"
    fi
  done

  # Check to see if any parameters were passed
  if [[ $# -eq 0 ]]; then
    print_usage
    exit "$STATE_UNK"
  fi

  # Parse our options as passed. Both the spellings documented in the help
  # text and the historical (misspelled) long options are accepted.
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --host|-H)
        _require_value "$@"
        HOST_NAME=$2
        shift
        ;;
      --community|--comunity|-C)
        _require_value "$@"
        COMMUNITY=$2
        shift
        ;;
      --snmpversion|-v)
        _require_value "$@"
        SNMPVERSION=$2
        shift
        ;;
      --user|-u)
        _require_value "$@"
        SNMPUSER=$2
        shift
        ;;
      --privlevel|--privilegelevel|--privelegelevel|-l)
        _require_value "$@"
        PRIVILEGELEVEL=$2
        shift
        ;;
      --authprotocol|-a)
        _require_value "$@"
        AUTHPROTOCOL=$2
        shift
        ;;
      --authpassword|-A)
        _require_value "$@"
        AUTHPASSWORD=$2
        shift
        ;;
      --privprotocol|--privacyprotocol|-x)
        _require_value "$@"
        PRIVACYPROTOCOL=$2
        shift
        ;;
      --privpassword|--privacypassword|-X)
        _require_value "$@"
        PRIVACYPASSWORD=$2
        shift
        ;;
      --help|-h)
        print_help
        ;;
      --version|-V)
        print_version
        exit "$STATE"
        ;;
      *)
        echo "Unknown argument: $1"
        print_usage
        exit "$STATE_UNK"
        ;;
    esac
    shift
  done

  # Determine health of most major components (hard drives are under a
  # different MIB tree and are not included). check_snmp_error exits with
  # UNKNOWN when the host does not answer, so no further test is needed.
  COMPONENT_SYSTEM_STATUS=$(walk_snmp "$OID_cpqHeMibCondition" true)
  check_snmp_error "$?" "$COMPONENT_SYSTEM_STATUS"

  case "$COMPONENT_SYSTEM_STATUS" in
    1)
      # The system agents aren't working or SNMP could be broken - no sense
      # going further!
      STATE_STRING="UNKNOWN: Check your SNMP configuration and be sure all Insight Agents are installed."
      STATE=$STATE_UNK
      ;;
    2)
      # Regular components are fine; we don't know about drive status though
      DriveChecks
      DRIVE_STATUS=$?
      if (( DRIVE_STATUS > 0 )); then
        # Other components are okay, drives are not - report their condition
        STATE=$DRIVE_STATUS
      else
        # Everything is fine - let's output some info. on the server
        SERVER_SERIAL_NUMBER=$(walk_snmp "$OID_cpqSiSysServiceNum" true)
        check_snmp_error "$?" "$SERVER_SERIAL_NUMBER"
        SERVER_MODEL_NUMBER=$(walk_snmp "$OID_cpqSiProductName" true)
        check_snmp_error "$?" "$SERVER_MODEL_NUMBER"
        STATE_STRING="OK: All system components are normal. Server is a $SERVER_MODEL_NUMBER with serial #$SERVER_SERIAL_NUMBER."
        STATE=$STATE_OK
      fi
      ;;
    3|4)
      # 3 is degraded status, 4 is failed - investigate for more details.
      # Run every component check and report only warnings/failures, which
      # builds a cumulative output when more than 1 component has failed.
      ThermalChecks
      PowerSupplyCheck
      MemoryCheck
      DriveChecks
      DRIVE_STATUS=$?

      if [[ "$COMPONENT_SYSTEM_STATUS" == "3" ]]; then
        STATE=$STATE_WARN
      else
        STATE=$STATE_CRIT
      fi
      # A failed drive is treated as critical even when the global
      # condition is only degraded, matching the healthy-system branch.
      if (( DRIVE_STATUS == STATE_CRIT )); then
        STATE=$STATE_CRIT
      fi
      ;;
    *)
      STATE_STRING="UNKNOWN: Unexpected health condition '$COMPONENT_SYSTEM_STATUS' reported by $HOST_NAME. Check your SNMP configuration and be sure all Insight Agents are installed."
      STATE=$STATE_UNK
      ;;
  esac

  # The checks join their findings with spaces; trim the outer padding.
  STATE_STRING="${STATE_STRING#"${STATE_STRING%%[![:space:]]*}"}"
  STATE_STRING="${STATE_STRING%"${STATE_STRING##*[![:space:]]}"}"

  # Never hand Nagios an empty status line for a non-OK result.
  if [[ -z "$STATE_STRING" ]]; then
    case "$STATE" in
      "$STATE_WARN") STATE_STRING="WARNING: System health is degraded, but no failed component could be identified. Check Systems Management homepage for more details." ;;
      "$STATE_CRIT") STATE_STRING="CRITICAL: System health is failed, but no failed component could be identified. Check Systems Management homepage for more details." ;;
    esac
  fi

  echo "$STATE_STRING"
  exit "$STATE"
}

main "$@"