#!/bin/bash ######################################################################################### # Author : Patricio M Dorantes Jamarne # # Script name : check_db2 # # Funcionality : This script checks BP,TS and DB log utilization # # on all the instances and databases avilable on the # # db2 server # # # # Version : 1.0 # # Date : 2013-06-25 # # # # Usage : ./check_db2 # # check_db2 assumes root permissions which are acomplished by # # giving +s permissions to the suid wrapper db2.check and read # # permissions to the group of a nagios user. # # # # This scripts checks ALL DB2 instances and its databases. If you # # like to override this create a file under: # # /var/cache/check_db2/db2-ignores.cfg # # with the variable MON_IGNORE=" ." # # so this script will ignore all the instances that are writed like # # and all the dbs that are writed like . # # # # Please note that this script will take a long run on the first time # # in order to write /var/cache/check_db2/db2-envs.cache # # # # # # # # Notes and : I'm no expert at db2, I'm just a IBM WebSphere Portal # # special thanks administrator that got a lot of trouble from the database engine. # # # # This script is helping me a lot to know db2 internals and to get # # some health checks from them. # # # # I got inspired by the work of Felipe Alkain de Souza who # # created some really cool scripts. THANKS A LOT! # # # # 2Do : Ok, I think this script is not the best implementation of nothing # # surely this code needs a lot of improvement (like the way to handle # # how to store the return of each job) please, if you find useful # # this script and you code something SHARE, I'm sharing you this in # # the hope of being useful, learn more and get better ways to do # # things. Thank you in advance # # # ######################################################################################### DB2_INSTALL_ROOT="/opt/IBM/db2/V10.1/" MON_DB2_INSTANCES=0 MON_DB2_DATABASES=0 MON_MON_NAGIOS_RET_CODE=0 MON_NAGIOS_OK=0 MON_NAGIOS_CRIT=1 MON_NAGIOS_WARN=2 MON_NAGIOS_UNKN=3 function get_instance_db_mon_data { . /var/cache/check_db2/db2-envs.cache INSTANCE=$1 DB=$2 tmp_mon_cfg=$(eval "echo \"`echo $\{$INSTANCE[$DB]\}`\""|base64 -d |bunzip2) # 2Do: Detect AUTOMATIC VALUES and set no boundaries on calcs MON_DB_LOG_PRI=`echo "$tmp_mon_cfg" |awk -F'=' '/LOGPRIMARY/{print $2}'` MON_DB_LOG_SEC=`echo "$tmp_mon_cfg" |awk -F'=' '/LOGSECOND/{print $2}'` MON_DB_LOG_SIZE=`echo "$tmp_mon_cfg" |awk -F'=' '/LOGFILSIZ/{print $2}'` # LOG SIZE ON BYTES MON_DB_LOG_SIZE=`echo "$MON_DB_LOG_SIZE*4*1024"|bc -l` MON_DB_TOTAL_LOCKLIST=`echo "$tmp_mon_cfg" | sed -e "s/AUTOMATIC(//" -e "s/)//g" |awk -F'=' '/LOCKLIST/{print $2}'` MON_DB_TOTAL_LOCKLIST=`echo "$MON_DB_TOTAL_LOCKLIST*4*1024"|bc -l` MON_DB_LOCKS_WAITING_WARN=100 MON_DB_LOCKS_WAITING_CRIT=200 #MON_DB_LOG_PATH=`echo "$tmp_mon_cfg" |awk -F'=' '/Path to log files/{print $2}'` #MON_DB_LOG_USED=`ls $MON_DB_LOG_PATH |wc -l` #echo "$INSTANCE.$DB.log=($MON_DB_LOG_PRI+$MON_DB_LOG_SEC)*$MON_DB_LOG_SIZE/4*1024=`echo "($MON_DB_LOG_PRI+$MON_DB_LOG_SEC)*$MON_DB_LOG_SIZE/4*1024"|bc -l`" # printf "$INSTANCE.$DB.log=`echo \"scale=2;($MON_DB_LOG_USED/($MON_DB_LOG_PRI+($MON_DB_LOG_SEC)))*100\"| # bc -l`%%;`echo \"scale=2;($MON_DB_LOG_PRI+($MON_DB_LOG_SEC/2))/($MON_DB_LOG_PRI+$MON_DB_LOG_SEC)*100\"| # bc -l`%%;90%% $INSTANCE.$DB.logsize=`echo \"scale=2;($MON_DB_LOG_USED*${MON_DB_LOG_SIZE})\"|bc -l`MiB " # SNAPSHOT_DATABASE data : #SNAPSHOT_TIMESTAMP,SEC_LOG_USED_TOP,TOT_LOG_USED_TOP,TOTAL_LOG_USED,TOTAL_LOG_AVAILABLE,ROWS_READ,POOL_DATA_L_READS,POOL_DATA_P_READS,POOL_DATA_WRITES,POOL_INDEX_L_READS,POOL_INDEX_P_READS,POOL_INDEX_WRITES,POOL_READ_TIME,POOL_WRITE_TIME,POOL_ASYNC_INDEX_READS,POOL_DATA_TO_ESTORE,POOL_INDEX_TO_ESTORE,POOL_INDEX_FROM_ESTORE,POOL_DATA_FROM_ESTORE,POOL_ASYNC_DATA_READS,POOL_ASYNC_DATA_WRITES,POOL_ASYNC_INDEX_WRITES,POOL_ASYNC_READ_TIME,POOL_ASYNC_WRITE_TIME,POOL_ASYNC_DATA_READ_REQS,DIRECT_READS,DIRECT_WRITES,DIRECT_READ_REQS,DIRECT_WRITE_REQS,DIRECT_READ_TIME,DIRECT_WRITE_TIME,UNREAD_PREFETCH_PAGES,FILES_CLOSED,POOL_LSN_GAP_CLNS,POOL_DRTY_PG_STEAL_CLNS,POOL_DRTY_PG_THRSH_CLNS,LOCKS_HELD,LOCK_WAITS,LOCK_WAIT_TIME,LOCK_LIST_IN_USE,DEADLOCKS,LOCK_ESCALS,X_LOCK_ESCALS,LOCKS_WAITING,SORT_HEAP_ALLOCATED,TOTAL_SORTS,TOTAL_SORT_TIME,SORT_OVERFLOWS,ACTIVE_SORTS,COMMIT_SQL_STMTS,ROLLBACK_SQL_STMTS,DYNAMIC_SQL_STMTS,STATIC_SQL_STMTS,FAILED_SQL_STMTS,SELECT_SQL_STMTS,DDL_SQL_STMTS,UID_SQL_STMTS,INT_AUTO_REBINDS,INT_ROWS_DELETED,INT_ROWS_UPDATED,INT_COMMITS,INT_ROLLBACKS,INT_DEADLOCK_ROLLBACKS,ROWS_DELETED,ROWS_INSERTED,ROWS_UPDATED,ROWS_SELECTED,BINDS_PRECOMPILES,TOTAL_CONS,APPLS_CUR_CONS,APPLS_IN_DB2,SEC_LOGS_ALLOCATED,DB_STATUS,LOCK_TIMEOUTS,CONNECTIONS_TOP,DB_HEAP_TOP,INT_ROWS_INSERTED,LOG_READS,LOG_WRITES,PKG_CACHE_LOOKUPS,PKG_CACHE_INSERTS,CAT_CACHE_LOOKUPS,CAT_CACHE_INSERTS,CAT_CACHE_OVERFLOWS,CAT_CACHE_HEAP_FULL,CATALOG_PARTITION,TOTAL_SEC_CONS,NUM_ASSOC_AGENTS,AGENTS_TOP,COORD_AGENTS_TOP,PREFETCH_WAIT_TIME,APPL_SECTION_LOOKUPS,APPL_SECTION_INSERTS,TOTAL_HASH_JOINS,TOTAL_HASH_LOOPS,HASH_JOIN_OVERFLOWS,HASH_JOIN_SMALL_OVERFLOWS,PKG_CACHE_NUM_OVERFLOWS,PKG_CACHE_SIZE_TOP,DB_CONN_TIME,SQLM_ELM_LAST_RESET,SQLM_ELM_LAST_BACKUP,APPL_CON_TIME,ELAPSED_EXEC_TIME_S,ELAPSED_EXEC_TIME_MS,DB_LOCATION,SERVER_PLATFORM,APPL_ID_OLDEST_XACT,CATALOG_PARTITION_NAME,INPUT_DB_ALIAS,DB_NAME,DB_PATH TMP_VARS_FILE=`mktemp --tmpdir=/tmp/` chown $INSTANCE $TMP_VARS_FILE timeout 5 su - ${INSTANCE} -c " db2 connect to $DB >/dev/null; #SNAPSHOT=\"\`db2 get snapshot for all on $DB\`\" #printf \"$INSTANCE.$DB.locks_number=\" #db2 -x \"select TOTAL_LOG_USED,TOTAL_LOG_AVAILABLE,SEC_LOG_USED_TOP,LOCKS_HELD,LOCK_WAITS,LOCK_WAIT_TIME,LOCK_LIST_IN_USE,DEADLOCKS,LOCK_ESCALS,X_LOCK_ESCALS,LOCKS_WAITING,SORT_HEAP_ALLOCATED,SEC_LOGS_ALLOCATED,DB_STATUS,LOCK_TIMEOUTS FROM TABLE(SNAPSHOT_DATABASE('$DB', -1))\" db2 -x \"select TOTAL_LOG_USED,TOTAL_LOG_AVAILABLE,SEC_LOG_USED_TOP,LOCKS_HELD,LOCK_WAITS,LOCK_WAIT_TIME,LOCK_LIST_IN_USE,DEADLOCKS,LOCK_ESCALS,X_LOCK_ESCALS,LOCKS_WAITING,SORT_HEAP_ALLOCATED,SEC_LOGS_ALLOCATED,DB_STATUS,LOCK_TIMEOUTS FROM TABLE(SNAPSHOT_DATABASE('$DB', -1))\" | xargs echo | awk '{ print \"\\nTOTAL_LOG_USED=\" \$1 \"\\nTOTAL_LOG_AVAILABLE=\" \$2 \"\\nSEC_LOG_USED_TOP=\" \$3 \"\\nLOCKS_HELD=\" \$4 \"\\nLOCK_WAITS=\" \$5 \"\\nLOCK_WAIT_TIME=\" \$6 \"\\nLOCK_LIST_IN_USE=\" \$7 \"\\nDEADLOCKS=\" \$8 \"\\nLOCK_ESCALS=\" \$9 \"\\nX_LOCK_ESCALS=\" \$10 \"\\nLOCKS_WAITING=\" \$11 \"\\nSORT_HEAP_ALLOCATED=\" \$12 \"\\nSEC_LOGS_ALLOCATED=\" \$13 \"\\nDB_STATUS=\" \$14 \"\\nLOCK_TIMEOUTS=\" \$15}' > $TMP_VARS_FILE db2 -x \"WITH BPMETRICS AS ( SELECT bp_name, pool_data_l_reads + pool_temp_data_l_reads + pool_index_l_reads + pool_temp_index_l_reads + pool_xda_l_reads + pool_temp_xda_l_reads as logical_reads, pool_data_p_reads + pool_temp_data_p_reads + pool_index_p_reads + pool_temp_index_p_reads + pool_xda_p_reads + pool_temp_xda_p_reads as physical_reads, member FROM TABLE(MON_GET_BUFFERPOOL('',-2)) AS METRICS) SELECT VARCHAR(bp_name,20) AS bp_name, CASE WHEN logical_reads > 0 THEN DEC((1 - (FLOAT(physical_reads) / FLOAT(logical_reads))) * 100,5,2) ELSE NULL END AS HIT_RATIO FROM BPMETRICS\" | tr '-' '0' | awk '{print \$1\"=\"\$2}' | xargs echo | xargs echo -n \"BUFFERPOOLS=\\\"\" >> $TMP_VARS_FILE echo \"\\\"\" >> $TMP_VARS_FILE db2 -x \"select TABLESPACE_NAME,(USED_PAGES),USABLE_PAGES from table (snapshot_tbs_cfg('\$1', 0))\" | awk '{print \$1\"=\"\$2/\$3*100\"%\"}'| xargs echo -n \"TABLESPACES=\\\"\" >> $TMP_VARS_FILE echo \"\\\"\" >> $TMP_VARS_FILE #db2 -x \"select SWAP_PAGES_IN,SWAP_PAGES_OUT from table(SYSPROC.ENV_GET_SYSTEM_RESOURCES())\" | xargs echo | awk '{print \"\\nSWAP_PAGES_IN=\" \$1 \"\\nSWAP_PAGES_OUT=\" \$2 }' >> $TMP_VARS_FILE # Problem to this: () creates a new shell without the db2 connect #read TOTAL_LOG_USED TOTAL_LOG_AVAILABLE,SEC_LOG_USED_TOP,LOCKS_HELD,LOCK_WAITS,LOCK_WAIT_TIME,LOCK_LIST_IN_USE,DEADLOCKS,LOCK_ESCALS,X_LOCK_ESCALS,LOCKS_WAITING,SORT_HEAP_ALLOCATED,SEC_LOGS_ALLOCATED,DB_STATUS,LOCK_TIMEOUTS< <(db2 -x \"select TOTAL_LOG_USED,TOTAL_LOG_AVAILABLE,SEC_LOG_USED_TOP,LOCKS_HELD,LOCK_WAITS,LOCK_WAIT_TIME,LOCK_LIST_IN_USE,DEADLOCKS,LOCK_ESCALS,X_LOCK_ESCALS,LOCKS_WAITING,SORT_HEAP_ALLOCATED,SEC_LOGS_ALLOCATED,DB_STATUS,LOCK_TIMEOUTS FROM TABLE(SNAPSHOT_DATABASE('$DB', -1))\" | xargs echo) #db2 -x \"select TOTAL_LOG_USED,TOTAL_LOG_AVAILABLE,SEC_LOG_USED_TOP,LOCKS_HELD,LOCK_WAITS,LOCK_WAIT_TIME,LOCK_LIST_IN_USE,DEADLOCKS,LOCK_ESCALS,X_LOCK_ESCALS,LOCKS_WAITING,SORT_HEAP_ALLOCATED,SEC_LOGS_ALLOCATED,DB_STATUS,LOCK_TIMEOUTS FROM TABLE(SNAPSHOT_DATABASE('jcrdb', -1))\" #| xargs printf #printf \" $INSTANCE.$DB.locks_wait_time=\" #printf \"\$SNAPSHOT\" | grep \"Total wait time\" | awk -F '=' '{printf \$2}' | xargs echo | tr ' ' '+' |bc -l |xargs printf #printf \"ms \" db2 terminate >/dev/null; #debug make this cmd fail #if [ \"$INSTANCE.$DB\" == \"db2frm1.JCRDB\" ]; then # sleep 15 #fi " 2>/dev/null MON_SU_DB2_CONNECT_RET_CODE=$? if [ $MON_SU_DB2_CONNECT_RET_CODE -eq 0 ]; then . $TMP_VARS_FILE #cat $TMP_VARS_FILE echo -n "$INSTANCE.$DB.total_log_used=`echo "scale=2;$TOTAL_LOG_USED/(($MON_DB_LOG_PRI+$MON_DB_LOG_SEC)*$MON_DB_LOG_SIZE)*100"|bc -l`%;`echo \"scale=2; ($MON_DB_LOG_PRI/($MON_DB_LOG_PRI+$MON_DB_LOG_SEC))*100\"|bc -l`%;95% " echo -n "$INSTANCE.$DB.locks_waiting=$LOCKS_WAITING;${MON_DB_LOCKS_WAITING_WARN};${MON_DB_LOCKS_WAITING_CRIT} " echo -n "$INSTANCE.$DB.locklist=`echo \"scale=2; $LOCK_LIST_IN_USE/$MON_DB_TOTAL_LOCKLIST*100\"|bc -l`%;75%;90% " echo -n "$INSTANCE.$DB.sort_heap=$SORT_HEAP_ALLOCATED " #echo -n "$INSTANCE.$DB.swap=$SWAP_PAGES_IN / $SWAP_PAGES_OUT " for BUFFERPOOL in $BUFFERPOOLS; do echo -n "$INSTANCE.$DB.bufferpool.$BUFFERPOOL " done for TABLESPACE in $TABLESPACES; do echo -n "$INSTANCE.$DB.tablespace.$TABLESPACE " done fi rm $TMP_VARS_FILE return $MON_SU_DB2_CONNECT_RET_CODE } # 2Do: Check that all the required monitor # switches are correcty configured # # db2 connect to # db2 get monitor switches # BUFFERPOOL # LOCK # SORT # STATEMENT # TABLE # TIMESTAMP # UOW # Transaction processor monitor name (TP_MON_NAME) = #db2 update dbm cfg using DFT_MON_LOCK on # Buffer pool (DFT_MON_BUFPOOL) = OFF # Lock (DFT_MON_LOCK) = OFF # Sort (DFT_MON_SORT) = OFF # Statement (DFT_MON_STMT) = OFF # Table (DFT_MON_TABLE) = OFF # Timestamp (DFT_MON_TIMESTAMP) = ON # Unit of work (DFT_MON_UOW) = OFF # Monitor health of instance and databases (HEALTH_MON) = OFF # Make sure our files are keep only root readable umask 077 # Check for the check_db2 cache directory or create it if [ ! -d /var/cache/check_db2/ ]; then mkdir -p /var/cache/check_db2/ fi # 2Do: Dont load ignores into the cache file # Load ignore variables # MON_IGNORE="" if [ ! -f /var/cache/check_db2/db2-ignores.cfg ]; then touch /var/cache/check_db2/db2-ignores.cfg fi . /var/cache/check_db2/db2-ignores.cfg # Check for the cache file and include it, or make it again if [ -f /var/cache/check_db2/db2-envs.cache ]; then . /var/cache/check_db2/db2-envs.cache else # Generate the cache file for check_db2 echo "FILE_GENERATION_START=\"`date +\"%Y-%m-%d %T.%N\"`\"" > /var/cache/check_db2/db2-envs.cache echo "LICM_EXPIRATION=\"`${DB2_INSTALL_ROOT}/adm/db2licm -l|base64`\"" >> /var/cache/check_db2/db2-envs.cache echo "DB2_INSTANCES=\"`${DB2_INSTALL_ROOT}/instance/db2ilist|base64`\"" >> /var/cache/check_db2/db2-envs.cache . /var/cache/check_db2/db2-envs.cache for INSTANCE in `echo "${DB2_INSTANCES}" |base64 -d`; do # Create an Associative Array with the instance name echo "declare -A $INSTANCE" >> /var/cache/check_db2/db2-envs.cache # Save DBs contained on it echo "$INSTANCE[DBs]=\"`su - $INSTANCE -c \"db2 list db directory | awk '/Database alias/{print \\$4 }'\"|base64`\"" >> /var/cache/check_db2/db2-envs.cache done . /var/cache/check_db2/db2-envs.cache for INSTANCE in `echo "${DB2_INSTANCES}" |base64 -d`; do # Check ignored instances TMP_IGNORE_INSTANCE=0 for TMP_TRY_INSTANCE in $MON_IGNORE; do if [ $TMP_TRY_INSTANCE == "$INSTANCE" ]; then TMP_IGNORE_INSTANCE=1 break fi done if [ $TMP_IGNORE_INSTANCE -gt 0 ]; then continue fi let MON_DB2_INSTANCES=$MON_DB2_INSTANCES+1 eval "array=(\${$INSTANCE[@]})" for DB in `echo "${array[DBs]}"|base64 -d`; do # Check ignored databases TMP_IGNORE_DB=0 for TMP_TRY_DB in $MON_IGNORE; do if [ $TMP_TRY_DB == "${INSTANCE}.${DB}" ]; then TMP_IGNORE_DB=1 break fi done if [ $TMP_IGNORE_DB -gt 0 ]; then continue fi let MON_DB2_DATABASES=$MON_DB2_DATABASES +1 echo "$INSTANCE[$DB]=\"`su - $INSTANCE -c \"db2 get db cfg for $DB|bzip2|base64\"`\"" >> /var/cache/check_db2/db2-envs.cache done done echo "MON_DB2_INSTANCES=$MON_DB2_INSTANCES" >> /var/cache/check_db2/db2-envs.cache echo "MON_DB2_DATABASES=$MON_DB2_DATABASES" >> /var/cache/check_db2/db2-envs.cache echo "FILE_GENERATION_END=\"`date +\"%Y-%m-%d %T.%N\"`\"" >> /var/cache/check_db2/db2-envs.cache fi #cat /var/cache/check_db2/db2-envs.cache #rm -f /var/cache/check_db2/db2-envs.cache # 2Do: use an array MON_MON_JOBS="" declare -A MON_MON_JOBS_FILES for INSTANCE in `echo "${DB2_INSTANCES}" |base64 -d`; do TMP_IGNORE_INSTANCE=0 for TMP_TRY_INSTANCE in $MON_IGNORE; do if [ $TMP_TRY_INSTANCE == "$INSTANCE" ]; then TMP_IGNORE_INSTANCE=1 break fi done if [ $TMP_IGNORE_INSTANCE -gt 0 ]; then continue fi DBS=$(eval "echo `echo $\{$INSTANCE[DBs]\}`" |base64 -d) for DB in ${DBS}; do TMP_IGNORE_DB=0 for TMP_TRY_DB in $MON_IGNORE; do if [ $TMP_TRY_DB == "${INSTANCE}.${DB}" ]; then TMP_IGNORE_DB=1 break fi done if [ $TMP_IGNORE_DB -gt 0 ]; then continue fi JOB_FILE=`mktemp --tmpdir=/tmp/` get_instance_db_mon_data $INSTANCE $DB > $JOB_FILE & JOB_ID=$! MON_MON_JOBS="$MON_MON_JOBS $INSTANCE.$DB=$JOB_ID" MON_MON_JOBS_FILES[$JOB_ID]="$JOB_FILE" done done # Get job control MON_NAGIOS_MSG="check_db2 -" for JOB in $MON_MON_JOBS; do JOB_DATA=`echo $JOB | awk -F'=' '{print $1}'` JOB_ID=`echo $JOB | awk -F'=' '{print $2}'` wait $JOB_ID JOB_STATUS=$? #2Do: Get different messages from return codes ie. 124 is this error below if [ $JOB_STATUS -gt 0 ]; then MON_NAGIOS_MSG="${MON_NAGIOS_MSG} Critical no data from: $JOB_DATA " MON_MON_NAGIOS_RET_CODE=$MON_NAGIOS_CRIT fi done if [ $MON_MON_NAGIOS_RET_CODE -eq 0 ]; then MON_NAGIOS_MSG="$MON_NAGIOS_MSG Ok" fi MON_NAGIOS_MSG="$MON_NAGIOS_MSG |" # Retrieve job data for JOB in $MON_MON_JOBS; do JOB_ID=`echo $JOB | awk -F'=' '{print $2}'` MON_NAGIOS_MSG="$MON_NAGIOS_MSG `cat ${MON_MON_JOBS_FILES[$JOB_ID]}`" rm ${MON_MON_JOBS_FILES[$JOB_ID]} done # # #MON_MON_NAGIOS_RET_CODE=0 #MON_NAGIOS_OK=0 #MON_NAGIOS_CRIT=1 #MON_NAGIOS_WARN=2 #MON_NAGIOS_UNKN=3 echo "$MON_NAGIOS_MSG" exit $MON_MON_NAGIOS_RET_CODE