#!/bin/ksh # --------------------------------------------------------------------- # # Copyright Ingres Corporation 2010 # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 only, as published by # the Free Software Foundation. You should have received a copy of version 2 of the GNU # General Public License along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # For further information: # Web: www.ingres.com # Email: services@ingres.com # # --------------------------------------------------------------------- # # Name: # check_isa_ingres_lrq_report # # Usage: # check_isa_ingres_lrq_report \ # --instance \ # --database # # Description: # Reports the findings of check_isa_ingres_lrq which due to its long running # nature has to be run outside Nagios. # # The check_isa_ingres_lrq monitors an Ingres installation for long running # queries # # History: # 1.0 28-Jan-2010 (mark.whalley@ingres.com) # Created. # # 1.1 16-feb-2010 (mark.whalley@ingres.com) # Extended IMA domain to all DBMS servers # # 1.2 18-feb-2010 (mark.whalley@ingres.com) # Improvements to messages reported back to Nagios # # 1.2 07-Apr-2010 (mark.whalley@ingres.com) # Introduced local_data_dir to store DAT files # # 1.3 29-Jun-2010 (mark.whalley@ingres.com) # Reverted to KSH # # 1.4 08-Jul-2010 (mark.whalley@ingres.com) # Disclaimer and contact details added. # # 1.5 18-Jul-2010 (mark.whalley@ingres.com) # Changed TIDYUP from 1 to 0 # # 1.6 12-Sep-2010 (mark.whalley@ingres.com) # Exclude session_id from list of long running # queries (as created by check_isa_ingres_lrq) # #---------------------------------------------------------------------------- h_prog_name=${0##*/} h_prog_version=1.6 #---------------------------------------------------------------------------- # #---------------------------------------------------------------------------- # Function: # TERMINATE - Exit the program with a Nagios return status #---------------------------------------------------------------------------- TERMINATE() { h_final_exit_status=$1 #---------------------------------------------------------------------------- # Depending on the final exit status provided to this function, translate # the status into the standard Nagios message to prefix the returned status # line #---------------------------------------------------------------------------- case "$h_final_exit_status" in 0) h_nagios_status=$h_clf_nagios_msg_ok;; 1) h_nagios_status=$h_clf_nagios_msg_warning;; 2) h_nagios_status=$h_clf_nagios_msg_critical;; 3) h_nagios_status=$h_clf_nagios_msg_unknown;; *) h_nagios_status="Unknown Nagios Status";; esac #---------------------------------------------------------------------------- # Read the data control file #---------------------------------------------------------------------------- h_dc_index=0 h_dc_message="" #---------------------------------------------------------------------------- # If there is a control data file, pull out the messages... #---------------------------------------------------------------------------- if [ -f "$h_data_control_dat" ] then while read h_dc do (( h_dc_index=h_dc_index+1 )) case "$h_dc_index" in 1) h_dc_installation_id=$h_dc;; 2) h_dc_database_name=$h_dc;; 3) h_dc_pause=$h_dc;; 4) h_dc_runtime=$h_dc;; 5) h_dc_started=$h_dc;; 6) h_dc_end=$h_dc;; 7) h_dc_final_exit_status=$h_dc;; *) h_dc_message=$h_dc_message" "$h_dc;; esac done < $h_data_control_dat printf "%s\n" "$h_nagios_status - Installation: $h_dc_installation_id, Database: $h_dc_database_name, Pause: $h_dc_pause(sec), Run Time: $h_dc_runtime(min). Last check started $h_dc_started and completed $h_dc_end. $h_dc_message. $h_clf_last_error_message" #---------------------------------------------------------------------------- # ... otherwise, I guess the monitor has not run (or has failed) #---------------------------------------------------------------------------- else printf "%s\n" "$h_nagios_status - No available results to report - $h_clf_last_error_message" fi CLF_TIDYUP 0 exit $h_final_exit_status #---------------------------------------------------------------------------- # End of Function: TERMINATE #---------------------------------------------------------------------------- } #---------------------------------------------------------------------------- # Function: # INITIALIZE - Set up local variables #---------------------------------------------------------------------------- INITIALIZE() { #---------------------------------------------------------------------------- # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #---------------------------------------------------------------------------- # # WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING # # PLEASE be very careful when amending this section. Much of what is set up # here is used by the common library functions to set up environments etc # for the rest of this program. # #---------------------------------------------------------------------------- # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # Enviroment setup: #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # If called from NRPE, "pwd" is set to directory of this script, in which # case use it... #---------------------------------------------------------------------------- h_prog_path=`pwd` #---------------------------------------------------------------------------- # ... check to see if the common functions is in the h_prog_path. # If it is not (typical if this was called via SSH), # then try /usr/local/nagios/libexec #---------------------------------------------------------------------------- if [ ! -f $h_prog_path/check_isa_common_library_functions ] then h_prog_path=/usr/local/nagios/libexec fi #---------------------------------------------------------------------------- # ... if we are still not able to locate the common functions, not much point # in continuing :-( #---------------------------------------------------------------------------- if [ ! -f $h_prog_path/check_isa_common_library_functions ] then h_clf_last_error_message="Unable to locate check_isa_common_library_functions in $h_prog_path" TERMINATE $h_clf_nagios_state_critical fi #---------------------------------------------------------------------------- # ... OK, lets load the ISA common library functions (CLF) #---------------------------------------------------------------------------- . $h_prog_path/check_isa_common_library_functions CLF_SETUPCOMMONVARIABLES CLF_CREATELOGFILES CLF_SETUPPATHS $h_clv_ii_system CLF_GETINGPRENV #---------------------------------------------------------------------------- # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #---------------------------------------------------------------------------- # # OK, it is generally safe to add stuff from here down - please read # WARNING above. # #---------------------------------------------------------------------------- # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # Declare local variables specific to this script (others should be declared # in the common library functions) #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # h_list_of_queries_prev_running_dat # A file containing a list of tables with overflow #---------------------------------------------------------------------------- h_list_of_queries_prev_running_dat="$h_clf_local_data_dir/$h_prog_name.$h_clf_ingprenv_ii_installation.$h_clv_database_name.queries_prev_running.DAT" h_list_of_queries_prev_running_top10="$h_clf_log_file_dir/$h_prog_name.$h_clf_ingprenv_ii_installation.$h_clv_database_name.queries_prev_running.TMP.$h_clf_pid" #---------------------------------------------------------------------------- # h_data_control_dat # A file containing a control data for the list of tables with overflow #---------------------------------------------------------------------------- h_data_control_dat="$h_clf_local_data_dir/$h_prog_name.$h_clf_ingprenv_ii_installation.$h_clv_database_name.data_control.DAT" #---------------------------------------------------------------------------- # OK, found the utils.sh, get them loaded #---------------------------------------------------------------------------- . $h_prog_path/utils.sh #---------------------------------------------------------------------------- # End of Function: INITIALIZE #---------------------------------------------------------------------------- } #---------------------------------------------------------------------------- # Function: # REPORT_LRQ - Report the top 10 longest running queries #---------------------------------------------------------------------------- REPORT_LRQ() { #---------------------------------------------------------------------------- # Before we go ahead and try to report on the data generated by # check_isa_ingres_lrq, just check that there IS actually a file of # data to report on... #---------------------------------------------------------------------------- if [ ! -f "$h_list_of_queries_prev_running_dat" ] then h_clf_last_error_message="The list of queries does not exist" return $h_clf_nagios_state_warning fi #---------------------------------------------------------------------------- # Load in the list of queries that have been caught running whilst we have # been monitoring. #---------------------------------------------------------------------------- set -f echo "set lockmode session where readlock=nolock \p\g /* Extend domain to all DBMS servers */ update ima_mib_objects set value = DBMSINFO('IMA_VNODE') where classid = 'exp.gwf.gwm.session.control.add_vnode' and instance = '0' and server = DBMSINFO('IMA_SERVER') \p\g /* Declare a GTT to load in the details of previously recorded queries (this of course MAY be empty if nothing was happeing on the DB) */ declare global temporary table session.queries_prev_running ( /* session_id varchar(32), */ effective_user varchar(32), session_query varchar(1000), seen_count integer ) on commit preserve rows with norecovery \p\g copy table session.queries_prev_running ( /* session_id = c0tab, */ effective_user = c0tab, session_query = c0tab, seen_count = c0nl ) from '$h_list_of_queries_prev_running_dat' \p\g /* Declare a GTT to store the 10 queries that were seen the most */ declare global temporary table session.lrq_report as select effective_user, session_query, seen_count from session.queries_prev_running on commit preserve rows with norecovery \p\g /* Now copy these top 10 out to file */ copy table session.lrq_report ( effective_user = c0tab, session_query = c0tab, seen_count = c0nl ) into '$h_list_of_queries_prev_running_top10' \p\g" > $h_clf_sql_script #---------------------------------------------------------------------------- # OK, run the query against the database #---------------------------------------------------------------------------- sql -s imadb < $h_clf_sql_script > $h_clf_sql_log CLF_CHECKCMD $? "N" "identifying queries in ${h_clv_database_name}" h_clv_deadlock_allowed="N" CLF_CHKINGERR "N" h_query_output=`cat $h_clf_sql_log` #---------------------------------------------------------------------------- # Not too sure why, but I am getting the following errors from IMA... # NEED TO INVESTIGATE - MA Whalley #---------------------------------------------------------------------------- h_query_outputerr=`echo $h_query_output | grep "E_" | grep -v E_GW8045 | grep -v E_GC0132 | grep -v E_GW8081 | grep -v E_GC0132 | wc -l` set +f #---------------------------------------------------------------------------- # If there are any E_ messages in the output, flag as CRITICAL and exit #---------------------------------------------------------------------------- if [ "$h_query_outputerr" -gt 0 ] then h_clf_last_error_message="Unable to determine top 10 LRQ: $h_query_output" return $h_clf_nagios_state_critical fi #---------------------------------------------------------------------------- # How many LRQs were reported? #---------------------------------------------------------------------------- h_noof_lrq=`wc -l < $h_list_of_queries_prev_running_top10` #---------------------------------------------------------------------------- # ... No rows, exit with OK state #---------------------------------------------------------------------------- if [ ${h_noof_lrq} -eq 0 ] then #---------------------------------------------------------------------------- # Nulled the following message, as I am now reporting this elsewhere #---------------------------------------------------------------------------- # h_clf_last_error_message="No Long Running Queries recorded" h_clf_last_error_message="" return $h_clf_nagios_state_ok fi #---------------------------------------------------------------------------- # ... Report the LRQs and exit with WARNING state #---------------------------------------------------------------------------- h_sep="" h_old_ifs=$IFS IFS=" " case ${h_noof_lrq} in 1 ) h_prefix_lrq_summary="The only query running was: " ;; 10 ) h_prefix_lrq_summary="The first 10 longest running queries were: " ;; * ) h_prefix_lrq_summary="The $h_prefix_lrq_summary queries running were: " ;; esac h_noof_queries_reported=1 h_word_user="User:" h_clf_last_error_message="${h_prefix_lrq_summary}" while read h_effective_user h_session_query h_seen_count do h_trunc_user=`echo $h_effective_user | sed "s/ *$//g"` h_trunc_query=`echo $h_session_query | sed "s/ *$//g"` h_trunc_seen=`echo $h_seen_count | sed "s/^ *//g"` (( h_noof_queries_reported=h_noof_queries_reported+1 )) if [ ${h_trunc_seen} -eq 1 ] then h_seen_message="seen once" else h_seen_message="seen ${h_trunc_seen} times" fi h_clf_last_error_message="$h_clf_last_error_message$h_sep$h_word_user $h_trunc_user Query: $h_trunc_query ($h_seen_message)" h_sep=", " done < $h_list_of_queries_prev_running_top10 IFS=$h_old_ifs return $h_clf_nagios_state_warning #---------------------------------------------------------------------------- # End of Function: REPORT_LRQ #---------------------------------------------------------------------------- } #---------------------------------------------------------------------------- # Function: # print_usage - print out the usage and exit #---------------------------------------------------------------------------- PRINT_USAGE() { printf "%s\n" "Usage:" printf "%s\n" " $h_prog_name" printf "%s\n" " -s|--system " printf "%s\n" " -d|--database " printf "%s\n" "" printf "%s\n" " $h_prog_name --help" printf "%s\n" " $h_prog_name --version" #---------------------------------------------------------------------------- # End of Function: PRINT_USAGE #---------------------------------------------------------------------------- } #---------------------------------------------------------------------------- # Function: # PRINT_HELP - print out the help and exit #---------------------------------------------------------------------------- PRINT_HELP() { PRINT_REVISION $h_prog_name $h_prog_version printf "%s\n" "" PRINT_USAGE printf "%s\n" "" printf "%s\n" "Monitor an ingres installation for long running queries" printf "%s\n" "" #---------------------------------------------------------------------------- # End of Function: PRINT_HELP #---------------------------------------------------------------------------- } #---------------------------------------------------------------------------- # Function: # PRINT_REVISION - print out the programs revision number #---------------------------------------------------------------------------- PRINT_REVISION() { printf "%s\n" "Program Name...: $1" printf "%s\n" "Revision.......: $2" printf "%s\n" "" #---------------------------------------------------------------------------- # End of Function: PRINT_HELP #---------------------------------------------------------------------------- } #---------------------------------------------------------------------------- # Main program #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # Process Command Line Variables (clv) #---------------------------------------------------------------------------- while [ -n "$1" ] do case "$1" in -s|--system) h_clv_ii_system=$2 shift ;; -d|--database) h_clv_database_name=$2 shift ;; -h|--help) PRINT_HELP exit 0 ;; -V|--version) PRINT_REVISION $h_prog_name $h_prog_version exit 0 ;; *) printf "%s\n" "Invalid parameter: $1" PRINT_USAGE exit 1 ;; esac shift done #---------------------------------------------------------------------------- # Check that required environment variables have been set... # ... Mandatory #---------------------------------------------------------------------------- if [ -z "$h_clv_ii_system" ] then printf "%s\n" "Ingres System (II_SYSTEM) has not been specified" PRINT_USAGE exit 1 fi if [ -z "$h_clv_database_name" ] then printf "%s\n" "Database name has not been specified" PRINT_USAGE exit 1 fi #---------------------------------------------------------------------------- # ... Optional #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # N/A #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # OK, lets do stuff #---------------------------------------------------------------------------- INITIALIZE REPORT_LRQ TERMINATE $? #---------------------------------------------------------------------------- # End of Script #----------------------------------------------------------------------------