/****************************************************************************** * Nagios check_lpar_cpu plugin * * License: GPL * Author: Konstantin Reichert * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * * Please visit also http://www.ibm.com/developerworks/wikis/display/WikiPtype/ryo * Parts of the code within this plugin come from there. * ******************************************************************************/ #include // C programming language subroutines that execute in user space and extract data from the perfstat kernel extension (kex) to obtain statistics. This API is available in AIX 5L. #include // Header to get the basename of the file. #include // C library to perform Input/Output operations #include // C programming language which includes functions involving memory allocation, process control, conversions and others. #include #include #include #include #include // C Standard Library to declare time and date functions that provide standardized access to time/date manipulation and formatting. #include // Defines miscellaneous symbolic constants and types, and declares miscellaneous functions. 
#include using namespace std; /****************************************************************************** * Variables ******************************************************************************/ string Description = "This plugin checks lpar cpu and outputs same values as lparstat command.\nThresholds can be set for Idle, App and Backward values, but feel free to implement other thresholds.\n"; string Author = "Konstantin Reichert "; string Date = "2010/08/04 "; string Version = "0.6 "; #define XINTFRAC ((double)(_system_configuration.Xint)/(double)(_system_configuration.Xfrac)) #define HTIC2SEC(x) ((double)x * XINTFRAC)/(double)1000000000.0 static int firstiteration = 1; static u_longlong_t last_time_base; static u_longlong_t last_pcpu_user, last_pcpu_sys, last_pcpu_idle, last_pcpu_wait; static u_longlong_t last_lcpu_user, last_lcpu_sys, last_lcpu_idle, last_lcpu_wait; static u_longlong_t last_phint = 0, last_vcsw = 0, last_pit = 0; static u_longlong_t last_runque, last_swpque; int state; int warn_cnt = 0; int crit_cnt = 0; int unkn_cnt = 0; char *ch; double app_warn = 20; double app_crit = 10; double back_warn = 4; double back_crit = 5; double idle_warn = 20; double idle_crit = 10; static int perfdata = 0; /****************************************************************************** * Print Usage Info ******************************************************************************/ void print_usage(char *basename) { cout << "Usage:\n\t" << basename << " [-p] [-a] , [-i] , [-b] ," << endl; cout << "\t" << basename << " -h" << endl; } /****************************************************************************** * Print Help ******************************************************************************/ void print_help(char *basename) { cout << setfill('-') << setw(50) << "-" < warning) { return state = 0; } else if((value <= warning) && (value >= critical)) { warn_cnt++; return state = 1; } else if(value <= critical) { crit_cnt++; return state = 2; } else { 
unkn_cnt++; return state = 3; } } /****************************************************************************** * Check Thresholds for Greater Than WARN/CRIT-Value ******************************************************************************/ void check_thresh_GT(double warning, double critical, int opt) { if (warning > critical) { printf("Warning-Value for -%c Options must be LESS than Critical-Value!!!\n", opt); print_usage(basename(ch)); exit(3); } } int check_thresh_GT(double value, double warning, double critical) { if(value < warning) { return state = 0; } else if((value >= warning) && (value <= critical)) { warn_cnt++; return state = 1; } else if(value >= critical) { crit_cnt++; return state = 2; } else { unkn_cnt++; return state = 3; } } /****************************************************************************** * Get CPU Total and LPAR Values from AIX-Kernel ******************************************************************************/ /* Save the current values for the next iteration */ void save_last_values(perfstat_cpu_total_t *cpustats, perfstat_partition_total_t *lparstats) { last_vcsw = lparstats->vol_virt_cswitch + lparstats->invol_virt_cswitch; last_time_base = lparstats->timebase_last; last_phint = lparstats->phantintrs; last_pit = lparstats->pool_idle_time; last_pcpu_user = lparstats->puser; last_pcpu_sys = lparstats->psys; last_pcpu_idle = lparstats->pidle; last_pcpu_wait = lparstats->pwait; last_lcpu_user = cpustats->user; last_lcpu_sys = cpustats->sys; last_lcpu_idle = cpustats->idle; last_lcpu_wait = cpustats->wait; last_runque = cpustats->runque; last_swpque = cpustats->swpque; } /* Gather and display lpar utilization metrics */ void check_lpar_util() { u_longlong_t dlt_pcpu_user, dlt_pcpu_sys, dlt_pcpu_idle, dlt_pcpu_wait; // Physical CPU Values u_longlong_t dlt_lcpu_user, dlt_lcpu_sys, dlt_lcpu_idle, dlt_lcpu_wait; // Logical CPU Values u_longlong_t dlt_runque, dlt_swpque; u_longlong_t vcsw, lcputime, pcputime; u_longlong_t 
entitled_purr, unused_purr; u_longlong_t delta_purr, delta_time_base; double phys_proc_consumed, entitlement, percent_ent, delta_sec; double app, lbusy, vcsw_now, phint, app_in_pct; perfstat_partition_total_t lparstats; perfstat_cpu_total_t cpustats; /* retrieve the metrics */ if (!perfstat_partition_total((perfstat_id_t*)NULL, &lparstats, sizeof(perfstat_partition_total_t), 1)) { perror("perfstat_partition_total"); exit(-1); } if (!perfstat_cpu_total((perfstat_id_t*)NULL, &cpustats, sizeof(perfstat_cpu_total_t), 1)) { perror("perfstat_cpu_total"); exit(-1); } /* first iteration, we only read the data and save the data */ if (firstiteration) { firstiteration = 0; save_last_values(&cpustats, &lparstats); return; } dlt_pcpu_user = lparstats.puser - last_pcpu_user; dlt_pcpu_sys = lparstats.psys - last_pcpu_sys; dlt_pcpu_idle = lparstats.pidle - last_pcpu_idle; dlt_pcpu_wait = lparstats.pwait - last_pcpu_wait; delta_purr = pcputime = dlt_pcpu_user + dlt_pcpu_sys + dlt_pcpu_idle + dlt_pcpu_wait; dlt_lcpu_user = cpustats.user - last_lcpu_user; dlt_lcpu_sys = cpustats.sys - last_lcpu_sys; dlt_lcpu_idle = cpustats.idle - last_lcpu_idle; dlt_lcpu_wait = cpustats.wait - last_lcpu_wait; lcputime = dlt_lcpu_user + dlt_lcpu_sys + dlt_lcpu_idle + dlt_lcpu_wait; entitlement = (double)lparstats.entitled_proc_capacity / 100.0 ; dlt_runque = cpustats.runque - last_runque; dlt_swpque = cpustats.swpque - last_swpque; delta_time_base = lparstats.timebase_last - last_time_base; if (lparstats.type.b.shared_enabled) { entitled_purr = delta_time_base * entitlement; if (entitled_purr < delta_purr) { /* when above entitlement, use consumption in percentages */ entitled_purr = delta_purr; } unused_purr = entitled_purr - delta_purr; /* distribute unused purr in wait and idle proportionally to logical wait and idle */ dlt_pcpu_wait += unused_purr * ((double)dlt_lcpu_wait / (double)(dlt_lcpu_wait + dlt_lcpu_idle)); dlt_pcpu_idle += unused_purr * ((double)dlt_lcpu_idle / (double)(dlt_lcpu_wait + 
dlt_lcpu_idle)); pcputime = entitled_purr; } /* Physical Processor Utilization */ double user = ((double)dlt_pcpu_user * 100.0 / (double)pcputime); double sys = ((double)dlt_pcpu_sys * 100.0 / (double)pcputime); double wait = ((double)dlt_pcpu_wait * 100.0 / (double)pcputime); double idle = ((double)dlt_pcpu_idle * 100.0 / (double)pcputime); phys_proc_consumed = (double)delta_purr / (double)delta_time_base; percent_ent = (double)((phys_proc_consumed / entitlement) * 100); /* Special Check for CPU including APP and Idle */ if (lparstats.type.b.shared_enabled) { if (lparstats.type.b.pool_util_authority) { app = ((double)(lparstats.pool_idle_time - last_pit) / (XINTFRAC*(double)delta_time_base)); app_in_pct = ((double)(app * 100 / lparstats.phys_cpus_pool)); if (check_thresh_LT(idle, idle_warn, idle_crit) != 0 && check_thresh_LT(app_in_pct, app_warn, app_crit) != 0 || percent_ent >= 950.0) { printf("App = %1.2f (%1.2f%%) (WARNING < %1.1f%% CRITICAL < %1.1f%%) AND Idle = %1.1f%% (WARNING < %1.1f%% CRITICAL < %1.1f%%) AND Entc = %1.1f%%\n", app, app_in_pct, app_warn, app_crit, idle, idle_warn, idle_crit, percent_ent); } else { printf("User = %1.1f%%, Sys = %1.1f%%, Wait = %1.1f%%, Idle = %1.1f%%\n", user, sys, wait, idle); warn_cnt = 0; crit_cnt = 0; unkn_cnt = 0; } } else { printf("'Allow performance information collection.' 
must be set in HMC for the monitored LPAR."); exit (3); } } else { if (check_thresh_LT(idle, idle_warn, idle_crit) != 0) { printf("User = %1.1f%%, Sys = %1.1f%%, Wait = %1.1f%%, Idle = %1.1f%% (WARNING < %1.1f%% CRITICAL < %1.1f%%)\n", user, sys, wait, idle, idle_warn, idle_crit); } else { printf("User = %1.1f%%, Sys = %1.1f%%, Wait = %1.1f%%, Idle = %1.1f%%\n", user, sys, wait, idle); } } printf("\n"); printf("### Detail Information ###\n"); if (check_thresh_GT(dlt_swpque, back_warn, back_crit) != 0) { printf("- Backwards = %llu (WARNING < %1.1f CRITICAL < %1.1f)\n", dlt_swpque, back_warn, back_crit); } else { printf("- Backwards = %llu\n", dlt_swpque); } printf("- RunQueue = %llu\n", dlt_runque); if (lparstats.type.b.shared_enabled) { /* Available Pool Processor (app) */ if (lparstats.type.b.pool_util_authority) { app = ((double)(lparstats.pool_idle_time - last_pit) / (XINTFRAC*(double)delta_time_base)); app_in_pct = ((double)(app * 100 / lparstats.phys_cpus_pool)); if (check_thresh_LT(app_in_pct, app_warn, app_crit) != 0) { printf("- App = %1.2f (%1.2f%%) (WARNING < %1.1f%% CRITICAL < %1.1f%%)\n", app, app_in_pct, app_warn, app_crit); } else { printf("- App = %1.2f\n", app); } } printf("\n"); /* Print of Physical Processor Consumed */ printf("- Physc = %1.2f\n", (double)phys_proc_consumed); /* Print of Percentage of Entitlement Consumed */ printf("- Entc = %1.1f%%\n", percent_ent); /* Logical Processor Utilization */ lbusy = ((double)(dlt_lcpu_user + dlt_lcpu_sys) * 100.0 / (double)lcputime); printf("- Lbusy = %1.1f\n", lbusy); /* Virtual CPU Context Switches per second */ vcsw = lparstats.vol_virt_cswitch + lparstats.invol_virt_cswitch; delta_sec = HTIC2SEC(delta_time_base); vcsw_now = ((double)(vcsw - last_vcsw) / delta_sec); printf("- Vcsw = %1.0f\n", vcsw_now); /* Phantom Interrupts per second */ phint = ((double)(lparstats.phantintrs - last_phint) / delta_sec); printf("- Phint = %1.0f\n", phint); } printf("\n"); printf("### %s running at %llu MHz ###\n", 
cpustats.description, cpustats.processorHZ / 1000000); printf("- CPUs Configured: %d\n", cpustats.ncpus_cfg); printf("- CPUs Active: %d", cpustats.ncpus); if(perfdata) { printf(" | Idle=%1.1f%%;%1.1f;%1.1f Backwards=%llu;%1.f;%1.f App=%1.2f;%1.2f;%1.2f RunQueue=%llu User=%1.1f%% Sys=%1.1f%% Wait=%1.1f%% Physc=%1.2f Entc=%1.1f%% Lbusy=%1.1f Vcsw=%1.0f Phint=%1.0f", idle, idle_warn, idle_crit, dlt_swpque, back_warn, back_crit, app, app_warn * lparstats.phys_cpus_pool / 100, app_crit * lparstats.phys_cpus_pool / 100, dlt_runque, user, sys, wait, phys_proc_consumed, percent_ent, lbusy, vcsw_now, phint); } printf("\n"); save_last_values(&cpustats, &lparstats); } /****************************************************************************** * Main ******************************************************************************/ int main(int argc, char *argv[]) { ch = argv[0]; // get the name/path of/to the check int arguments; state = 0; if (argc <= 1) { print_usage(basename(ch)); return 2; } while ((arguments = getopt (argc, argv, ":a:b:hi:p")) != -1) { switch (arguments) { case 'a': app_warn = atoi (strtok(optarg, ",")); app_crit = atoi (strtok(NULL, ",")); check_thresh_LT(app_warn, app_crit, optopt); break; case 'b': back_warn = atoi (strtok(optarg, ",")); back_crit = atoi (strtok(NULL, ",")); check_thresh_GT(back_warn, back_crit, optopt); break; case 'h': print_help(basename(ch)); break; case 'i': idle_warn = atoi (strtok(optarg, ",")); idle_crit = atoi (strtok(NULL, ",")); check_thresh_LT(idle_warn, idle_crit, optopt); break; case 'p': perfdata = 1; break; case ':': fprintf (stderr, "Option -%c requires an argument.\n", optopt); break; case '?': fprintf (stderr, "Unknown -%c argument.\n", optopt); break; default: abort(); } } for (int i = 0 ; i < 2 ; i++) { check_lpar_util(); sleep(1); } if (warn_cnt == 0 && crit_cnt == 0) { /* printf ("OK\n"); */ state = 0; } else if (warn_cnt >= 0 && crit_cnt == 0) { /* printf ("WARN\n"); */ state = 1; } else if (crit_cnt > 0 || 
crit_cnt >= warn_cnt) { /* printf ("CRIT\n"); */ state = 2; } else { /* printf ("UNKNOWN\n"); */ state = 3; } /* * printf ("WARN: %i\n", warn_cnt); * printf ("CRIT: %i\n", crit_cnt); * printf ("UNKN: %i\n", unkn_cnt); * printf ("\n"); */ return state; }