#!/usr/bin/env python
# Nimble Storage - NagiosXI Replication Health API Check Script
# Tristan Self
#
# The script utilises the Nimble Storage API to monitor the state of any
# lagged volume collections. The user can specify the threshold (in seconds)
# after which a lagged volume collection generates an error. It's normal for
# there to be a small amount of lag while replication occurs, so set the
# threshold to the point after which you care. For example, if a volume
# collection normally replicates within about 60 minutes (3600 seconds), set
# the threshold to 3600; that way if it goes over that you'll know. The script
# uses the Nimble Storage API, so checking every 30 minutes is normally
# sufficient.
#
# Exit codes follow the Nagios plugin convention used below:
#   0 = OK, 2 = CRITICAL, 3 = UNKNOWN.
#
# NOTE: only single-argument print(...) calls are used so the script behaves
# the same under Python 2 and Python 3.

import argparse
import json
import sys

try:
    import requests
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
except ImportError:
    # Reported by main() as an UNKNOWN check result instead of a traceback,
    # which Nagios would otherwise interpret as exit code 1 (WARNING).
    requests = None
    InsecureRequestWarning = None

#############################################################################################
# Constants
#############################################################################################

NAGIOS_OK = 0
NAGIOS_CRITICAL = 2
NAGIOS_UNKNOWN = 3

# Default number of seconds to wait for each API request before giving up.
DEFAULT_TIMEOUT = 30.0


#############################################################################################
# Argument Collection
#############################################################################################

def parse_arguments(argv=None):
    """Parse the command line arguments.

    argv: list of argument strings, or None to use sys.argv[1:].
    Returns the argparse namespace. argparse itself prints a usage message
    and exits when a required argument is missing or --critical/--timeout is
    not numeric.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--endpointurl', help='Endpoint URL (e.g. https://arrayname.domain.com:5392)', required=True)
    parser.add_argument('-u', '--username', help='API Username', required=True)
    parser.add_argument('-p', '--password', help='API Password (in single quotes!)', required=True)
    parser.add_argument('-c', '--critical', help='Critical threshold, time in a lagged state in seconds', required=True, type=int)
    parser.add_argument('-d', '--debugmode', help='Enable debug mode', action='store_true')
    parser.add_argument('-t', '--timeout', help='Seconds to wait for each API request (default: 30)', type=float, default=DEFAULT_TIMEOUT)
    return parser.parse_args(argv)


#############################################################################################
# Check Logic (no I/O)
#############################################################################################

def find_lagged_collections(collections, critical):
    """Return a list of (name, lag_time) for collections lagging beyond the threshold.

    collections: list of dicts, each read for its 'name' and 'lag_time' keys.
    critical:    threshold in seconds; only a lag strictly greater than this
                 is reported, since some lag is expected during a copy.

    Entries without a usable 'lag_time' (missing or None) are skipped rather
    than compared, so they can never raise or count as lagged.
    """
    lagged = []
    for collection in collections:
        lag_time = collection.get('lag_time')
        if lag_time is None:
            continue
        if lag_time > critical:
            lagged.append((collection.get('name'), lag_time))
    return lagged


def build_check_result(array_name, array_serial, collections, critical):
    """Build the Nagios result for the given volume collections.

    Returns a tuple (exit_code, output_line). The output line keeps the
    exact layout the script has always emitted so existing consumers of the
    text are unaffected.
    """
    lagged = find_lagged_collections(collections, critical)
    details = "".join("{0} ({1} seconds lag) ".format(name, lag) for name, lag in lagged)
    summary = "{0} ({1}) - {2} - {3} volumes collections, {4} are in a lagged state.".format(
        array_name, array_serial, details, len(collections), len(lagged))

    if lagged:
        return NAGIOS_CRITICAL, "CRITICAL - " + summary
    return NAGIOS_OK, "OK - " + summary


################################################################################################
# API Access
################################################################################################

def fetch_session_token(endpoint_url, username, password, timeout):
    """Authenticate against /v1/tokens and return the session token string.

    Raises requests.exceptions.RequestException on connection or HTTP errors,
    ValueError if the body is not JSON, and KeyError/TypeError if the
    response does not contain data.session_token.
    """
    credentials = json.dumps({'data': {'password': password, 'username': username}})
    # NOTE(review): verify=False disables TLS certificate verification; kept
    # as in the original script (presumably for self-signed array
    # certificates - confirm before tightening).
    response = requests.post(endpoint_url + '/v1/tokens', data=credentials,
                             verify=False, timeout=timeout)
    response.raise_for_status()
    return response.json()['data']['session_token']


def fetch_json(url, headers, timeout):
    """GET the given URL and return the decoded JSON body.

    Raises requests.exceptions.RequestException on connection or HTTP errors
    and ValueError if the body is not JSON.
    """
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    response.raise_for_status()
    return response.json()


################################################################################################
# Entry Point
################################################################################################

def main(argv=None):
    """Run the check, print the result line and return the Nagios exit code."""
    args = parse_arguments(argv)
    debug = args.debugmode

    if requests is None:
        print("UNKNOWN - The Python 'requests' module is not installed.")
        return NAGIOS_UNKNOWN

    # Certificate verification is disabled for every request, so silence the
    # per-request warning that would otherwise pollute the check output.
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    if debug:
        print("")
        print("DEBUG MODE")
        print("")
        print("Endpoint URL:\033[0;32;40m " + args.endpointurl + " \033[0m")
        print("")
        print("")

    # Get the authentication token. HTTP errors (e.g. rejected credentials)
    # and a response without a token are treated like a failed connection.
    try:
        token = fetch_session_token(args.endpointurl, args.username, args.password, args.timeout)
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as error:
        if debug:
            print(str(error))
        print("CRITICAL - Failed to connect! Check EndpointURL, username and password.")
        return NAGIOS_CRITICAL

    headers = {'X-Auth-Token': token}

    # Get the array's basic information and the volume collection details.
    try:
        array_info = fetch_json(args.endpointurl + '/v1/arrays/detail', headers, args.timeout)
        volume_collections = fetch_json(args.endpointurl + '/v1/volume_collections/detail', headers, args.timeout)
    except (requests.exceptions.RequestException, ValueError) as error:
        if debug:
            print(str(error))
        print("CRITICAL - Failed to retrieve data from the array API.")
        return NAGIOS_CRITICAL

    # Pull out the fields the check needs; anything missing means the
    # response was not in the expected shape.
    try:
        array = array_info['data'][0]
        array_name = array['full_name']
        array_serial = array['serial']
        # NOTE(review): the rows are iterated directly instead of indexing by
        # the response's startRow/endRow. If the API pages its results, rows
        # beyond this response are not examined - confirm against the API.
        collections = list(volume_collections['data'])
    except (KeyError, IndexError, TypeError) as error:
        if debug:
            print(repr(error))
        print("UNKNOWN - Unexpected response from the array API.")
        return NAGIOS_UNKNOWN

    exit_code, output_line = build_check_result(array_name, array_serial, collections, args.critical)

    # Output the check string, back to the calling console.
    print(output_line)

    # Display the output status in debug mode.
    if debug:
        print("")
        print("Return Code:\033[0;33;40m " + str(exit_code) + " \033[0m")

    return exit_code


if __name__ == "__main__":
    # Return the status to the calling program.
    sys.exit(main())