#!/bin/bash

################################################################################
# The MIT License (MIT)                                                        #
#                                                                              #
# Copyright (c) 2016 Achim Christ                                              #
#                                                                              #
# Permission is hereby granted, free of charge, to any person obtaining a copy #
# of this software and associated documentation files (the "Software"), to deal#
# in the Software without restriction, including without limitation the rights #
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell    #
# copies of the Software, and to permit persons to whom the Software is        #
# furnished to do so, subject to the following conditions:                     #
#                                                                              #
# The above copyright notice and this permission notice shall be included in   #
# all copies or substantial portions of the Software.                          #
#                                                                              #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR   #
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,     #
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE  #
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER       #
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,#
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE#
# SOFTWARE.                                                                    #
################################################################################

# Name:         Check IBM Storwize V7000 Unified / SONAS Performance
# Author:       Achim Christ - achim(dot)christ(at)gmail(dot)com
# Version:      1.0
# Dependencies: openssh - OpenSSH SSH client (remote login program)
#               bc - An arbitrary precision calculator language
# Website:      https://github.com/acch/nagios-plugins

# This bash script reports on various performance metrics of an IBM Storwize
# V7000 Unified / SONAS system, using the 'lsperfdata' CLI command. For a list
# of supported metrics, run the script without any command-line arguments.
# The script uses the performance center service, which needs to be running on
# all nodes of Storwize V7000 Unified / SONAS. The plugin produces Nagios
# performance data so it can be graphed.

# The following CLI command is used to retrieve performance data:
#   lsperfdata
#   http://www-01.ibm.com/support/knowledgecenter/STAV45/com.ibm.sonas.doc/manpages/lsperfdata.html

# If the above command is unable to retrieve performance data, one reason might
# be that performance data collection is not running, which can be verified and
# fixed with this CLI command:
#   cfgperfcenter
#   http://www-01.ibm.com/support/knowledgecenter/STAV45/com.ibm.sonas.doc/manpages/cfgperfcenter.html

# The actual code is managed in the following GitHub repository - please use
# the Issue Tracker to ask questions, report problems or request enhancements.
#   https://github.com/acch/nagios-plugins

# Disclaimer: This sample is provided 'as is', without any warranty or support.
# It is provided solely for demonstrative purposes - the end user must test and
# modify this sample to suit his or her particular environment. This code is
# provided for your convenience only - although it has been tested, there's no
# guarantee that it doesn't seriously break things in your environment! If you
# decide to run it, you do so at your own risk!

# The script requires SSH Public Key Authentication for connecting to the
# Storwize V7000 Unified / SONAS system. SSH Public Key Authentication needs to
# be set up before running the script.
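
# A minimal, illustrative way to create such a key pair as the Nagios
# operating-system user (the key type and file path are assumptions - the path
# must match the 'identity_file' setting in the configuration section below):
#   sudo -u nagios ssh-keygen -t rsa -f ~nagios/.ssh/id_rsa -N ""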

# To test your SSH configuration, try to log in to Storwize V7000 Unified /
# SONAS via SSH from the Nagios server as the Nagios user - if this works
# without prompting for a password, then Public Key Authentication is properly
# configured. If you get a "Permission denied" error when running the script,
# the most likely reason is that Public Key Authentication is not configured
# correctly for the Nagios user (by default called 'nagios').

# It is strongly recommended to create a dedicated read-only Storwize V7000
# Unified / SONAS user to be used by this script. This eases problem
# determination, allows for proper audit tracing and helps avoid undesired
# side effects. It also eliminates the risk of script errors having an impact
# on your actual production environment...

# To create a read-only user 'nagios' with password 'secret' on Storwize V7000
# Unified / SONAS, run the following commands as the Nagios operating-system
# user (by default also called 'nagios'):
#   ssh admin@<management_ip> mkuser nagios -p secret -g Monitor
#   ssh admin@<management_ip> chuser nagios -k \"`cat ~/.ssh/id_rsa.pub`\"
# Note that you may need to modify the last command to point to the actual
# location of your SSH public key file used for authentication.

# You may want to define the following Nagios constructs to use this script:
#   define command{
#     command_name         check_sonas_perfdata
#     command_line         /path/to/check_sonas_perfdata.sh -H $HOSTADDRESS$ -u $ARG1$ -m $ARG2$ -w $ARG3$ -c $ARG4$
#   }

#   define service{
#     host_name            <your_system>
#     service_description  CPU Utilization
#     check_command        check_sonas_perfdata!nagios!cpu_utilization!80!90
#   }

#   define service{
#     host_name            <your_system>
#     service_description  CPU IO Wait
#     check_command        check_sonas_perfdata!nagios!cpu_iowait!3!5
#   }

#   define service{
#     host_name            <your_system>
#     service_description  NETWORK Throughput
#     check_command        check_sonas_perfdata!nagios!public_network!100000000!200000000
#   }

#   define service{
#     host_name            <your_system>
#     service_description  GPFS Throughput
#     check_command        check_sonas_perfdata!nagios!gpfs_throughput!100000000!200000000
#   }

#   define service{
#     host_name            <your_system>
#     service_description  GPFS Operations
#     check_command        check_sonas_perfdata!nagios!operations!100000!200000
#   }

#   define service{
#     host_name            <your_system>
#     service_description  GPFS Latency
#     check_command        check_sonas_perfdata!nagios!latency!100!200
#   }

# Version History:
# 1.0    15.6.2016    Initial Release

#####################
### Configuration ###
#####################

# Modify the following filenames to match your environment

# Path to the SSH private key file used for authentication
# (create a private/public key pair with the 'ssh-keygen' command):
identity_file="$HOME/.ssh/id_rsa" # Be sure this is readable by the Nagios user!

# Path to a temporary file holding the remote command output while it is being
# parsed by the script:
tmp_file="/tmp/check_sonas_perfdata_$RANDOM.tmp" # Be sure that this is writable by the Nagios user!
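
# Example invocation and output (host name, node names and values below are
# purely illustrative):
#   ./check_sonas_perfdata.sh -H mgmt.example.com -u nagios -m cpu_utilization -w 80 -c 90
#   CPU OK - Max. CPU utilization 12 % | node1=12%;80;90;0;100 node2=9%;80;90;0;100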

####################################
### Do not edit below this line! ###
####################################

error_usage () {
  echo "Usage: $0 -H <host> -u <username> -m <metric> -w <warning_threshold> -c <critical_threshold>"
  echo "Supported metrics: [unit]"
  echo "  cpu_utilization [%]"
  echo "  cpu_iowait      [%]"
  echo "  public_network  [Bps]"
  echo "  gpfs_throughput [Bps]"
  echo "  operations      [ops]"
  echo "  latency         [ms]"
  exit 3
}

error_login () {
  echo "Error executing remote command - [$rsh] `cat $tmp_file`"
  rm $tmp_file
  exit 3
}

error_response () {
  echo "Error parsing remote command output: $*"
  rm $tmp_file
  exit 3
}

# Check number of commandline options
if [ $# -ne 10 ]; then error_usage; fi

# Check commandline options
while getopts 'H:u:m:w:c:' OPT; do
  case $OPT in
    H) hostaddress=$OPTARG ;;
    u) username=$OPTARG ;;
    m) metric=$OPTARG ;;
    w) warn_thresh=$OPTARG ;;
    c) crit_thresh=$OPTARG ;;
    *) error_usage ;;
  esac
done

# Check for mandatory options
if [ -z "$hostaddress" ] || [ -z "$username" ] || [ -z "$metric" ] || [ -z "$warn_thresh" ] || [ -z "$crit_thresh" ]; then error_usage; fi

# Check if thresholds are numbers
if ! [[ "$warn_thresh" =~ ^[[:digit:]]+$ ]] || ! [[ "$crit_thresh" =~ ^[[:digit:]]+$ ]]; then error_usage; fi

#################
# Sanity checks #
#################

# Check for dependencies
if [ ! -x /usr/bin/ssh ]
then
  echo "'openssh' not found - please install it!"
  exit 3
fi

if [ ! -x /usr/bin/bc ]
then
  echo "'bc' not found - please install it!"
  exit 3
fi

# Check if identity file is readable
if [ ! -r "$identity_file" ]
then
  echo "${identity_file} is not readable - please adjust its path!"
  exit 3
fi

# Check if temporary file is writable
if ! touch $tmp_file 2> /dev/null
then
  echo "${tmp_file} is not writable - please adjust its path!"
  exit 3
fi

# Compile SSH command using commandline options
rsh="/usr/bin/ssh \
  -o PasswordAuthentication=no \
  -o PubkeyAuthentication=yes \
  -o StrictHostKeyChecking=no \
  -o ConnectTimeout=10 \
  -i $identity_file \
  $username@$hostaddress"

# Initialize return code
return_code=0
return_status="OK"
return_metric="PERFDATA"

# Initialize performance data and output
perfdata=""
output=""

# Initialize counters
count_metric_1=0
count_metric_2=0

# Prepare nodename array
declare -A nodenames

#######################
# Retrieve node names #
#######################

# Execute remote command
$rsh "lsnode -v -Y" &> $tmp_file

# Check SSH return code
if [ $? -ne 0 ]; then error_login; fi
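
# Note: the loop below assumes that the colon-delimited (-Y) records list the
# node name in field 7 and the node IP address in field 8.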

# Remove header from remote command output
sed '/HEADER/d' -i $tmp_file

# Parse remote command output
while read line
do
  # Remember node's IP and associated name
  nodenames["$(echo $line | cut -d ':' -f 8)"]="$(echo $line | cut -d ':' -f 7)"
done < $tmp_file

#############################
# Retrieve performance data #
#############################

# Query multiple metrics if required
repeat=1
while [ "$repeat" -gt 0 ]
do
  query=""

  case "$metric" in
    "cpu_utilization")
      # Retrieves the statistics for the % of CPU time spent idle on each of the nodes
      query="lsperfdata -g cpu_idle_usage -t hour -n all"
    ;;
    "cpu_iowait")
      # Retrieves the statistics for the % of CPU time spent waiting for IO to complete on each of the nodes
      query="lsperfdata -g cpu_iowait_usage -t hour -n all"
    ;;
    "public_network")
      if [ -z "$perfdata" ]
      then
        # Repeat twice
        repeat=2

        # First repeat - retrieves the total number of bytes received on all the client network interfaces of the nodes
        query="lsperfdata -g public_network_bytes_received -t hour -n all"
      else
        # Second repeat - retrieves the total number of bytes sent on all the client network interfaces of the nodes
        query="lsperfdata -g public_network_bytes_sent -t hour -n all"
      fi
    ;;
    "gpfs_throughput")
      # Retrieves the number of bytes read and written across all the filesystems on all the nodes of the GPFS cluster
      query="lsperfdata -g cluster_throughput -t hour"
    ;;
    "operations")
      if [ -z "$perfdata" ]
      then
        # Repeat twice
        repeat=2

        # First repeat - retrieves the number of file open and close operations across all the filesystems on all the nodes of the GPFS cluster
        query="lsperfdata -g cluster_open_close_operations -t hour"
      else
        # Second repeat - retrieves the number of file read and write operations across all the filesystems on all the nodes of the GPFS cluster
        query="lsperfdata -g cluster_read_write_operations -t hour"
      fi
    ;;
    "latency")
      if [ -z "$perfdata" ]
      then
        # Repeat twice
        repeat=2

        # First repeat - retrieves the latency of file open and close operations across all the filesystems on all the nodes of the GPFS cluster
        query="lsperfdata -g cluster_open_close_latency -t hour"
      else
        # Second repeat - retrieves the latency of file read and write operations across all the filesystems on all the nodes of the GPFS cluster
        query="lsperfdata -g cluster_read_write_latency -t hour"
      fi
    ;;

    # Also available:
    # client_throughput                 Retrieves the total bytes received and total bytes sent across all the client network interfaces on all the interface nodes. Timeperiod is the only parameter for this graph.
    # cluster_create_delete_latency     Retrieves the latency of the file create and delete operations across all the filesystems on all the nodes of the GPFS cluster. Timeperiod is the only mandatory parameter for this graph.
    # cluster_create_delete_operations  Retrieves the number of file create and delete operations across all the filesystems on all the nodes of the GPFS cluster. Timeperiod is the only mandatory parameter for this graph.

    # Check not implemented
    *) error_usage ;;
  esac

  # Execute remote command
  $rsh $query &> $tmp_file

  # Check SSH return code
  if [ $? -ne 0 ]; then error_login; fi
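
  # Note: the parsing below assumes comma-separated 'lsperfdata' output with
  # column headers in the first line and the values to report in the last line;
  # the first two columns are skipped and, for per-node metrics, every other
  # remaining header column is expected to hold a node IP address.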

  # Remove success status from remote command output
  sed '/EFSSG1000I/d' -i $tmp_file

  # Check for performance center errors
  if grep -q 'EFSSG0002I' $tmp_file
  then
    echo "Error collecting performance data - check if performance center service is running using 'cfgperfcenter'"
    rm $tmp_file
    exit 3
  fi

  # Extract header and performance data from output
  header_raw=$(cat "$tmp_file" | head -n 1 | cut -d ',' -f 3-)
  perfdata_raw=$(cat "$tmp_file" | tail -n 1 | cut -d ',' -f 3- | sed 's/,/ /g')

  # Check extracted performance data
  if [ -z "$perfdata_raw" ]
  then
    error_response $(cat "$tmp_file")
  fi

  # Initialize counter
  num_nodes=0

  # Compute performance data and output
  for i in $perfdata_raw
  do
    # Extract node's IP from header
    nodeindx=$(( num_nodes * 2 + 1 ))
    nodeip=$(echo $header_raw | cut -d ',' -f $nodeindx | sed 's/\"//')

    if [ -n "$nodeip" ]
    then
      # Lookup node's name
      nodename=${nodenames["$nodeip"]}
    fi

    # Count number of nodes
    (( num_nodes += 1 ))
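
    # Each branch below appends one token per value in the standard Nagios
    # performance data format: label=value[UOM];[warn];[crit];[min];[max]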
    case "$metric" in
      "cpu_utilization")
        # Calculate utilization from %idle
        utilization=$(echo "100-${i}" | bc)

        # Concatenate performance data per node
        perfdata="${perfdata} ${nodename}=${utilization}%;${warn_thresh};${crit_thresh};0;100"

        # Calculate max utilization for output
        if [ $(echo "${utilization}>${count_metric_1}" | bc) -eq 1 ]
        then
          count_metric_1=$utilization
        fi

        # Produce output
        output="Max. CPU utilization ${count_metric_1} %"

        # Calculate average utilization for output
        #sum_metric=$(echo "${sum_metric}+${utilization}" | bc)
        #output=$(echo "${sum_metric}/${num_nodes}" | bc)
        #output="Max. CPU utilization ${output}%"

        # Check if metric is above threshold
        if [ $(echo "${utilization}>=${crit_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 2 ]
        then
          return_code=2
          return_status="CRITICAL"
        elif [ $(echo "${utilization}>=${warn_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 1 ]
        then
          return_code=1
          return_status="WARNING"
        fi

        # Report metric in output
        return_metric="CPU"
      ;;
      "cpu_iowait")
        # Concatenate performance data per node
        perfdata="${perfdata} ${nodename}=${i}%;${warn_thresh};${crit_thresh};0;"

        # Calculate max IO wait for output
        if [ $(echo "${i}>${count_metric_1}" | bc) -eq 1 ]
        then
          count_metric_1=$i
        fi

        # Produce output
        output="Max. IO Wait ${count_metric_1} %"

        # Calculate average IO wait for output
        #sum_metric=$(echo "${sum_metric}+${i}" | bc)
        #output=$(echo "${sum_metric}/${num_nodes}" | bc)
        #output="Max. IO wait ${output}%"

        # Check if metric is above threshold
        if [ $(echo "${i}>=${crit_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 2 ]
        then
          return_code=2
          return_status="CRITICAL"
        elif [ $(echo "${i}>=${warn_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 1 ]
        then
          return_code=1
          return_status="WARNING"
        fi

        # Report metric in output
        return_metric="CPU"
      ;;
      "public_network")
        # Report on send and receive throughput
        if [ "$repeat" -eq 2 ]
        then
          # First repeat - concatenate receive performance per node
          perfdata="${perfdata} ${nodename}_received=${i}B/s;${warn_thresh};${crit_thresh};0;"
          # Sum up throughput for output
          count_metric_1=$(echo "${count_metric_1}+${i}" | bc)
        else
          # Second repeat - concatenate send performance per node
          perfdata="${perfdata} ${nodename}_sent=${i}B/s;${warn_thresh};${crit_thresh};0;"
          # Sum up throughput for output
          count_metric_2=$(echo "${count_metric_2}+${i}" | bc)
        fi

        # Produce output
        output="Total received $(echo "scale=2; ${count_metric_1}/1024/1024" | bc) MB/s sent $(echo "scale=2; ${count_metric_2}/1024/1024" | bc) MB/s"

        # Check if metric is above threshold
        if [ $(echo "${i}>=${crit_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 2 ]
        then
          return_code=2
          return_status="CRITICAL"
        elif [ $(echo "${i}>=${warn_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 1 ]
        then
          return_code=1
          return_status="WARNING"
        fi

        # Report metric in output
        return_metric="NETWORK"
      ;;
      "gpfs_throughput")
        # Report on read and write throughput
        if [ -z "$perfdata" ]
        then
          # First metric - concatenate read performance
          perfdata=" read=${i}B/s;${warn_thresh};${crit_thresh};0;"
          # Produce output
          output="Total read $(echo "scale=2; ${i}/1024/1024" | bc) MB/s"
        else
          # Second metric - concatenate write performance
          perfdata="${perfdata} write=${i}B/s;${warn_thresh};${crit_thresh};0;"
          # Produce output
          output="${output} write $(echo "scale=2; ${i}/1024/1024" | bc) MB/s"
        fi

        # Check if metric is above threshold
        if [ $(echo "${i}>=${crit_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 2 ]
        then
          return_code=2
          return_status="CRITICAL"
        elif [ $(echo "${i}>=${warn_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 1 ]
        then
          return_code=1
          return_status="WARNING"
        fi

        # Report metric in output
        return_metric="GPFS"
      ;;
      "operations")
        # Report on open/close and read/write operations
        if [ "$repeat" -eq 2 ]
        then
          # First repeat - open/close operations
          if [ -z "$perfdata" ]
          then
            # First metric - concatenate open operations
            perfdata=" open=${i}OP/s;${warn_thresh};${crit_thresh};0;"
            # Produce output
            output="Total open $(echo "scale=2; ${i}/1" | bc) OP/s"
          else
            # Second metric - concatenate close operations
            perfdata="${perfdata} close=${i}OP/s;${warn_thresh};${crit_thresh};0;"
            # Produce output
            output="${output} close $(echo "scale=2; ${i}/1" | bc) OP/s"
          fi
        else
          # Second repeat - read/write operations
          if [ "$count_metric_1" -eq 0 ]
          then
            count_metric_1=1
            # First metric - concatenate read operations
            perfdata="${perfdata} read=${i}OP/s;${warn_thresh};${crit_thresh};0;"
            # Produce output
            output="${output} read $(echo "scale=2; ${i}/1" | bc) OP/s"
          else
            # Second metric - concatenate write operations
            perfdata="${perfdata} write=${i}OP/s;${warn_thresh};${crit_thresh};0;"
            # Produce output
            output="${output} write $(echo "scale=2; ${i}/1" | bc) OP/s"
          fi
        fi

        # Check if metric is above threshold
        if [ $(echo "${i}>=${crit_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 2 ]
        then
          return_code=2
          return_status="CRITICAL"
        elif [ $(echo "${i}>=${warn_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 1 ]
        then
          return_code=1
          return_status="WARNING"
        fi

        # Report metric in output
        return_metric="OPERATIONS"
      ;;
return_metric="OPERATIONS" ;; "latency") # Report on open/close and read/write latency if [ "$repeat" -eq 2 ] then # First repeat - open/close latency if [ -z "$perfdata" ] then # First metric - concatenate open latency perfdata=" open=${i}ms;${warn_thresh};${crit_thresh};0;" # Produce output output="Total open $(echo "scale=2; ${i}/1" | bc) ms" else # Second metric - concatenate close latency perfdata="${perfdata} close=${i}ms;${warn_thresh};${crit_thresh};0;" # Produce output output="${output} close $(echo "scale=2; ${i}/1" | bc) ms" fi else # Second repeat - read/write latency if [ "$count_metric_1" -eq 0 ] then count_metric_1=1 # First metric - concatenate read latency perfdata="${perfdata} read=${i}ms;${warn_thresh};${crit_thresh};0;" # Produce output output="${output} read $(echo "scale=2; ${i}/1" | bc) ms" else # Second metric - concatenate write latency perfdata="${perfdata} write=${i}ms;${warn_thresh};${crit_thresh};0;" # Produce output output="${output} write $(echo "scale=2; ${i}/1" | bc) ms" fi fi # Check if metric is above threshold if [ $(echo "${i}>=${crit_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 2 ] then return_code=2 return_status="CRITICAL" elif [ $(echo "${i}>=${warn_thresh}" | bc) -eq 1 ] && [ "$return_code" -lt 1 ] then return_code=1 return_status="WARNING" fi # Report metric in output return_metric="LATENCY" ;; esac done # for i in $perfdata_raw # Count repeats (( repeat -= 1 )) done # while [ $repeat -gt 0 ] # Cleanup rm $tmp_file # Produce Nagios output echo "${return_metric} ${return_status} - ${output} |${perfdata}" exit $return_code