#! /bin/sh

# MIT License
#
# Copyright (c) 2016-2021 Josef Friedrich
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# OS has to be defined at the very start because it is used in the date
# functions below.
OS=$(uname)

########################################################################
# Date functions
########################################################################

# These date functions must be placed at the top of this file because
# they are used in some global variables.

# to_year ###

##
# Get the four digit year integer from now.
#
# Return:
#   The current 4 digit year.
##
_now_to_year() {
  date +%Y
}

##
# Convert a date in the format YYYY-MM-DD to a four digit year integer.
#
# Parameters:
#   $1: a date in the format YYYY-MM-DD
#
# Return:
#   four digit year integer
##
_date_to_year() {
  # The date is passed as its own quoted argument instead of being
  # glued into an unquoted option string that is word-split again.
  if [ "$OS" = 'Linux' ]; then
    date --date "$1" +%Y
  # FreeBSD, Darwin
  else
    date -j -f %Y-%m-%d "$1" +%Y
  fi
}

# to_datetime ###

##
# Convert a UNIX timestamp to a datetime string.
#
# Parameters:
#   $1: UNIX timestamp
#
# Return:
#   %Y-%m-%d.%H:%M:%S
##
_timestamp_to_datetime() {
  if [ "$OS" = 'Linux' ]; then
    date --date "@$1" +%Y-%m-%d.%H:%M:%S
  # FreeBSD, Darwin
  else
    date -j -f %s "$1" +%Y-%m-%d.%H:%M:%S
  fi
}

# to_timestamp ###

##
# Get the current UNIX timestamp.
#
# Return:
#   The current UNIX timestamp
##
_now_to_timestamp() {
  date +%s
}

##
# Convert a datetime in the ctime format to a Unix timestamp.
#
# ctime is a textual representation of a datetime derived from the
# c function ctime (https://en.cppreference.com/w/c/chrono/ctime).
# The ctime string has the following format: Www Mmm dd hh:mm:ss yyyy,
# where Www is the weekday, Mmm the month in letters, dd the day of the
# month, hh:mm:ss the time, and yyyy the year.
#
# see https://www.freebsd.org/cgi/man.cgi?query=strftime&sektion=3
# %c is replaced by national representation of time and date.
#
# Because %c is the *national* representation, the conversion is run
# with LC_ALL=C so that the English ctime string is parsed independently
# of the locale of the calling environment.
#
# Parameters:
#   $1: Www Mmm dd hh:mm:ss yyyy
#
# Return:
#   UNIX timestamp
##
_ctime_to_timestamp() {
  if [ "$OS" = 'Linux' ]; then
    LC_ALL=C date --date "$1" +%s
  # FreeBSD, Darwin
  else
    LC_ALL=C date -j -f %c "$1" +%s
  fi
}

########################################################################
# Global variables
########################################################################

PROJECT_PAGES='https://github.com/Josef-Friedrich/check_zpool_scrub
https://exchange.icinga.com/joseffriedrich/check_zpool_scrub
https://exchange.nagios.org/directory/Plugins/System-Metrics/File-System/check_zpool_scrub/details'

VERSION=2.0
FIRST_RELEASE=2016-09-08
SHORT_DESCRIPTION="Monitoring plugin to check how long ago the last \
ZFS scrub was performed."
USAGE="check_zpool_scrub v$VERSION
Copyright (c) $(_date_to_year "$FIRST_RELEASE")-$(_now_to_year) \
Josef Friedrich

$SHORT_DESCRIPTION

Usage: check_zpool_scrub

Options:
 -c, --critical=OPT_CRITICAL
    Interval in seconds for critical state.
 -p,--pool=OPT_POOL
    Name of the pool. If this option is omitted all pools are checked.
 -h, --help
    Show this help.
 -s, --short-description
    Show a short description / summary.
 -v, --version
    Show the version number.
 -w, --warning=OPT_WARNING
    Interval in seconds for warning state. Must be lower than -c.

Performance data:
 POOL is the name of the pool

 - POOL_last_ago
    Time interval in seconds for last scrub.
 - POOL_progress
    Percent 0 - 100
 - POOL_speed
    MB per second.
 - POOL_time
    Time to go in seconds.

Details about the implementation of this monitoring plugin:

This monitoring plugin grabs the last scrub date from the command
'zpool status POOL'.
"

# The state the plugin exits with
STATE=

# Exit codes
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

# Message of one pool
MESSAGE=
# Performance data of one pool
PERFORMANCE_DATA=

# Options from the command line interface
OPT_POOL=
# 1 month 60*60*24*31
OPT_WARNING=2678400
# 2 month 60*60*24*31*2
OPT_CRITICAL=5356800

# To avoid that the command 'zpool status pool' is called more than once
CURRENT_POOL=
ZPOOL_STATUS_STDOUT=
ZPOOL_STATUS_EXIT_CODE=

##
# Parse the command line options (short options and GNU style long
# options in the form --name or --name=value) and store the results in
# the global variables OPT_CRITICAL, OPT_POOL and OPT_WARNING.
#
# The informational options (help, short description, version) print
# their text and exit with 0. Usage errors print a message to stderr
# and exit with 2 (invalid option), 3 (missing argument) or 4
# (unexpected argument).
#
# Parameters:
#   $@: The arguments to parse.
##
_getopts() {
  # LONG_OPTARG is declared local so it does not leak into the global
  # scope.
  local OPT LONG_OPTARG
  # Restart the parser so the function also works when it is called
  # more than once in the same shell.
  OPTIND=1
  # The leading ':' enables silent error reporting, '-:' is the trick
  # that lets getopts hand over long options as the argument of '-'.
  while getopts ':c:hp:svw:-:' OPT; do
    case $OPT in
      c) OPT_CRITICAL=$OPTARG ;;
      h)
        echo "$USAGE"
        exit 0
        ;;
      p) OPT_POOL="$OPTARG" ;;
      s)
        echo "$SHORT_DESCRIPTION"
        exit 0
        ;;
      v)
        echo "$VERSION"
        exit 0
        ;;
      w) OPT_WARNING=$OPTARG ;;
      \?)
        echo "Invalid option “-$OPTARG”!" >&2
        exit 2
        ;;
      :)
        echo "Option “-$OPTARG” requires an argument!" >&2
        exit 3
        ;;
      -)
        # Everything after the first '=' is the value of a long option.
        LONG_OPTARG="${OPTARG#*=}"
        case $OPTARG in
          critical=?*) OPT_CRITICAL=$LONG_OPTARG ;;
          help)
            echo "$USAGE"
            exit 0
            ;;
          pool=?*) OPT_POOL="$LONG_OPTARG" ;;
          short-description)
            echo "$SHORT_DESCRIPTION"
            exit 0
            ;;
          version)
            echo "$VERSION"
            exit 0
            ;;
          warning=?*) OPT_WARNING=$LONG_OPTARG ;;
          help* | short-description* | version*)
            echo "No argument allowed for the option “--$OPTARG”!" >&2
            exit 4
            ;;
          critical* | pool* | warning*)
            echo "Option “--$OPTARG” requires an argument!" >&2
            exit 3
            ;;
          '') # "--" terminates argument processing
            break
            ;;
          *)
            echo "Invalid option “--$OPTARG”!" >&2
            exit 2
            ;;
        esac
        ;;
    esac
  done
}

##
# Detect the date of the last scrub.
#
# Grab the date string of the last scrub from the command 'zpool
# status'. Grabbing the date from 'zpool history' is very slow on old
# pools with a lot of history.
#
# Format strings of the scan line and example output:
#
# /* If there's never been a scan, there's not much to say. */
#   "none requested\n"
#   -> scan: none requested
#
# /* Scan is finished or canceled. */
#   "scrub repaired %s in %lluh%um with %llu errors on %s"
#   -> scan: scrub repaired 0 in 0h0m with 0 errors on Mon Aug  6 16:30:52 2018
#
#   "resilvered %s in %lluh%um with %llu errors on %s"
#
#   "scrub canceled on %s"
#
#   "resilver canceled on %s"
#
# /* Scan is in progress. */
#
#   "scrub in progress since %s"
#
#   "resilver in progress since %s"
#
# Parameters:
#   $1: The name of the pool.
#
# Return:
#   A UNIX timestamp. Nothing is printed if the scan line contains no
#   date (for example 'none requested').
##
_grab_last_scrub_timestamp() {
  local CTIME
  # Only the ' scan: ' line is inspected; the date is whatever follows
  # one of the three known phrases up to the end of that line.
  CTIME="$(_get_zpool_status_stdout "$1" |
    sed -n -E '/ scan: /s/^.*(canceled on|in progress since|errors on) (.*)$/\2/p')"
  if [ -n "$CTIME" ]; then
    _ctime_to_timestamp "$CTIME"
  fi
}

##
# Assemble the performance data of one pool.
#
# The four metrics are printed on one line, separated by a single
# space. The labels are wrapped in single quotes if the pool name
# contains a space, otherwise the space would split the label.
#
# Parameters:
#   $1: POOL
#   $2: LAST_AGO
#   $3: PROGRESS
#   $4: SPEED
#   $5: TIME
##
_performance_data_one_pool() {
  local PREFIX QUOTE
  PREFIX="$1_"
  QUOTE=
  case $1 in
    *' '*) QUOTE="'" ;;
  esac
  printf '%s %s %s %s\n' \
    "${QUOTE}${PREFIX}last_ago${QUOTE}=$2s;$OPT_WARNING;$OPT_CRITICAL;0" \
    "${QUOTE}${PREFIX}progress${QUOTE}=$3%;;;0;100" \
    "${QUOTE}${PREFIX}speed${QUOTE}=$4" \
    "${QUOTE}${PREFIX}time${QUOTE}=$5s"
}

##
# Grab the scrub progress from the 'zpool status' output.
#
# Parameters:
#   $1: The name of the pool.
#
# Return:
#   A floating point number from 0 to 100 that represents the progress
#   (for example '85.3'). 100 is returned if the output contains no
#   progress information.
##
_grab_progress() {
  local PROGRESS
  # The decimal separator may be a comma or a dot in the examples this
  # script documents; the result is normalized to a dot.
  PROGRESS="$(_get_zpool_status_stdout "$1" |
    sed -n -E '/ done/s/^.*, ([[:digit:]]{1,3}[,.][[:digit:]]{1,2})% done.*$/\1/p' |
    tr ',' '.')"
  if [ -n "$PROGRESS" ]; then
    printf '%s\n' "$PROGRESS"
  else
    echo 100
  fi
}

##
# Grab the scrub speed from the 'zpool status' output.
#
# Only lines that contain ' scanned' are inspected and the rate has to
# start with a digit, so that a '/s' inside a device path of the config
# section is not mistaken for a rate. The first rate found is used.
#
# The unit suffix of the rate is converted to MB per second: K is
# divided by 1024, G, T, P and E are multiplied accordingly, a rate
# with the suffix B or without a suffix is treated as bytes per second.
#
# Parameters:
#   $1: The name of the pool.
#
# Return:
#   A floating point number like '57.4' or '1.9' in M/s. 0 is returned
#   if no rate was found.
##
_grab_speed() {
  local RATE NUMBER SUFFIX UNIT
  RATE="$(_get_zpool_status_stdout "$1" |
    grep ' scanned' |
    grep -E -o '[[:digit:]][[:digit:].,]*[[:alpha:]]*/s' |
    head -n 1)"
  if [ -z "$RATE" ]; then
    echo 0
    return
  fi

  # '57,4M/s' -> '57.4M'
  RATE="$(printf '%s' "${RATE%/s}" | tr ',' '.')"
  # Split into the numeric part and the unit suffix.
  NUMBER="${RATE%%[!0-9.]*}"
  SUFFIX="${RATE#"$NUMBER"}"
  # Only the first letter of the suffix decides about the unit.
  UNIT="${SUFFIX%"${SUFFIX#?}"}"

  case $UNIT in
    # Already MB per second: print the number unmodified.
    M) printf '%s\n' "$NUMBER" ;;
    K) awk -v n="$NUMBER" 'BEGIN { print n / 1024 }' ;;
    G) awk -v n="$NUMBER" 'BEGIN { print n * 1024 }' ;;
    T) awk -v n="$NUMBER" 'BEGIN { print n * 1024 * 1024 }' ;;
    P) awk -v n="$NUMBER" 'BEGIN { print n * 1024 * 1024 * 1024 }' ;;
    E) awk -v n="$NUMBER" 'BEGIN { print n * 1024 * 1024 * 1024 * 1024 }' ;;
    # 'B' or no suffix at all: bytes per second.
    *) awk -v n="$NUMBER" 'BEGIN { print n / (1024 * 1024) }' ;;
  esac
}

##
# Turn a string of digits into a decimal number without leading zeros.
#
# Some sh implementations (can) interpret numbers with leading zeros as
# octal numbers.
#
# Parameters:
#   $1: A string of digits, for example '07'.
#
# Return:
#   The number without leading zeros. 0 is returned for an empty string,
#   for a string of zeros and for a string that contains anything else
#   than digits.
##
_remove_leading_zeros() {
  local NUMBER
  case $1 in
    '' | *[!0-9]*)
      echo 0
      return
      ;;
  esac
  # ${1%%[!0]*} is the run of leading zeros, which is then cut off.
  NUMBER="${1#"${1%%[!0]*}"}"
  echo "${NUMBER:-0}"
}

##
# Extract the time to go from the 'zpool status' output.
#
# Responsible for the output see the ZFS source code:
# - https://github.com/openzfs/openzfs/blob/ed81aacb0d0fcbf7e0c0745ea4556655050c26bf/usr/src/cmd/zpool/zpool_main.c#L4441
# - https://github.com/openzfs/zfs/blob/a1ca39655ba446c98a2f44b2b1d45d1059738885/cmd/zpool/zpool_main.c#L7537
#
# For example:
#   Input:
#     ...
#     scan: scrub in progress since Sun Aug 13 00:24:02 2017
#     7,34T scanned out of 10,1T at 57,4M/s, 14h12m to go
#     0 repaired, 72,38% done
#     ...
#
# Two notations are understood: '[N days ]HH:MM:SS to go' and
# 'NhNm to go'.
#
# Other solutions:
#
# Not working on FreeBSD:
#   grep -P -o '(?<=, )[[:digit:]]*h[[:digit:]]*m(?= to go)'
#
# Parameters:
#   $1: The name of the pool.
#
# Return:
#   The time to go in seconds as an integer number. 0 is returned if
#   the output contains no time to go.
##
_grab_time_to_go() {
  # The seconds are stored in SECS and not in SECONDS, because SECONDS
  # is a special variable in bash (and /bin/sh may be bash).
  local LINE DURATION DAYS HOURS MINUTES SECS
  DAYS=0
  HOURS=0
  MINUTES=0
  SECS=0

  # The line is fetched once and reused for all extractions.
  LINE="$(_get_zpool_status_stdout "$1" | grep ' to go')"

  if printf '%s\n' "$LINE" | grep -q '..:..:.. to go'; then
    if printf '%s\n' "$LINE" | grep -q days; then
      DAYS="$(printf '%s\n' "$LINE" | sed -E 's/^.* (.*) days .*$/\1/')"
    fi
    HOURS="$(printf '%s\n' "$LINE" | sed -E 's/^.* (..):.*$/\1/')"
    MINUTES="$(printf '%s\n' "$LINE" | sed -E 's/^.* ..:(..):.*$/\1/')"
    SECS="$(printf '%s\n' "$LINE" | sed -E 's/^.* ..:..:(..).*$/\1/')"
  elif printf '%s\n' "$LINE" | grep -q 'h.*m.*to go'; then
    # '..., 14h12m to go' -> '14h12m'
    DURATION="$(printf '%s\n' "$LINE" | sed -E 's/^.*, (.*h.*m) to go.*$/\1/')"
    HOURS="$(printf '%s\n' "$DURATION" | sed 's/h.*//')"
    MINUTES="$(printf '%s\n' "$DURATION" | sed 's/.*h//' | sed 's/m//')"
  fi

  DAYS="$(_remove_leading_zeros "$DAYS")"
  HOURS="$(_remove_leading_zeros "$HOURS")"
  MINUTES="$(_remove_leading_zeros "$MINUTES")"
  SECS="$(_remove_leading_zeros "$SECS")"

  # 1440 is the number of minutes per day.
  echo $(((DAYS * 1440 + HOURS * 60 + MINUTES) * 60 + SECS))
}

##
# Get the stdout of the command 'zpool status'
# To avoid that the command 'zpool status' is executed more than once.
#
# The output is cached in ZPOOL_STATUS_STDOUT for the pool named in
# CURRENT_POOL, the exit code of 'zpool status' is stored in
# ZPOOL_STATUS_EXIT_CODE. The cache only survives if the function is
# called once directly (not in a pipeline or a command substitution),
# as _check_one_pool does.
#
# Parameters:
#   $1: The name of the pool.
##
_get_zpool_status_stdout() {
  if [ -n "$ZPOOL_STATUS_STDOUT" ] && [ "$CURRENT_POOL" = "$1" ]; then
    printf '%s\n' "$ZPOOL_STATUS_STDOUT"
    return
  fi
  CURRENT_POOL=$1
  # The redirection has to be inside of the command substitution to
  # silence the error messages of zpool.
  ZPOOL_STATUS_STDOUT="$(zpool status "$1" 2> /dev/null)"
  ZPOOL_STATUS_EXIT_CODE=$?
  printf '%s\n' "$ZPOOL_STATUS_STDOUT"
}

##
# Check one ZFS pool.
#
# The result is stored in the global variables STATE, MESSAGE and
# PERFORMANCE_DATA.
#
# Parameters:
#   $1: The name of the pool.
##
_check_one_pool() {
  local POOL NOW LAST_SCRUB DIFF PROGRESS SPEED TIME
  POOL="$1"

  # Reset the cache and fill it in the current shell, so that the
  # _grab_* functions (which run in subshells) can read it.
  CURRENT_POOL=
  ZPOOL_STATUS_STDOUT=
  ZPOOL_STATUS_EXIT_CODE=
  _get_zpool_status_stdout "$POOL" > /dev/null 2>&1

  if [ "$ZPOOL_STATUS_EXIT_CODE" != 0 ]; then
    PERFORMANCE_DATA=
    MESSAGE="UNKNOWN: '${POOL}' is no ZFS pool."
    STATE=$STATE_UNKNOWN
    return
  fi

  NOW=$(_now_to_timestamp)
  LAST_SCRUB=$(_grab_last_scrub_timestamp "$POOL")

  if [ -z "$LAST_SCRUB" ]; then
    PERFORMANCE_DATA=
    MESSAGE="UNKNOWN: The pool '${POOL}' has never had a scrub."
    STATE=$STATE_UNKNOWN
    return
  fi

  DIFF=$((NOW - LAST_SCRUB))
  PROGRESS="$(_grab_progress "$POOL")"
  SPEED="$(_grab_speed "$POOL")"
  TIME="$(_grab_time_to_go "$POOL")"

  STATE=$STATE_UNKNOWN
  if [ "$DIFF" -gt "$OPT_CRITICAL" ]; then
    STATE=$STATE_CRITICAL
    MESSAGE="CRITICAL:"
  elif [ "$DIFF" -gt "$OPT_WARNING" ]; then
    STATE=$STATE_WARNING
    MESSAGE="WARNING:"
  else
    STATE=$STATE_OK
    MESSAGE="OK:"
  fi

  PERFORMANCE_DATA="$(_performance_data_one_pool \
    "${POOL}" "${DIFF}" "${PROGRESS}" "${SPEED}" "${TIME}")"
  MESSAGE="$MESSAGE The last scrub on zpool '${POOL}' was \
performed on $(_timestamp_to_datetime "$LAST_SCRUB")."
}

##
# Check multiple ZFS pools and combine the results.
#
# The messages and the performance data of all pools are joined by a
# space. The combined state is the worst state of all pools in the
# order critical, warning, unknown, ok. The result is stored in the
# global variables STATE, MESSAGE and PERFORMANCE_DATA.
#
# Parameters:
#   $@: The names of the pools.
##
_check_multiple_pools() {
  local POOL GLOBAL_STATE MESSAGES PERFORMANCE_DATAS
  GLOBAL_STATE=$STATE_OK
  MESSAGES=
  PERFORMANCE_DATAS=

  # Without this guard an empty pool list results in an empty message
  # and the state OK.
  if [ $# -eq 0 ]; then
    PERFORMANCE_DATA=
    MESSAGE='UNKNOWN: No ZFS pools found.'
    STATE=$STATE_UNKNOWN
    return
  fi

  for POOL in "$@"; do
    _check_one_pool "$POOL"

    if [ -z "${PERFORMANCE_DATAS}" ]; then
      PERFORMANCE_DATAS="${PERFORMANCE_DATA}"
    elif [ -n "${PERFORMANCE_DATA}" ]; then
      PERFORMANCE_DATAS="${PERFORMANCE_DATAS} ${PERFORMANCE_DATA}"
    fi

    if [ -n "${MESSAGES}" ]; then
      MESSAGES="${MESSAGES} ${MESSAGE}"
    else
      MESSAGES="${MESSAGE}"
    fi

    # If one pool is critical the global state turns to
    # critical.
    if [ "${STATE}" -eq "$STATE_CRITICAL" ]; then
      GLOBAL_STATE=$STATE_CRITICAL
    # warning: must not override critical
    elif [ "${STATE}" -eq "$STATE_WARNING" ]; then
      if [ "${GLOBAL_STATE}" -ne "$STATE_CRITICAL" ]; then
        GLOBAL_STATE=$STATE_WARNING
      fi
    # unknown: must neither override warning nor critical
    elif [ "${STATE}" -eq "$STATE_UNKNOWN" ]; then
      if [ "${GLOBAL_STATE}" -ne "$STATE_WARNING" ] &&
        [ "${GLOBAL_STATE}" -ne "$STATE_CRITICAL" ]; then
        GLOBAL_STATE=$STATE_UNKNOWN
      fi
    fi
  done

  PERFORMANCE_DATA="${PERFORMANCE_DATAS}"
  MESSAGE="${MESSAGES}"
  STATE="$GLOBAL_STATE"
}

##
# Test if a string is a non-negative integer (digits only).
#
# Parameters:
#   $1: The string to test.
#
# Return:
#   Exit code 0 if the string consists of digits only, else 1.
##
_is_non_negative_integer() {
  case $1 in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

##
# Main function to jump in on execution.
#
# Parse the options, check one pool (option -p) or all pools listed by
# 'zpool list', print the message (followed by the performance data)
# and exit with the resulting state.
#
# Parameters:
#   $@: All positional arguments provided by the script.
##
_main() {
  local POOLS OLD_IFS

  _getopts "$@"

  # The intervals are compared with integer tests later on.
  if ! _is_non_negative_integer "$OPT_WARNING" ||
    ! _is_non_negative_integer "$OPT_CRITICAL"; then
    echo 'The intervals of the options -w and -c must be non-negative integers (seconds).' >&2
    echo "$USAGE" >&2
    exit "$STATE_UNKNOWN"
  fi

  if [ "$OPT_WARNING" -gt "$OPT_CRITICAL" ]; then
    echo 'OPT_WARNING (-w) must be smaller than OPT_CRITICAL (-c).' >&2
    echo "$USAGE" >&2
    exit "$STATE_UNKNOWN"
  fi

  if [ -n "$OPT_POOL" ]; then
    _check_one_pool "$OPT_POOL"
  else
    POOLS="$(zpool list -H -o name)"
    # Split the list at newlines only (and without globbing), so that a
    # pool name containing a space stays one argument.
    OLD_IFS=$IFS
    IFS='
'
    set -f
    # shellcheck disable=SC2086
    set -- $POOLS
    set +f
    IFS=$OLD_IFS
    _check_multiple_pools "$@"
  fi

  if [ -n "$PERFORMANCE_DATA" ]; then
    MESSAGE="$MESSAGE | ${PERFORMANCE_DATA}"
  fi

  echo "$MESSAGE"
  exit "$STATE"
}

## This SEPARATOR is required for test purposes. Please don’t remove! ##

_main "$@"