#!/usr/bin/env bash
# Written by Rudy Broersma, Duocast BV
# NVMe functionality added by Lukas de Boer
#
# This script needs work:
# We assume Samsung SSDs or Intel SSDs
# We assume 'Wear_Leveling_Count' or 'Media_Wearout_Indicator' (can be different for different manufacturers)
#
# According to http://www.anandtech.com/show/8239/update-on-samsung-850-pro-endurance-vnand-die-size
# a drive should continue to work fine even when the value reaches 0, because:
# "according to JEDEC spec the P/E cycle rating is with one-year data retention,
# meaning that there can still be plenty of life left."
#
# I purposely set the values to 90 (warning) and 80 (critical) to get very early warnings
# to see if this check works okay. We can later set it to reasonable values such as 1
# and 10
#
# WARNING: Block device naming is dependent on device creation time.
# So, if you create a new RAID array on an already running system there is the possibility
# that e.g. sda becomes sdb. If this is the case, you need to reboot in order for this
# script to work properly.
#
# ChangeLog:
# Version Change
# 1.0     Initial release
# 1.1     Support for multiple LSI controllers
# 1.2     Correctly identify which blockdevice (/dev/sdX) belongs to which controller
# 1.3     Replace lsblk -S with lsscsi. -S flag on lsblk is not available on Wheezy
# 1.4     Add support for NVMe disks
# 1.5     Bugfix: Actually run checkTooling before using the tools that need checking
# 1.6     Add support for multiple controllers (only auto and lsi as of this moment)
# 1.7     Add support for ignoring specific models from a brand mentioned in $BRAND
# 1.8     Fix bug if drive supports both 177 and 233 outputs
# 1.9     Ignore ZFS devices
# 2.0     Also add Samsung as brand (not all uppercase)
# 2.1     Add fix for Samsung SSDs on NVMe RAID Controller. New indicator.
# 2.2     Fix for nvme-cli, now requires brand plugin. Currently only Intel is supported.
# 2.3     Fix leading zero in WLC
#
# TODO:
# The test -x $STORCLI should be removed.
# Multicontroller support should be rewritten. It's ugly now.

# ---------------------------------------------------------------------------
# Configuration and global state
# ---------------------------------------------------------------------------
SUDO="sudo" # emptied by --nosudo
VERSION="2.3"
STORCLI="/usr/bin/storcli"
TWARE="/usr/bin/tw_cli"
SMARTCTL="/usr/sbin/smartctl"
NVMECLI="/usr/sbin/nvme"
LSBLK="/bin/lsblk"

MESSAGE=""
EXITCODE=0
PREFIX="SSD OK"
TESTMODE=0
DEBUG=0
HAS_SSD=0
BRAND="SAMSUNG\|INTEL\|Samsung"
IGNORE="HD642JJ" # This is a Samsung rotating harddisk, that we should ignore

# Remaining-life thresholds on the 0-100 scale reported by the drives:
# below WEAR_WARN_BELOW -> WARNING, below WEAR_CRIT_BELOW -> CRITICAL.
WEAR_WARN_BELOW=30
WEAR_CRIT_BELOW=20

# NOTE(review): this plugin has always exited with 4 for "UNKNOWN"; the
# Nagios plugin convention is 3. Kept at 4 so existing callers see no change.

# Per-controller tables, all indexed by the same sequence number 0..n-1.
declare -a DRIVERS=()     # smartctl device type: "auto", "megaraid" or "3ware"
declare -a CONTROLLERS=() # controller id as reported by the vendor tool
declare -a BLOCKDEVICE=() # device node handed to smartctl
declare -a DIDLIST=()     # whitespace-separated drive ids / device nodes
CONTROLLERCOUNT=0

# ---------------------------------------------------------------------------
# Small helpers
# ---------------------------------------------------------------------------

# Print the arguments, but only when --debug was given.
debug() {
  if [[ $DEBUG -eq 1 ]]; then
    echo "$*"
  fi
}

# Run a command through $SUDO, or directly when --nosudo emptied it.
asRoot() {
  # $SUDO is deliberately unquoted so that an empty value simply disappears.
  # shellcheck disable=SC2086
  $SUDO "$@"
}

# Exit with the plugin's UNKNOWN status when a required tool is missing.
# Arguments: $1 - path of the tool, $2 - name used in the error message
requireTool() {
  if [[ ! -f $1 ]]; then
    echo "UNKNOWN: $2 not found"
    exit 4
  fi
}

# Record a new plugin status without ever downgrading a more severe one.
# CRITICAL always wins, UNKNOWN wins over everything but CRITICAL, and
# WARNING only replaces OK.
# Arguments: $1 - exit code (1, 2 or 4), $2 - prefix for the output line
raiseStatus() {
  case $1 in
    1) [[ $EXITCODE -eq 0 ]] || return 0 ;;
    4) [[ $EXITCODE -ne 2 ]] || return 0 ;;
  esac
  EXITCODE=$1
  PREFIX=$2
}

# Print every non-rotational block device as "/dev/<name>", one per line.
# Arguments: $1 - optional grep pattern the lsblk line has to match
listLocalSSDs() {
  "$LSBLK" -d -o name,rota \
    | grep -- "${1:-}" \
    | awk '$2 == 0 { print "/dev/" $1 }'
}

# Print the controller ids listed by "storcli show", one per line.
# Controller rows are the only lines that start with blanks and a digit.
listStorcliControllers() {
  asRoot "$STORCLI" show | awk '/^ +[0-9]/ { print $1 }'
}

# Print the device node lsscsi reports for SCSI host $1 (last match wins).
# The node is taken from the last column so that vendor/model strings that
# contain blanks cannot shift it; optical drives are ignored.
findBlockDevice() {
  lsscsi \
    | grep -v "cd/dvd" \
    | awk -v host="$1" '
        {
          hctl = $1
          gsub(/\[/, "", hctl)
          gsub(/\]/, "", hctl)
          split(hctl, part, ":")
          if (part[1] == host) dev = $NF
        }
        END { if (dev != "") print dev }'
}

# Fill the per-controller tables for one MegaRAID controller.
# Arguments: $1 - sequence number in the tables, $2 - storcli controller id
addMegaraidController() {
  DRIVERS[$1]="megaraid"
  CONTROLLERS[$1]=$2
  BLOCKDEVICE[$1]=$(findBlockDevice "$2")
  DIDLIST[$1]=$(asRoot "$STORCLI" "/c$2" show | awk '/SSD/ { print $2 }')
  debug "Found controller #${CONTROLLERS[$1]} with associated blockdevice ${BLOCKDEVICE[$1]}"
}

# Print "intel" or "samsung" depending on what smartctl reports for the NVMe
# device $1; print nothing for any other vendor.
nvmeVendor() {
  local info
  info=$(asRoot "$SMARTCTL" -a "$1")
  if grep -qi intel <<<"$info"; then
    echo "intel"
  elif grep -qi "samsung\|sabrent" <<<"$info"; then
    echo "samsung"
  fi
}

# Print the remaining-life value smartctl reports, or nothing if unreadable.
# Attribute 177/233 (normalized value) is preferred; otherwise the value is
# derived from "Percentage used endurance indicator" as 100 - used.
# Arguments: $1 - smartctl device type (-d), $2 - device node
readSmartWear() {
  local dtype=$1 dev=$2 normalized used
  normalized=$(asRoot "$SMARTCTL" -A -d "$dtype" "$dev" \
    | grep "^177\|^233" | tail --lines 1 | awk '{ print $4 }')
  if [[ $normalized =~ ^[0-9]+$ ]]; then
    echo $((10#$normalized)) # 10# strips leading zeros ("099" is not octal)
    return
  fi
  used=$(asRoot "$SMARTCTL" -H -d "$dtype" "$dev" \
    | grep "^Percentage used endurance indicator" | tail --lines 1 \
    | sed "s/%//g" | awk '{ print $5 }')
  if [[ $used =~ ^[0-9]+$ ]]; then
    echo $((100 - 10#$used))
  fi
}

# Append the verdict for one drive to MESSAGE and update the plugin status.
# Arguments: $1 - drive, $2 - controller sequence number,
#            $3 - remaining life (integer, may be negative) or empty
reportWear() {
  local drive=$1 ctl=$2 wear=$3
  if [[ ! $wear =~ ^-?[0-9]+$ ]] || ((wear > 100)); then
    MESSAGE+="Drive $drive on $ctl UNKNOWN WLC/MWI (${wear:-unreadable}). "
    raiseStatus 4 "SSD UNKNOWN"
  elif ((wear >= WEAR_WARN_BELOW)); then
    MESSAGE+="Drive $drive on $ctl WLC/MWI $wear. "
  elif ((wear >= WEAR_CRIT_BELOW)); then
    MESSAGE+="Drive $drive on $ctl MEDIUM WLC/MWI ($wear). "
    raiseStatus 1 "SSD WARNING"
  else
    # Also covers negative values (more than 100% of rated endurance used).
    MESSAGE+="Drive $drive on $ctl CRITICAL WLC/MWI ($wear). "
    raiseStatus 2 "SSD CRITICAL"
  fi
}

# Print the help text.
usage() {
  cat <<EOF
check_ssd - version $VERSION.

This tool can be used to check the life expectancy of a SSD drive by checking wear leveling indicators using smartmontools.
You can specify the following parameters:


-c=, --card=    Instead of autodetecting using lspci, set the card type. We accept "lsi", "3ware" and "auto" for now. Auto is autodetect
-d=, --device=  The blockdevice we should use for calling smartmontools. Can be any blockdevice in /dev, but is driver specific
-b=, --brand=   The brand of SSD to search for. We accept "samsung" and "intel"

-d, --debug     Enable debug output
-t, --test      Only test if there are SSDs present in the system. exits with 0 when found, 1 if not
-n, --nosudo    Do not prefix the storage tools with sudo
-h, --help      Show what you are reading now!
-v, --version   Show version



EOF
}

# ---------------------------------------------------------------------------
# Parse commandline parameters
# ---------------------------------------------------------------------------
for arg in "$@"; do
  case $arg in
    -c=* | --card=*)
      CARD="${arg#*=}"
      ;;
    -d=* | --device=*)
      DEVICE="${arg#*=}"
      ;;
    -b=* | --brand=*)
      BRAND="${arg#*=}"
      BRAND="${BRAND^^}"
      ;;
    -h | --help)
      usage
      exit
      ;;
    -t | --test)
      TESTMODE=1
      ;;
    -d | --debug)
      DEBUG=1
      ;;
    -n | --nosudo)
      SUDO=""
      ;;
    -v | --version)
      echo "$VERSION"
      exit
      ;;
    --default)
      # Accepted for backward compatibility; has no effect.
      ;;
    *)
      # unknown option
      ;;
  esac
done
# End parsing parameters

# ---------------------------------------------------------------------------
# Main steps
# ---------------------------------------------------------------------------

# Decide which controller type to handle and store it in CONTROLLER.
# --card wins; otherwise "auto" when lsblk shows any non-rotational device,
# else whatever lspci lists as RAID hardware.
function searchController {
  local local_ssds
  if [[ -n ${CARD+x} ]]; then
    CONTROLLER=${CARD,,} # ,, means lowercase
    return
  fi
  # Check for local devices
  local_ssds=$(listLocalSSDs | wc -l)
  if ((local_ssds > 0)); then
    CONTROLLER="auto"
  else
    CONTROLLER=$(lspci | grep RAID)
  fi
}

# Verify that every tool needed for the detected setup is installed.
# Reads CONTROLLER, so searchController has to run first.
function checkTooling {
  requireTool "$LSBLK" "lsblk"
  case ${CONTROLLER,,} in
    *lsi*) requireTool "$STORCLI" "storcli tool" ;;
    *3ware*) requireTool "$TWARE" "tw_cli tool" ;;
  esac
  # Only check for nvme-cli if we have NVMe devices
  if listLocalSSDs | grep -q nvme; then
    requireTool "$NVMECLI" "nvme-cli"
  fi
  requireTool "$SMARTCTL" "smartctl"
}

# Populate DRIVERS, CONTROLLERS, BLOCKDEVICE and DIDLIST for CONTROLLER.
# Exits with UNKNOWN when the controller type is not recognised.
function getDIDlist {
  local idx ctl tware_cid wanted local_ssds
  case ${CONTROLLER,,} in
    *lsi*)
      debug "LSI/AVAGO controller detected"
      idx=0
      for ctl in $(listStorcliControllers); do
        addMegaraidController "$idx" "$ctl"
        idx=$((idx + 1))
      done
      ;;
    *3ware*)
      debug "3ware controller detected"
      tware_cid=$(asRoot "$TWARE" show | grep ^c | awk '{ print $1 }' | head --lines 1)
      DRIVERS[0]="3ware"
      CONTROLLERS[0]=0
      # This seems to be the proper device regardless of which unit (and thus
      # device name) the drive belongs to
      BLOCKDEVICE[0]="/dev/twl0"
      DIDLIST[0]=$(asRoot "$TWARE" "/$tware_cid" show | grep "$BRAND" \
        | grep -v "$IGNORE" | awk '{ print $1 }' | sed "s/p//g")
      ;;
    *auto*)
      debug "auto controller detected"
      # lsblk prints bare names, so accept --device=/dev/sda as well as sda.
      wanted=${DEVICE:-}
      wanted=${wanted#/dev/}
      local_ssds=$(listLocalSSDs "$wanted")
      DRIVERS[0]="auto"
      CONTROLLERS[0]=0
      BLOCKDEVICE[0]=$local_ssds
      DIDLIST[0]=$local_ssds
      # Additionally pick up drives behind MegaRAID controllers, if storcli
      # is installed.
      idx=0
      if [[ -f $STORCLI ]]; then
        for ctl in $(listStorcliControllers); do
          debug "LSI controller detected"
          idx=$((idx + 1))
          addMegaraidController "$idx" "$ctl"
        done
      fi
      ;;
    *)
      echo "UNKNOWN: Unknown controller or no controller found"
      exit 4
      ;;
  esac
}

# Set CONTROLLERCOUNT and flag HAS_SSD when any controller lists a drive.
function hasSSDs {
  local i
  CONTROLLERCOUNT=${#CONTROLLERS[@]}
  debug "Found $CONTROLLERCOUNT controllers"
  for ((i = 0; i < CONTROLLERCOUNT; i++)); do
    if [[ -n ${DIDLIST[$i]//[[:space:]]/} ]]; then
      debug "Found SSD disk on controller $i"
      HAS_SSD=1
    else
      debug "Found no disk(s) on controller $i"
    fi
  done
}

# Check every drive of one controller and record the verdicts.
# Arguments: $1 - controller sequence number (index into the tables)
function checkDID() {
  local ctl=$1
  local driver=${DRIVERS[$ctl]}
  local d wear

  debug "Checking DID on controller $ctl"
  # DIDLIST entries are whitespace-separated lists; splitting is intended.
  for d in ${DIDLIST[$ctl]}; do
    debug "Checking DID number $d on controller $ctl with driver $driver"
    wear=""

    if [[ $driver == "auto" ]]; then
      if [[ $d == /dev/zd* ]]; then
        # ZFS zvols show up as non-rotational but have no SMART data.
        debug "Skipping"
        continue
      elif [[ $d == /dev/nvme* ]]; then
        case $(nvmeVendor "$d") in
          intel)
            wear=$(asRoot "$NVMECLI" intel smart-log-add "$d" \
              | awk '/^wear_leveling/ { gsub(/%/, ""); print $3 }')
            ;;
          samsung)
            debug "checking samsung"
            wear=$(asRoot "$NVMECLI" smart-log "$d" \
              | awk '/^available_spare[ \t]/ { gsub(/%/, ""); print $3 }')
            ;;
          *)
            # No known indicator for this vendor; do not reuse a stale value.
            debug "Skipping $d: unsupported NVMe vendor"
            continue
            ;;
        esac
        if [[ $wear =~ ^[0-9]+$ ]]; then
          wear=$((10#$wear)) # remove leading zeros
        else
          wear=""
        fi
      else
        wear=$(readSmartWear "auto" "$d")
        if [[ -z $wear ]]; then
          # Device exposes no wear indicator (e.g. not really an SSD).
          continue
        fi
      fi
    else
      wear=$(readSmartWear "${driver},${d}" "${BLOCKDEVICE[$ctl]}")
    fi

    reportWear "$d" "$ctl" "$wear"
  done
}

# Run checkDID for every controller that lists at least one drive.
function checkSSDs {
  local seq
  for ((seq = 0; seq < CONTROLLERCOUNT; seq++)); do
    debug "Checking controller $seq"
    if [[ -z ${DIDLIST[$seq]} ]]; then
      debug "Controller $seq has no SSDs"
    else
      debug "Controller $seq does have SSDs"
      checkDID "$seq"
    fi
  done
}

# lsblk is needed to detect the controller, and the controller has to be
# known before the controller-specific tools can be checked.
requireTool "$LSBLK" "lsblk"
searchController
checkTooling
getDIDlist
hasSSDs

if [[ $TESTMODE -eq 1 ]]; then
  if [[ $HAS_SSD -eq 0 ]]; then
    debug "No SSDs found in the system on controller $CONTROLLER"
    exit 1
  fi
  debug "SSDs are found in the system on controller $CONTROLLER"
  exit 0
fi

if [[ $HAS_SSD -ne 0 ]]; then
  checkSSDs
else
  MESSAGE="No SSDs found in this system on controller $CONTROLLER"
  EXITCODE=4
  PREFIX="UNKNOWN"
fi

MESSAGE="$PREFIX: $MESSAGE"
printf '%s\n' "${MESSAGE% }" # drop the separator left after the last drive
exit "$EXITCODE"