#!/bin/bash # # Nagios plugin for Software/Hardware RAID and non-RAID storages; asmo@conseev # # Supported RAID controllers: LSI, 3ware, Areca, Adaptec # notes: # - For Areca - CLI version <= 1.14.2 is required # - For Adaptec - pcre2grep tool (pcre.org) is required for SMART regexp # - For Adaptec - compat-libstdc++ package is required, CLI is dynamically linked # # usage: ./check-storage # ./check-storage show_error_counters # display counters of critical events # # v0.4(4) - 22-01-2020 # ### STATE_OK=0 STATE_CRITICAL=2 # Detect all controllers and check each controller's health, all VDs per controller # and all physical drives behind controller # LSI RAID Controllers function lsi_hw_raid_check() { is_critical=0 # storcli absolute path # use the old version if installed if [[ -x /opt/storcli-1.15 ]]; then # 1.15 works with 2.6.18 kernel (so I don't have to mess with MegaCLI, uff.) r_cli=/opt/storcli-1.15 else r_cli=/opt/storcli64 fi if [[ ! -x ${r_cli} ]]; then echo "${r_cli} not found, exiting" ; exit 2 ; fi # use SHM in case the OS became RO, capture $r_cli for MegaCLI if needed ${r_cli} show >/dev/shm/.hwr_ctl.tmp; cli_ret=$? # amount of RAID controllers ctl_ids=$(cat /dev/shm/.hwr_ctl.tmp|awk '/^-------+$/ && a++ {next}; a == 2') echo -n "[STORAGE][LSI]::" # Check every available controller while read -r current_ctl; do current_ctl_id=$(echo ${current_ctl}|awk '{print $1}') echo -n "CTL: ${current_ctl_id}: " # Check and report health (Hlth) of current controller # anything else than 'Opt' state is considered critical current_ctl_hlth=$(echo ${current_ctl}|grep -oE '[^ ]+$') if [[ "${current_ctl_hlth}" = "Opt" ]] || [[ "${current_ctl_hlth}" = "NdAtn" ]]; then echo -n "Health: OK (${current_ctl_hlth});;" else echo -n "Health: CRITICAL (${current_ctl_hlth});;" let is_critical+=1 fi # Check VDs health # anything else than 'Optl' state is considered critical ${r_cli} /c${current_ctl_id}/vall show|awk '/^-------+$/ && a++ {next}; a == 2' >/dev/shm/.vd_list.tmp vd_ids=$(cat /dev/shm/.vd_list.tmp) # check whether VDs are configured vd_present=$(cat /dev/shm/.vd_list.tmp|wc -l) if [[ ${vd_present} -gt 0 ]]; then while read -r current_vd; do # get VD details vd_type=$(echo ${current_vd}|awk '{print $2}') ; vd_size=$(echo ${current_vd}|awk '{print $9,$10}') vd_state=$(echo ${current_vd}|awk '{print $3}') ; vd_dg_vd=$(echo ${current_vd}|awk '{print $1}') echo -n "VD: ${vd_dg_vd}: " if [[ "${vd_state}" = "Optl" ]]; then echo -n "Health: OK (DG/VD: ${vd_dg_vd}, type/size/state: ${vd_type}/${vd_size}/${vd_state});;" else echo -n "Health: CRITICAL (DG/VD: ${vd_dg_vd}, type/size/state: ${vd_type}/${vd_size}/${vd_state});;" let is_critical+=1 fi done <<< "${vd_ids}" fi # check for drives with and without enclosures enc_drives=$(${r_cli} /c${current_ctl_id}/eALL/sALL show|grep "^Description"|awk -F"= " '{print $NF}') non_enc_drives=$(${r_cli} /c${current_ctl_id}/sALL show|grep "^Description"|awk -F"= " '{print $NF}') # Check all drives health (Media/BBM counters) in current controller ($current_ctl_id) # values higher than 0 are considered critical function lsi_hw_raid_check_drives() { while read -r current_drive && [[ ! -z ${current_drive} ]]; do err_media=$(${r_cli} ${current_drive} show all|grep "Media Error Count"|grep -oE '[^ ]+$') # err_other=$(${r_cli} ${current_drive} show all|grep "Other Error Count"|grep -oE '[^ ]+$') err_bbm=$(${r_cli} ${current_drive} show all|grep "BBM Error Count"|grep -oE '[^ ]+$') if [[ -z ${err_bbm} ]]; then err_bbm=NA; fi if [[ ! ${err_media} -gt ${max_media_err_cnt} ]] && [[ ${err_bbm} -eq 0 ]]; then echo -n "drv: ${current_drive} - OK (ERR count(Media/BBM): ${err_media}/${err_bbm});" else echo -n "drv: ${current_drive} - CRITICAL (ERR count(Media/BBM): ${err_media}/${err_bbm});" let is_critical+=1 fi done <<< "${drive_ids}" } # Get all drive IDs for further health check if [[ "${enc_drives}" != "No drive found!" ]]; then drive_ids=$(${r_cli} /c${current_ctl_id}/eALL/sALL show all|grep "^Drive .* State :$"|awk '{print $2}') lsi_hw_raid_check_drives fi if [[ "${non_enc_drives}" != "No drive found!" ]]; then drive_ids=$(${r_cli} /c${current_ctl_id}/sALL show all|grep "^Drive .* State :$"|awk '{print $2}') lsi_hw_raid_check_drives fi done <<< "${ctl_ids}" rm -f /dev/shm/.hwr_ctl.tmp /dev/shm/.vd_list.tmp } # Areca RAID Controllers # note: this function will NOT work with latest 'areca-cli', v1.14.2 is required function areca_hw_raid_check() { is_critical=0 # Areca cli absolute path r_cli=/opt/areca_cli if [[ ! -x ${r_cli} ]]; then echo "${r_cli} not found, exiting" ; exit 2 ; fi # amount of RAID controllers ctl_ids=$(${r_cli} hw info|grep "^\[Enclosure#"|awk '{print $1}'|cut -d'#' -f2) echo -n "[STORAGE][Areca]::" # Check every available controller while read -r current_ctl; do current_ctl_id=${current_ctl} echo -n "CTL: ${current_ctl_id}: " # Areca directory for cli config r_cli_cfg_path=$(dirname ${r_cli}) # Areca controller is protected by default password so it needs to be set # per controller otherwise some checks will be denied ${r_cli} 1>/dev/null 2>/dev/null set curctrl=${current_ctl_id} password=0000 # Checking controller status via return value current_ctl_hlth=$(${r_cli} 1>/dev/null 2>/dev/null hw info;echo $?) if [[ ${current_ctl_hlth} -eq 0 ]]; then echo -n "Health: OK (${current_ctl_hlth});;" else echo -n "Health: CRITICAL (${current_ctl_hlth});;" let is_critical+=1 fi ## Check RAID set(s) health via RAID set functions (rsf) ${r_cli} rsf info|awk '/^=======+$+/ && a++ {next}; a == 1;'|grep "===" -v >/dev/shm/.rsf_list.tmp rsf_ids=$(cat /dev/shm/.rsf_list.tmp) while read -r current_rs; do # get RS details rsf_name=$(echo ${current_rs}|awk '{print $2}') ; rsf_disks_num=$(echo ${current_rs}|awk '{print $3}') rsf_size=$(echo ${current_rs}|awk '{print $4}') ; rsf_state=$(echo ${current_rs}|awk '{print $7}') echo -n "RS: ${rsf_name}: " if [[ "${rsf_state}" = "Normal" ]]; then echo -n "Health: OK (State: ${rsf_state}, size/disks: ${rsf_size}/${rsf_disks_num});;" else echo -n "Health: CRITICAL (State: ${rsf_state}, size/disks: ${rsf_size}/${rsf_disks_num});;" let is_critical+=1 fi done <<< "${rsf_ids}" ## Check RAID Volume set(s) health via RAID volume functions (vsf) ${r_cli} vsf info|awk '/^=======+$+/ && a++ {next}; a == 1;'|grep "===" -v >/dev/shm/.vsf_list.tmp vsf_ids=$(cat /dev/shm/.vsf_list.tmp) while read -r current_vs; do current_vd_id=$(echo ${current_vs}|awk '{print $1}') ${r_cli} vsf info vol=${current_vd_id} | \ awk '/^=======+$+/ && a++ {next}; a == 1;'|grep "===" -v >/dev/shm/.curr_vd_hlth.tmp # get VS details vsf_name=$(grep "^Volume Set Name" /dev/shm/.curr_vd_hlth.tmp|awk -F": " '{print $2}'|sed 's/ //g') vsf_level=$(grep "^Raid Level" /dev/shm/.curr_vd_hlth.tmp|awk -F": " '{print $2}') vsf_size=$(grep "^Volume Capacity" /dev/shm/.curr_vd_hlth.tmp|awk -F": " '{print $2}') vsf_state=$(grep "^Volume State" /dev/shm/.curr_vd_hlth.tmp|awk -F": " '{print $2}') echo -n "VS: ${vsf_name}: " if [[ "${vsf_state}" = "Normal" ]]; then echo -n "Health: OK (State: ${vsf_state}, level/size ${vsf_level}/${vsf_size});;" else echo -n "Health: CRITICAL (State: ${vsf_state}, level/size ${vsf_level}/${vsf_size});;" let is_critical+=1 fi done <<< "${vsf_ids}" # Get all drive IDs for further health check ${r_cli} disk info|awk '/^=======+$+/ && a++ {next}; a == 1;'|grep -E '(===| N.A.)' -v >/dev/shm/.disk_list.tmp disk_ids=$(cat /dev/shm/.disk_list.tmp) while read -r current_disk; do current_disk_id=$(echo ${current_disk}|awk '{print $1}') ${r_cli} disk info drv=${current_disk_id} | \ awk '/^=======+$+/ && a++ {next}; a == 1;'|grep "===" -v >/dev/shm/.curr_disk_hlth.tmp # Get current drive details cur_drive_loc=$(grep "^Device Location" /dev/shm/.curr_disk_hlth.tmp|awk -F": " '{print $2}') cur_drive_media=$(grep "^Media Error Count" /dev/shm/.curr_disk_hlth.tmp|awk -F": " '{print $2}') cur_drive_state=$(grep "^Device State" /dev/shm/.curr_disk_hlth.tmp|awk -F": " '{print $2}') if [[ ! ${cur_drive_media} -gt ${max_media_err_cnt} ]] && [[ "${cur_drive_state}" = "NORMAL" ]]; then echo -n "drv: ${cur_drive_loc} - OK (ERR count/State: ${cur_drive_media}/${cur_drive_state});" else echo -n "drv: ${cur_drive_loc} - CRITICAL (ERR count/State: ${cur_drive_media}/${cur_drive_state});" let is_critical+=1 fi done <<< "${disk_ids}" done <<< "${ctl_ids}" rm -f /dev/shm/.rsf_list.tmp /dev/shm/.vsf_list.tmp /dev/shm/.curr_vd_hlth.tmp /dev/shm/.disk_list.tmp /dev/shm/.curr_disk_hlth.tmp } # 3Ware RAID Controllers function 3ware_hw_raid_check() { is_critical=0 # 3Ware cli absolute path r_cli=/opt/tw_cli.x86_64 if [[ ! -x ${r_cli} ]]; then echo "${r_cli} not found, exiting" ; exit 2 ; fi # amount of RAID controllers ctl_ids=$(${r_cli} show|awk '/^----+$+/ && a++ {next}; a == 1'|grep "^----" -v|sed '/^$/d') echo -n "[STORAGE][3Ware]::" # check every available controller while read -r current_ctl; do current_ctl_id=$(echo ${current_ctl}|awk '{print $1}') echo -n "CTL: ${current_ctl_id}: " # CTL status is critical if NotOpt > 1 current_ctl_hlth=$(echo ${current_ctl}|awk '{print $6}') if [[ "${current_ctl_hlth}" -eq 0 ]]; then echo -n "Health: OK (NotOpt:${current_ctl_hlth});;" else echo -n "Health: CRITICAL (NotOpt:${current_ctl_hlth});;" let is_critical+=1 fi # check unit status, critical if not 'OK', pos 3 ${r_cli} /${current_ctl_id} show unitstatus|awk '/^----+$+/ && a++ {next}; a == 1'|grep "^----" -v|sed '/^$/d' >/dev/shm/.3w_unit_list.tmp unit_list=$(cat /dev/shm/.3w_unit_list.tmp) while read -r current_unit; do # get unit details unit_name=$(echo ${current_unit}|awk '{print $1}') ; unit_type=$(echo ${current_unit}|awk '{print $2}') unit_size=$(echo ${current_unit}|awk '{print $7}') ; unit_state=$(echo ${current_unit}|awk '{print $3}') echo -n "Unit: ${unit_name}: " if [[ "${unit_state}" = "OK" ]] || [[ "${unit_state}" = "VERIFYING" ]]; then echo -n "Health: OK (Status: ${unit_state}, type/size: ${unit_type}/${unit_size}GB);;" else echo -n "Health: CRITICAL (Status: ${unit_state}, type/size: ${unit_type}/${unit_size}GB);;" let is_critical+=1 fi done <<< "${unit_list}" # check all drives status, critical if not 'OK', pos 2 ${r_cli} /${current_ctl_id} show drivestatus|grep NOT-PRESENT -v|awk '/^----+$+/ && a++ {next}; a == 1'|grep "^----" -v | \ sed '/^$/d'|sed 's/ GB /GB /g' >/dev/shm/.3w_drive_list.tmp drive_list=$(cat /dev/shm/.3w_drive_list.tmp) while read -r current_drive; do # get drive details, check status & reallocated sectors, critical if non 'OK' or realloc counter > 0 drive_vport=$(echo ${current_drive}|awk '{print $1}') ; drive_type=$(echo ${current_drive}|awk '{print $5}') drive_size=$(echo ${current_drive}|awk '{print $4}') ; drive_state=$(echo ${current_drive}|awk '{print $2}') rasect_cnt=$(${r_cli} /${current_ctl_id}/${drive_vport} show rasect|grep =|awk '{print $NF}') echo -n "Drive: ${drive_vport}: " if [[ "${drive_state}" = "OK" ]] || [[ "${drive_state}" = "VERIFYING" ]] && [[ ! ${rasect_cnt} -gt ${max_realloc_cnt} ]]; then echo -n "Health: OK (Status/ReallocSect: ${drive_state}/${rasect_cnt}, VPort/Size/Type: ${drive_vport}/${drive_size}GB/${drive_type});;" else echo -n "Health: CRITICAL (Status/ReallocSect: ${drive_state}/${rasect_cnt}, VPort/Size/Type: ${drive_vport}/${drive_size}GB/${drive_type});;" let is_critical+=1 fi done <<< "${drive_list}" done <<< "${ctl_ids}" rm -f /dev/shm/.3w_unit_list.tmp # /dev/shm/.3w_drive_list.tmp } # Adaptec RAID Controllers # note: arcconf is dynamically linked and require libstdc++.so.5 which can be installed # via 'yum install compat-libstdc++-33-3.2.3-72.el7.x86_64', seems it's basent by default function adaptec_hw_raid_check() { is_critical=0 # arcconf absolute path r_cli=/opt/arcconf-7.31.x64 if [[ ! -x ${r_cli} ]]; then echo "${r_cli} not found, exiting" ; exit 2 ; fi # need to scan for controller IDs manually as there's no LIST command in arcconf 7.31 (or im blind) for i in 0 1 2 3 4 5 6 7 8 9 10; do is_ctl=$(${r_cli} 1>/dev/null 2>/dev/null getstatus $i;echo $?) if [[ ${is_ctl} -eq 0 ]]; then ctl_ids+=("${i}") fi done echo -n "[STORAGE][Adaptec]::" # check every available controller for current_ctl in $(echo ${ctl_ids[@]}); do # CTL status is critical if other than 'Optimal' current_ctl_hlth=$(${r_cli} getconfig ${current_ctl} ad|grep "Controller Status"|awk '{print $4}') if [[ "${current_ctl_hlth}" = "Optimal" ]]; then echo -n "Health: OK (${current_ctl_hlth});;" else echo -n "Health: CRITICAL (${current_ctl_hlth});;" let is_critical+=1 fi # check health of logical devices (LD) for current_vol in $(${r_cli} getconfig ${current_ctl} ld|grep "^Logical device number"|awk '{print $NF}'); do # get LD details ${r_cli} getconfig ${current_ctl} ld ${current_vol} >/dev/shm/.adp_ldvol.tmp ld_level=$(grep "RAID level" /dev/shm/.adp_ldvol.tmp|awk '{print $NF}') ld_size=$(grep "Size" /dev/shm/.adp_ldvol.tmp|awk '{print $3$4}') ld_state=$(grep "Status of logical device" /dev/shm/.adp_ldvol.tmp|awk '{print $NF}') echo -n "LD: ${current_vol}: " if [[ "${ld_state}" = "Optimal" ]]; then echo -n "Health: OK (${ld_state}, level/size: ${ld_level}/${ld_size});;" else echo -n "Health: CRITICAL (${ld_state}, level/size: ${ld_level}/${ld_size});;" let is_critical+=1 fi # check LD segments state ld_segments=$(grep "Segment" /dev/shm/.adp_ldvol.tmp|awk '{print $2,$4,$5}') while read -r current_ld_segment; do seg_id=$(echo ${current_ld_segment}|awk '{print $1}') ; seg_state=$(echo ${current_ld_segment}|awk '{print $2}') seg_path=$(echo ${current_ld_segment}|awk '{print $3}') if [[ -z "${seg_path}" ]]; then seg_path=NA; fi if [[ "${seg_state}" = "Present" ]]; then echo -n "Segment ${current_ld_segment}: OK (${seg_state}, path: ${seg_path})" else echo -n "Segment ${current_ld_segment}: CRITICAL (${seg_state}, path: ${seg_path})" let is_critical+=1 fi done <<< "${ld_segments}" done # Adaptec, as absolutely worst RAID CLI on earth is forcing its users to parse that ugly SMART dump. # selected raw SMART values to check: # 0x05 - Reallocated Sectors Count # 0x0A - Spin Retry Count # 0xC5 - Current Pending Sector Count # 0xC6 - (Offline) Uncorrectable Sector Count[ # get SMART dump and wipe trailing spaces for further parsing ${r_cli} getsmartstats ${current_ctl} tabular |awk '{$1=$1;print}' >/dev/shm/.adp_rawsmart.tmp # get drive ID and selected raw values # format: 'ID attribute1,value attribute2,value attribute3,value attribute4,value' echo $(${bin_p2g} -M --om-separator=, -o3 -o4 -o6 \ "(PhysicalDriveSmartStats)(\n.*\n.*.)(.. [0-9]*$)|(0x05|0x0A|0xC5|0xC6)(\n.*\n.*\n\s*rawValue.*. )([0-9]*$)" \ /dev/shm/.adp_rawsmart.tmp) | sed 's/ .. /\n/g'|sed 's/^.. //g' >/dev/shm/.adp_sel_raw.tmp drive_list=$(cat /dev/shm/.adp_sel_raw.tmp) while read -r current_drive; do # get raw values per drive drive_id=$(echo ${current_drive}|awk '{print $1}') x05=$(echo ${current_drive}|awk '{print $2}'|cut -d',' -f2) ; x0a=$(echo ${current_drive}|awk '{print $3}'|cut -d',' -f2) xc5=$(echo ${current_drive}|awk '{print $4}'|cut -d',' -f2) ; xc6=$(echo ${current_drive}|awk '{print $5}'|cut -d',' -f2) echo -n "Drive: ${drive_id}:" if [[ ! ${x05} -gt ${max_realloc_cnt} ]] && [[ ${x0a} -eq 0 ]]; then if [[ ${xc5} -eq 0 ]] && [[ ${xc6} -eq 0 ]]; then echo -n "Health: OK (0x05/0x0A/0xC5/0xC6: ${x05}/${x0a}/${xc5}/${xc6});" else echo -n "Health: CRITICAL (0x05/0x0A/0xC5/0xC6: ${x05}/${x0a}/${xc5}/${xc6});" let is_critical+=1 fi else echo -n "Health: CRITICAL (0x05/0x0A/0xC5/0xC6: ${x05}/${x0a}/${xc5}/${xc6});" let is_critical+=1 fi done <<< "${drive_list}" done rm -f /dev/shm/.adp_ldvol.tmp /dev/shm/.adp_rawsmart.tmp /dev/shm/.adp_sel_raw.tmp } # Software RAID checks # Checks include data from 'mdadm -D' per array and following SMART values per drive: # 0x05 - Reallocated Sectors Count function software_raid_check() { is_critical=0 r_cli=${bin_mdadm} if [[ ! -x ${r_cli} ]]; then echo "${r_cli} not found, exiting" ; exit 2 ; fi # get all arrays names sw_arrays=$(grep -o "^md[_,a-z,0-9]*" /proc/mdstat) echo -n "[STORAGE][SWR]::" while read -r current_array; do ${r_cli} -D /dev/${current_array} >/dev/shm/.sw_carray.tmp echo -n "Array:${current_array}:" # get array details ar_state=$(grep -oP "(?<=State : )[a-zA-Z]*" /dev/shm/.sw_carray.tmp) ar_active_dev=$(grep "Active Devices :" /dev/shm/.sw_carray.tmp|awk '{print $NF}') ar_failed_dev=$(grep "Failed Devices :" /dev/shm/.sw_carray.tmp|awk '{print $NF}') ar_level=$(grep "Raid Level :" /dev/shm/.sw_carray.tmp|awk '{print $NF}') ar_removed=$(grep "removed" /dev/shm/.sw_carray.tmp|awk '{print $NF}'|wc -l) if [[ "${ar_state}" = "active" ]] || [[ "${ar_state}" = "clean" ]] || [[ "${ar_state}" = "inactive" ]] || [[ "${ar_level}" = "container" ]] && [[ ${ar_removed} -eq 0 ]] && [[ ${ar_failed_dev} -eq 0 ]]; then if [[ ${ar_level} = "container" ]]; then ar_state=container ; ar_failed_dev=NA fi echo -n "Health: OK (state/failed_dev/removed_dev: ${ar_state}/${ar_failed_dev}/${ar_removed});" else echo -n "Health: CRITICAL (state/failed_dev/removed_dev: ${ar_state}/${ar_failed_dev}/${ar_removed});" let is_critical+=1 fi done <<< "${sw_arrays}" } function check_zfs_pools() { if [[ -x ${bin_zpool} ]]; then zfs_pools=$(${bin_zpool} list -H -o name,size,health) if [[ ! -z ${zfs_pools} ]]; then echo -n "[ZFS]::" while read -r current_pool; do # get pool details pool_name=$(echo ${current_pool}|awk '{print $1}') ; pool_size=$(echo ${current_pool}|awk '{print $2}') pool_health=$(echo ${current_pool}|awk '{print $3}') if [[ "${pool_health}" = "ONLINE" ]]; then echo -n "Health: OK (name/size/health: ${pool_name}/${pool_size}/${pool_health});" else echo -n "Health: CRITICAL (name/size/health: ${pool_name}/${pool_size}/${pool_health});" let is_critical+=1 fi done <<< "${zfs_pools}" else true fi else true fi } function check_nvme() { echo -n "[NVME]::" # exclude fd, sr and ram devices to avoid timeout waits nvme_devices=$(${bin_lsblk} -e 1,2,11 -nd --output NAME|grep ^nvme|sort -n) while read -r current_nvme; do ${bin_nvme} smart-log /dev/${current_nvme} >/dev/shm/.nvme_cdrv # https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf 5.14.1.2 # cw - Critical Warning (>1) # pu - Percentage USed (>95) # me - Media Errors (>49) s_cw=$(grep "^critical_warning\s" /dev/shm/.nvme_cdrv|awk '{print $NF}') s_pu=$(grep "^percentage_used\s" /dev/shm/.nvme_cdrv|awk '{print $NF}'|cut -d'%' -f1) s_me=$(grep "^media_errors\s" /dev/shm/.nvme_cdrv|awk '{print $NF}') echo -n "drv:/dev/${current_nvme}:" if [[ ${s_cw} -lt 2 ]] && [[ ${s_pu} -lt 95 ]] && [[ ${s_me} -lt 50 ]]; then echo -n "Health OK (cw/pu/me: ${s_cw}/${s_pu}/${s_me});" else echo -n "Health: CRITICAL ((cw/pu/me: ${s_cw}/${s_pu}/${s_me});" let is_critical+=1 fi done <<< "${nvme_devices}" } function check_drives() { # smartctl shipped with CentOS 5.7 (ie vpsnode1) doesn't have --scan option valid_sctl=$(${bin_smartctl} -h|grep scan$) if [[ -z ${valid_sctl} ]]; then echo -n "* * * drive checks skipped, no --scan support in smartctl, please update smartmontools * * *" let is_critical+=1 nagios_states fi # default IDs ${bin_smartctl} --scan|awk '{print $1}' >/dev/shm/.sw_drv_list.tmp drive_ids=$(cat /dev/shm/.sw_drv_list.tmp) if [[ "${sctl}" = "lsi" ]]; then ${bin_smartctl} --scan >/dev/shm/.smart_lsi.tmp is_megaraid=$(grep -ql "/dev/bus" /dev/shm/.smart_lsi.tmp;echo $?) fi # 3ware require '-d 3ware,N' for smartctl if [[ "${sctl}" = "3ware" ]]; then drive_ids=$(cat /dev/shm/.3w_drive_list.tmp|awk '{print $1}'|sed 's/^p//g') fi # LSI require '-d megaraid,N for smartctl on some hosts if [[ ${sctl} = "lsi" ]]; then if [[ ${is_megaraid} -eq 0 ]]; then drive_ids=$(grep "/dev/bus" /dev/shm/.smart_lsi.tmp|awk -F" #" '{print $1}') else drive_ids=$(cat /dev/shm/.smart_lsi.tmp|awk '{print $1}') fi fi # check SMART 0x05 value function check_sattr() { if [[ -z ${err_realloc} ]]; then # query the device just once ${bin_smartctl} ${1} >/dev/shm/.drv_vendor s_vendor=$(grep -P "^Vendor:\s" /dev/shm/.drv_vendor|awk '{print $NF}') if [[ ! ${s_vendor} = "SEAGATE" ]]; then s_vendor=$(grep -P "^Device Model:" /dev/shm/.drv_vendor|awk '{print $3}') fi # identify Seagate if [[ ${s_vendor} = "SEAGATE" ]]; then # notify about alternative verification if it's other than Realloc counter test verification_type=SGT # switch SMART query to Health new_query=$(echo ${1}|sed 's/-i/-H/g') # get Seagate health status via 'SMART Health Status' since it lacks the usual values sgt_health=$(${bin_smartctl} ${new_query}|grep "Health Status: "|awk -F": " '{print $NF}') err_realloc="SEAGATE:${sgt_health}" return 0 fi # identify PNY if [[ ${s_vendor} = "PNY" ]]; then verification_type=PNY # switch SMART query to ALL new_query=$(echo ${1}|sed 's/-i/-a/g') # get PNY health status via SMART 231 attribute, the 'common' name for 231 is incorrect # (https://www.smartmontools.org/ticket/1281) but the attribute itself is returning correct Life Left # value, all PNY devices will be checked that way but the bug was found on PNY C900 model pny_health=$(${bin_smartctl} ${new_query}|grep -P "^231\s"|awk '{print $NF}') # Life Left, % err_realloc="${pny_health}" return 0 fi err_realloc="S_NOATTR"; fi } # smart 0x05 value is named differently depending on the drive model, define the names here to avoid # long grep lines within 'current_drive' s5_1=Reallocated_Sector_Ct s5_2=Retired_Block_Count # SSD s5_3=Reallocate_NAND_Blk_Cnt echo -n "[STORAGE]" while read -r current_drive; do if [[ ${sctl} = "3ware" ]]; then # check whether SMART is enabled ${bin_smartctl} -d 3ware,${current_drive} -a /dev/tw?0 >/dev/shm/.sw_smctl_dump.tmp is_smart=$(grep "SMART support is:\s*Disabled$" /dev/shm/.sw_smctl_dump.tmp|awk '{print $NF}') if [[ -z "${is_smart}" ]]; then err_realloc=$(${bin_smartctl} -d 3ware,${current_drive} -a /dev/tw?0|egrep "(${s5_1}|${s5_2}|${s5_3})"|awk '{print $NF}') check_sattr "-d 3ware,${current_drive} -i /dev/tw?0" else err_realloc="SMART_disabled" fi elif [[ ${sctl} = "lsi" ]]; then # check whether SMART is enabled ${bin_smartctl} -a ${current_drive} >/dev/shm/.sw_smctl_dump.tmp is_smart=$(grep "SMART support is:\s*Disabled$" /dev/shm/.sw_smctl_dump.tmp|awk '{print $NF}') if [[ -z "${is_smart}" ]]; then err_realloc=$(${bin_smartctl} -a ${current_drive} |egrep "(${s5_1}|${s5_2}|${s5_3})"|awk '{print $NF}') check_sattr "-i ${current_drive}" else err_realloc="SMART_disabled" fi else # check whether SMART is enabled ${bin_smartctl} -a ${current_drive} >/dev/shm/.sw_smctl_dump.tmp is_smart=$(grep "SMART support is:\s*Disabled$" /dev/shm/.sw_smctl_dump.tmp|awk '{print $NF}') if [[ -z "${is_smart}" ]]; then if [[ ${sctl} = "Areca" ]]; then err_realloc=$(echo $(smartctl -a ${current_drive} |egrep '(^read:|^write:)'|awk '{print $NF}')|sed 's/ /+/g'|bc) else err_realloc=$(${bin_smartctl} -a ${current_drive} |egrep "(${s5_1}|${s5_2}|${s5_3})"|awk '{print $NF}') check_sattr "-i ${current_drive}" fi else err_realloc="SMART_disabled" fi fi if [[ ${sctl} = "lsi" ]]; then current_drive=$(echo ${current_drive}|cut -d',' -f2); fi echo -n "drv:${current_drive}:" # Seagate health if [[ ${verification_type} = "SGT" ]]; then unset verification_type if [[ "${err_realloc}" = "SEAGATE:OK" ]] && [[ -z "${is_smart}" ]]; then echo -n "Health: OK (SMART Health status: ${err_realloc});" else echo -n "Health: CRITICAL (SMART Health status: ${err_realloc});" let is_critical=1 fi # PNY health elif [[ ${verification_type} = "PNY" ]]; then unset verification_type if [[ ${err_realloc} -gt 0 ]] && [[ -z "${is_smart}" ]]; then echo -n "Health: OK (231 (Life Left %): ${err_realloc});" else echo -n "Health: CRITICAL (231 (Life Left %): ${err_realloc});" let is_critical=1 fi else # Generic verification if [[ ! ${err_realloc} -gt ${max_realloc_cnt} ]] && [[ -z "${is_smart}" ]]; then echo -n "Health: OK (realloc: ${err_realloc});" else echo -n "Health: CRITICAL (realloc: ${err_realloc});" let is_critical+=1 fi fi done <<< "${drive_ids}" if [[ ${is_nvme} -eq 1 ]]; then if [[ ! -x ${bin_nvme} ]]; then echo "fatal: found NVMe storage but nvme tool not found on \$PATH, please install it, exiting" ; exit 2 fi check_nvme fi } function check_sw_raid_if_found() { sw_arrays=$(grep -o "^md[_,a-z,0-9]*" /proc/mdstat|head -1) if [[ ! -z ${sw_arrays} ]]; then software_raid_check fi } function nagios_states() { # nagios states if [[ -z ${hw_errcnt} ]]; then hw_errcnt=0; fi if [[ -z ${sw_errcnt} ]]; then sw_errcnt=0; fi eall_cnt=$((hw_errcnt+sw_errcnt)) if [[ ${is_critical} -gt 0 ]] || [[ ${hw_errcnt} -gt 0 ]]; then if [[ ${arg1} = "show_error_counters" ]]; then echo -e "\nnagios_status: ${STATE_CRITICAL}, hw_error_count: ${hw_errcnt}, sw_error_count: ${sw_errcnt}, all_errors: ${eall_cnt}"; fi exit ${STATE_CRITICAL} else if [[ ${arg1} = "show_error_counters" ]]; then echo -e "\nnagios_status: ${STATE_OK}, hw_error_count: ${hw_errcnt}, sw_error_count: ${sw_errcnt}, all_errors: ${eall_cnt}"; fi exit ${STATE_OK} fi } ## Main ## # sometimes non-root user's PATH is restricted to non-sbin dirs # and the root's $PATH isn't exported when invoked via sudo export PATH=$PATH:/sbin:/usr/sbin:/usr/local/sbin bin_lspci=$(which 2>/dev/null lspci) bin_lsblk=$(which 2>/dev/null lsblk) bin_zpool=$(which 2>/dev/null zpool) bin_nvme=$(which 2>/dev/null nvme) max_realloc_cnt=49 max_media_err_cnt=49 if [[ ! -x ${bin_lspci} ]]; then echo "fatal: unable to find lspci tool on \$PATH, please install it, exiting" ; exit 2 fi if [[ ! -x ${bin_lsblk} ]]; then #echo "fatal: unable to find lsblk tool on \$PATH, please install it, exiting" ; exit 2 # there's number of problems with lsblk on centos 4/5, attempt to check manually if missing hw_nvme=$(for n in $(find /sys/block -maxdepth 1|grep /nvme); do basename ${n};done |grep -Pql "^nvme";echo $?) fi sw_arrays=$(grep -o "^md[_,a-z,0-9]*" /proc/mdstat) bin_smartctl=$(which 2>/dev/null smartctl) if [[ -z ${bin_smartctl} ]]; then echo "smartctl missing, exiting" ; exit 1 fi # detect NVMe drives via lsblk if [[ -x ${bin_lsblk} ]]; then hw_nvme=$(${bin_lsblk} -nd --output name|grep -Pql ^nvme;echo $?) fi if [[ ${hw_nvme} -eq 0 ]]; then is_nvme=1 fi if [[ ! -z ${sw_arrays} ]]; then bin_mdadm=$(which 2>/dev/null mdadm) if [[ -z ${bin_mdadm} ]]; then echo "mdadm missing, exiting" ; exit 1 fi fi hw_card_present=$(${bin_lspci} |grep "RAID"|grep Intel -v|wc -l) arg1=$1 if [[ -z ${hw_errcnt} ]]; then hw_errcnt=0; fi if [[ -z ${sw_errcnt} ]]; then sw_errcnt=0; fi if [[ ${hw_card_present} -gt 0 ]]; then hw_card_model=$(${bin_lspci} |grep RAID|grep Intel -v|head -1|awk '{print $5}') if [[ ${hw_card_model} = "LSI" ]] || [[ ${hw_card_model} = "Broadcom" ]]; then lsi_hw_raid_check hw_errcnt=${is_critical} # LSI on some hosts require '-d megaraid,N' for smartctl sctl=lsi check_sw_raid_if_found check_zfs_pools check_drives sw_errcnt=${is_critical} nagios_states elif [[ ${hw_card_model} = "Areca" ]]; then # different SMART output format sctl=Areca areca_hw_raid_check hw_errcnt=${is_critical} check_sw_raid_if_found check_zfs_pools check_drives sw_errcnt=${is_critical} nagios_states elif [[ ${hw_card_model} = "3ware" ]]; then 3ware_hw_raid_check hw_errcnt=${is_critical} # 3ware require '-d 3ware,N' for smartctl sctl=3ware check_sw_raid_if_found check_zfs_pools check_drives sw_errcnt=${is_critical} nagios_states elif [[ ${hw_card_model} = "Adaptec" ]]; then bin_p2g=$(which 2>/dev/null pcre2grep) if [[ ! -x ${bin_p2g} ]]; then echo "fatal: unable to find pcre2grep tool on \$PATH, please install it, exiting" ; exit 2 else adaptec_hw_raid_check hw_errcnt=${is_critical} check_sw_raid_if_found check_zfs_pools check_drives sw_errcnt=${is_critical} nagios_states fi else echo -n "[HWR]:Found unsupported RAID card ;; " check_sw_raid_if_found check_zfs_pools check_drives sw_errcnt=${is_critical} nagios_states fi else # drive checks has been separated from software_raid_check() and check_drives() created instead # for drive checks on non-RAID setups check_sw_raid_if_found if [[ -z ${is_critical} ]]; then is_critical=0 ; fi check_zfs_pools check_drives sw_errcnt=${is_critical} nagios_states fi