#!/bin/bash # Copyright (C) 2019 Checkmk GmbH - License: GNU General Public License v2 # This file is part of Checkmk (https://checkmk.com). It is subject to the terms and # conditions defined in the file COPYING, which is part of this source code package. # # BEGIN COMMON AGENT CODE # usage() { cat </dev/null 2>&1 } get_file_atime() { stat -c %X "${1}" 2>/dev/null || stat -f %a "${1}" 2>/dev/null || perl -e 'if (! -f $ARGV[0]){die "0000000"};$atime=(stat($ARGV[0]))[8];print $atime."\n";' "${1}" } get_file_mtime() { stat -c %Y "${1}" 2>/dev/null || stat -f %m "${1}" 2>/dev/null || perl -e 'if (! -f $ARGV[0]){die "0000000"};$mtime=(stat($ARGV[0]))[9];print $mtime."\n";' "${1}" } is_valid_plugin() { # test if a file is executable and does not have certain # extensions (remnants from distro upgrades). case "${1:?No plugin defined}" in *.dpkg-new | *.dpkg-old | *.dpkg-temp | *.dpkg-tmp) return 1 ;; *) [ -f "${1}" ] && [ -x "${1}" ] ;; esac } set_up_process_commandline_arguments() { while [ -n "${1}" ]; do case "${1}" in -d | --debug) set -xv DISABLE_STDERR=false shift ;; -p | --profile) LOG_SECTION_TIME=true # disable caching to get the whole execution time DISABLE_CACHING=true shift ;; --force-inventory) export MK_FORCE_INVENTORY=true shift ;; -h | --help) usage exit 1 ;; *) shift ;; esac done } set_up_get_epoch() { # On some systems date +%s returns a literal %s if date +%s | grep "^[0-9].*$" >/dev/null 2>&1; then get_epoch() { date +%s; } else # do not check whether perl is even present. # in weird cases we may be fine without get_epoch. get_epoch() { perl -e 'print($^T."\n");'; } fi } set_up_current_shell() { # Note the current shell may not be the same as what is specified in the # shebang, e.g. when reconfigured in the xinetd/systemd/whateverd config file CURRENT_SHELL="$(ps -o args= -p $$ | cut -d' ' -f1)" } # # END COMMON AGENT CODE # set_variable_defaults() { # some 'booleans' [ "${MK_RUN_SYNC_PARTS}" = "false" ] || MK_RUN_SYNC_PARTS=true [ "${MK_RUN_ASYNC_PARTS}" = "false" ] || MK_RUN_ASYNC_PARTS=true # WATCH OUT: These 5 lines are searched for and replaced by the # agent bakery! # TODO: CMK-8339 (proper configuration) : "${MK_LIBDIR:="/usr/lib/check_mk_agent"}" : "${MK_CONFDIR:="/etc/check_mk"}" : "${MK_VARDIR:="/var/lib/check_mk_agent"}" : "${MK_LOGDIR:="/var/log/check_mk_agent"}" : "${MK_BIN:="/usr/bin"}" export MK_LIBDIR export MK_CONFDIR export MK_VARDIR export MK_LOGDIR export MK_BIN # Optionally set a tempdir for all subsequent calls #export TMPDIR= # All executables in PLUGINSDIR will simply be executed and their # ouput appended to the output of the agent. Plugins define their own # sections and must output headers with '<<<' and '>>>' PLUGINSDIR=${MK_LIBDIR}/plugins # All executables in LOCALDIR will by executabled and their # output inserted into the section <<>>. Please # refer to online documentation for details about local checks. LOCALDIR=${MK_LIBDIR}/local # All files in SPOOLDIR will simply appended to the agent # output if they are not outdated (see below) SPOOLDIR=${MK_VARDIR}/spool } set_up_path() { _PATH="${1}" # Make sure that locally installed binaries are found # Only add binaries if they are not already in the path! If you append to path in a loop the process will # eventually each the 128k size limit for the environment and become a zombie process. See execve manpage. [ "${_PATH#*"/usr/local/bin"}" != "${_PATH}" ] || _PATH="${_PATH}:/usr/local/bin" [ -n "${MK_BIN}" ] && { [ "${_PATH#*"${MK_BIN}"}" != "${_PATH}" ] || _PATH="${_PATH}:${MK_BIN}"; } [ -d "/var/qmail/bin" ] && { [ "${_PATH#*"/var/qmail/bin"}" != "${_PATH}" ] || _PATH="${_PATH}:/var/qmail/bin"; } echo "${_PATH}" unset _PATH } set_up_remote() { # Provide information about the remote host. That helps when data # is being sent only once to each remote host. REMOTE="${REMOTE_HOST:-"${REMOTE_ADDR:-"${SSH_CLIENT%% *}"}"}" # If none of the above are set *and* we are configured to, try to read it from stdin [ -z "${REMOTE}" ] && [ "${MK_READ_REMOTE}" = "true" ] && read -r REMOTE export REMOTE } announce_remote() { # let RTCs know about this remote [ -d "${MK_VARDIR}/rtc_remotes" ] || mkdir "${MK_VARDIR}/rtc_remotes" [ -n "${REMOTE}" ] && [ "${REMOTE}" != "push-connection" ] && touch "${MK_VARDIR}/rtc_remotes/${REMOTE}" } # # BEGIN COMMON AGENT CODE # # SC2089: Quotes/backslashes will be treated literally. Use an array. # shellcheck disable=SC2089 MK_DEFINE_LOG_SECTION_TIME='_log_section_time() { "$@"; }' finalize_profiling() { :; } set_up_profiling() { PROFILING_CONFIG="${MK_CONFDIR}/profiling.cfg" if [ -e "${PROFILING_CONFIG}" ]; then # Config vars: # LOG_SECTION_TIME=true/false # DISABLE_CACHING=true/false # If LOG_SECTION_TIME=true via profiling.cfg do NOT disable caching in order # to get the real execution time during operation. # shellcheck disable=SC1090 . "${PROFILING_CONFIG}" fi PROFILING_LOGFILE_DIR="${MK_LOGDIR}/profiling/$(date +%Y%m%d_%H%M%S)" if ${LOG_SECTION_TIME:-false}; then mkdir -p "${PROFILING_LOGFILE_DIR}" agent_start="$(perl -MTime::HiRes=time -le 'print time()')" # SC2016: Expressions don't expand in single quotes, use double quotes for that. # SC2089: Quotes/backslashes will be treated literally. Use an array. # shellcheck disable=SC2016,SC2089 MK_DEFINE_LOG_SECTION_TIME='_log_section_time() { section_func="$@" base_name=$(echo "${section_func}" | sed "s/[^A-Za-z0-9.-]/_/g") profiling_logfile="'"${PROFILING_LOGFILE_DIR}"'/${base_name}.log" start="$(perl -MTime::HiRes=time -le "print time()")" { time ${section_func}; } 2>> "${profiling_logfile}" echo "runtime $(perl -MTime::HiRes=time -le "print time() - ${start}")" >> "${profiling_logfile}" }' finalize_profiling() { pro_log_file="${PROFILING_LOGFILE_DIR}/profiling_check_mk_agent.log" agent_end="$(perl -MTime::HiRes=time -le 'print time()')" echo "runtime $(echo "${agent_end} - ${agent_start}" | bc)" >>"${pro_log_file}" } fi eval "${MK_DEFINE_LOG_SECTION_TIME}" # SC2090: Quotes/backslashes in this variable will not be respected. # shellcheck disable=SC2090 export MK_DEFINE_LOG_SECTION_TIME } unset_locale() { # eliminate localized outputs where possible # The locale logic here is used to make the Python encoding detection work (see CMK-2778). unset -v LANG LC_ALL if inpath locale && inpath paste; then # match C.UTF-8 at the beginning, but not e.g. es_EC.UTF-8! case "$(locale -a | paste -sd ' ' -)" in *' C.UTF-8'* | 'C.UTF-8'*) LC_ALL="C.UTF-8" ;; *' C.utf8'* | 'C.utf8'*) LC_ALL="C.utf8" ;; esac fi LC_ALL="${LC_ALL:-C}" export LC_ALL } # # END COMMON AGENT CODE # read_python_version() { if inpath "${1}"; then version=$(${1} -c 'import sys; print("%s.%s"%(sys.version_info[0], sys.version_info[1]))') major=${version%%.*} minor=${version##*.} if [ "${major}" -eq "${2}" ] && [ "${minor}" -ge "${3}" ]; then echo "${1}" return 0 fi fi return 1 } detect_python() { PYTHON3=$(read_python_version python3 3 4 || read_python_version python 3 4) PYTHON2=$(read_python_version python2 2 6 || read_python_version python 2 6) if [ -f "${MK_CONFDIR}/python_path.cfg" ]; then # shellcheck source=/dev/null . "${MK_CONFDIR}/python_path.cfg" fi export PYTHON2 PYTHON3 if [ -z "${PYTHON2}" ] && [ -z "${PYTHON3}" ]; then NO_PYTHON=true elif [ -n "${PYTHON3}" ] && [ "$( ${PYTHON3} -c 'pass' >/dev/null 2>&1 echo $? )" -eq 127 ]; then WRONG_PYTHON_COMMAND=true elif [ -z "${PYTHON3}" ] && [ "$( ${PYTHON2} -c 'pass' >/dev/null 2>&1 echo $? )" -eq 127 ]; then WRONG_PYTHON_COMMAND=true fi } detect_container_environment() { if [ -f /.dockerenv ]; then IS_DOCKERIZED=1 elif grep container=lxc /proc/1/environ >/dev/null 2>&1; then # Works in lxc environment e.g. on Ubuntu bionic, but does not # seem to work in proxmox (see CMK-1561) IS_LXC_CONTAINER=1 elif grep 'lxcfs /proc/cpuinfo fuse.lxcfs' /proc/mounts >/dev/null 2>&1; then # Seems to work in proxmox IS_LXC_CONTAINER=1 else unset IS_DOCKERIZED unset IS_LXC_CONTAINER fi if [ -n "${IS_DOCKERIZED}" ] || [ -n "${IS_LXC_CONTAINER}" ]; then if [ "$(stat -fc'%t' /sys/fs/cgroup)" = "63677270" ]; then IS_CGROUP_V2=1 CGROUP_SECTION_SUFFIX="_cgroupv2" else unset IS_CGROUP_V2 unset CGROUP_SECTION_SUFFIX fi fi } # Prefer (relatively) new /usr/bin/timeout from coreutils against # our shipped waitmax. waitmax is statically linked and crashes on # some Ubuntu versions recently. if inpath timeout; then waitmax() { timeout "$@" } fi encryption_panic() { echo "<<>>" echo "EncryptionPanic: true" exit 1 } set_up_encryption() { # shellcheck source=agents/cfg_examples/encryption.cfg [ -f "${MK_CONFDIR}/encryption.cfg" ] && { . "${MK_CONFDIR}/encryption.cfg" || encryption_panic } define_optionally_encrypt "${ENCRYPTED:-"no"}" } define_optionally_encrypt() { # if things fail, make sure we don't accidentally send unencrypted data unset optionally_encrypt if [ "${1}" != "no" ]; then OPENSSL_VERSION=$(openssl version | awk '{print $2}' | awk -F . '{print (($1 * 100) + $2) * 100+ $3}') # Depending on the Checkmk version, a key of proper length (256 bits) is provided. # However, always use key derivation here (suitable for passwords of all lengths). if [ "${OPENSSL_VERSION}" -ge 10101 ]; then optionally_encrypt() { printf "03%s" "${2}" openssl enc -aes-256-cbc -md sha256 -iter 10000 -k "${1}" } elif [ "${OPENSSL_VERSION}" -ge 10000 ]; then optionally_encrypt() { printf "02%s" "${2}" openssl enc -aes-256-cbc -md sha256 -k "${1}" -nosalt } else optionally_encrypt() { printf "00%s" "${2}" openssl enc -aes-256-cbc -md md5 -k "${1}" -nosalt } fi else optionally_encrypt() { [ -n "${2}" ] && printf "99%s" "${2}" cat } fi } set_up_disabled_sections() { if [ -f "${MK_CONFDIR}/exclude_sections.cfg" ]; then # shellcheck source=agents/cfg_examples/exclude_sections.cfg . "${MK_CONFDIR}/exclude_sections.cfg" fi } export_utility_functions() { # At the time of writing of this function, the linux agent exports # some helper functions, so I consolidate those exports here. # I am not sure whether this is a good idea, though. # Their API is unstable. export -f run_mrpe export -f waitmax export -f run_cached } section_checkmk() { cat <>> Version: 2.2.0p23 AgentOS: linux Hostname: $(uname -n) AgentDirectory: ${MK_CONFDIR} DataDirectory: ${MK_VARDIR} SpoolDirectory: ${SPOOLDIR} PluginsDirectory: ${PLUGINSDIR} LocalDirectory: ${LOCALDIR} HERE # try to find only_from configuration if [ -n "${REMOTE_HOST}" ]; then # xinetd sed -n "/^service[[:space:]]*check-mk-agent/,/}/s/^[[:space:]]*only_from[[:space:]]*=[[:space:]]*\(.*\)/OnlyFrom: \1/p" /etc/xinetd.d/* | head -n1 elif inpath systemctl; then # systemd sed -n '/^IPAddressAllow/s/IPAddressAllow=/OnlyFrom: /p' "/usr/lib/systemd/system/check-mk-agent.socket" 2>/dev/null # NOTE: The above line just reads back the socket file we deployed ourselves. Systemd units can be altered by # other user defined unit files, so this *may* not be correct. A better way of doing this seemed to be querying # systemctl itself about the 'effective' property: # # systemctl show --property IPAddressAllow "check-mk-agent.socket" | sed 's/IPAddressAllow=/OnlyFrom: /' # # However this ("successfully") reports an empty list or '[unprintable]' on older systemd versions :-( fi # # BEGIN COMMON AGENT CODE # if [ -n "${NO_PYTHON}" ]; then python_fail_msg="No suitable python installation found." elif [ -n "${WRONG_PYTHON_COMMAND}" ]; then python_fail_msg="Configured python command not found." fi cat </dev/null >&2 || return printf "<<>>\n" cmk-agent-ctl status --json --no-query-remote } section_checkmk_agent_plugins() { printf "<<>>\n" printf "pluginsdir %s\n" "${PLUGINSDIR}" printf "localdir %s\n" "${LOCALDIR}" for script in \ "${PLUGINSDIR}"/* \ "${PLUGINSDIR}"/[1-9]*/* \ "${LOCALDIR}"/* \ "${LOCALDIR}"/[1-9]*/*; do if is_valid_plugin "${script}"; then script_version=$(grep -e '^__version__' -e '^CMK_VERSION' "${script}" || echo 'CMK_VERSION="unversioned"') printf "%s:%s\n" "${script}" "${script_version}" fi done } section_checkmk_failed_plugin() { ${MK_RUN_SYNC_PARTS} || return echo "<<>>" echo "FailedPythonPlugins: ${1}" } # # END COMMON AGENT CODE # # # CHECK SECTIONS # section_labels() { echo '<<>>' if [ -n "${IS_DOCKERIZED}" ] || [ -n "${IS_LXC_CONTAINER}" ]; then echo '{"cmk/device_type":"container"}' elif grep "hypervisor" /proc/cpuinfo >/dev/null 2>&1; then echo '{"cmk/device_type":"vm"}' fi } section_mem() { if [ -n "${IS_DOCKERIZED}" ]; then echo "<<>>" if [ -n "${IS_CGROUP_V2}" ]; then cat /sys/fs/cgroup/memory.stat echo "memory.current $(cat /sys/fs/cgroup/memory.current)" echo "memory.max $(cat /sys/fs/cgroup/memory.max)" else cat /sys/fs/cgroup/memory/memory.stat echo "usage_in_bytes $(cat /sys/fs/cgroup/memory/memory.usage_in_bytes)" echo "limit_in_bytes $(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)" fi grep -F 'MemTotal:' /proc/meminfo elif [ -n "${IS_LXC_CONTAINER}" ]; then echo '<<>>' grep -v -E '^Swap:|^Mem:|total:|^Vmalloc|^Committed' >>' grep -v -E '^Swap:|^Mem:|total:' >>' echo "$(cat /proc/loadavg) ${NUM_CPUS}" if [ -f "/proc/sys/kernel/threads-max" ]; then cat /proc/sys/kernel/threads-max fi else if [ -n "${IS_DOCKERIZED}" ]; then echo "<<>>" else echo "<<>>" fi if [ -n "${IS_CGROUP_V2}" ]; then echo "uptime $(cat /proc/uptime)" echo "num_cpus ${NUM_CPUS}" cat /sys/fs/cgroup/cpu.stat else grep "^cpu " /proc/stat echo "num_cpus ${NUM_CPUS}" cat /sys/fs/cgroup/cpuacct/cpuacct.stat fi fi } section_uptime() { echo '<<>>' if [ -z "${IS_DOCKERIZED}" ]; then cat /proc/uptime else echo "$(($(get_epoch) - $(stat -c %Z /dev/pts)))" fi } # Print out Partitions / Filesystems. (-P gives non-wrapped POSIXed output) # Heads up: NFS-mounts are generally supressed to avoid agent hangs. # If hard NFS mounts are configured or you have too large nfs retry/timeout # settings, accessing those mounts from the agent would leave you with # thousands of agent processes and, ultimately, a dead monitored system. # These should generally be monitored on the NFS server, not on the clients. section_df() { if [ -n "${IS_DOCKERIZED}" ]; then return fi # The exclusion list is getting a bit of a problem. # -l should hide any remote FS but seems to be all but working. excludefs="-x smbfs -x cifs -x iso9660 -x udf -x nfsv4 -x nfs -x mvfs -x prl_fs -x squashfs -x devtmpfs -x autofs -x beegfs" if [ -z "${IS_LXC_CONTAINER}" ]; then excludefs="${excludefs} -x zfs" fi echo '<<>>' # We really *need* word splitting below! # shellcheck disable=SC2086 df -PTlk ${excludefs} | sed 1d # df inodes information echo '<<>>' echo '[df_inodes_start]' # We really *need* word splitting below! # shellcheck disable=SC2086 df -PTli ${excludefs} | sed 1d echo '[df_inodes_end]' if inpath lsblk; then echo "[df_lsblk_start]" lsblk --list --paths --output NAME,UUID echo "[df_lsblk_end]" fi } section_systemd() { if inpath systemctl; then echo '<<>>' # use plain to force ASCII output that is simpler to parse echo "[list-unit-files]" systemctl list-unit-files --full --no-legend --no-pager --plain --type service --type socket | tr -s ' ' echo "[status]" systemctl status --all --type service --type socket --no-pager --lines 0 | tr -s ' ' echo "[all]" systemctl --all --type service --type socket --full --no-legend --no-pager --plain | sed '/^$/q' | tr -s ' ' fi } section_zfs() { if inpath zfs; then echo '<<>>' zfs get -t filesystem,volume -Hp name,quota,used,avail,mountpoint,type 2>/dev/null echo '<<>>' echo '[df]' df -PTlk -t zfs | sed 1d fi } section_nfs_mounts() { if inpath waitmax; then STAT_VERSION=$(stat --version | head -1 | cut -d" " -f4) STAT_BROKE="5.3.0" json_templ() { echo '{"mountpoint": "'"${1}"'", "source": "'"${2}"'", "state": "ok", "usage": {"total_blocks": %b, "free_blocks_su": %f, "free_blocks": %a, "blocksize": %s}}' } json_templ_empty() { echo '{"mountpoint": "'"${1}"'", "source": "'"${2}"'", "state": "hanging", "usage": {"total_blocks": 0, "free_blocks_su": 0, "free_blocks": 0, "blocksize": 0}}' } echo '<<>>' sed -n '/ nfs4\? /s/\([^ ]*\) \([^ ]*\) .*/\1 \2/p' >>' sed -n -e '/ cifs /s/.*\ \([^ ]*\)\ cifs\ .*/\1/p' >>' grep ^/dev >>' echo "[time]" get_epoch echo "[processes]" CGROUP="" if [ -e /sys/fs/cgroup ]; then CGROUP="cgroup:512," fi echo "[header] $(ps ax -ww -o "${CGROUP}"user:32,vsz,rss,cputime,etime,pid,command | tr -s ' ')" fi } section_lnx_if() { if inpath ip; then echo '<<>>' echo "[start_iplink]" ip address echo "[end_iplink]" fi echo '<<>>' sed 1,2d /proc/net/dev sed -e 1,2d /proc/net/dev | cut -d':' -f1 | sort | while read -r eth; do echo "[${eth}]" if inpath ethtool; then ethtool "${eth}" | grep -E '(Speed|Duplex|Link detected|Auto-negotiation):' else # If interface down we get "Invalid argument" speed=$(cat "/sys/class/net/${eth}/speed" 2>/dev/null) if [ -n "${speed}" ] && [ "${speed}" -ge 0 ]; then echo "Speed: ${speed}Mb/s" fi fi echo "Address: $(cat "/sys/class/net/${eth}/address")" done } section_bonding_interfaces() { ( cd /proc/net/bonding 2>/dev/null || return echo '<<>>' head -v -n 1000 ./* ) } section_vswitch_bonding() { if inpath ovs-appctl; then BONDS=$(ovs-appctl bond/list) COL=$(echo "${BONDS}" | awk '{for(i=1;i<=NF;i++) {if($i == "bond") printf("%d", i)} exit 0}') echo '<<>>' for bond in $(echo "${BONDS}" | sed -e 1d | cut -f"${COL}"); do echo "[${bond}]" ovs-appctl bond/show "${bond}" done fi } section_tcp() { if inpath waitmax; then echo '<<>>' if OUTPUT=$(waitmax 5 cat /proc/net/tcp /proc/net/tcp6 2>/dev/null | awk ' /:/ { c[$4]++; } END { for (x in c) { print x, c[x]; } }'); then echo "${OUTPUT}" elif inpath ss; then ss -ant | grep -v ^State | awk ' /:/ { c[$1]++; } END { for (x in c) { print x, c[x]; } }' | sed -e 's/^ESTAB/01/g;s/^SYN-SENT/02/g;s/^SYN-RECV/03/g;s/^FIN-WAIT-1/04/g;s/^FIN-WAIT-2/05/g;s/^TIME-WAIT/06/g;s/^CLOSED/07/g;s/^CLOSE-WAIT/08/g;s/^LAST-ACK/09/g;s/^LISTEN/0A/g;s/^CLOSING/0B/g;' fi fi } section_multipathing() { if inpath multipath; then echo '<<>>' multipath -l fi } section_diskstat() { if [ -z "${IS_DOCKERIZED}" ]; then echo '<<>>' get_epoch grep -E ' (x?[shv]d[a-z]*[0-9]*|cciss/c[0-9]+d[0-9]+|emcpower[a-z]+|dm-[0-9]+|VxVM.*|mmcblk.*|dasd[a-z]*|bcache[0-9]+|nvme[0-9]+n[0-9]+) ' >>" echo "[time]" get_epoch if [ -n "${IS_CGROUP_V2}" ]; then echo "[io.stat]" cat "/sys/fs/cgroup/io.stat" else for F in io_service_bytes io_serviced; do echo "[${F}]" cat "/sys/fs/cgroup/blkio/blkio.throttle.${F}" done fi echo "[names]" for F in /sys/block/*; do echo "${F##*/} $(cat "${F}/dev")" done fi } section_chrony() { if inpath chronyc; then # Force successful exit code. Otherwise section will be missing if daemon not running # # The "| cat" has been added for some kind of regression in RedHat 7.5. The # SELinux rules shipped with that release were denying the chronyc call # without cat. _run_cached_internal "chrony" 30 120 200 20 "echo '<<>>'; waitmax 5 chronyc -n tracking | cat || true" fi } section_kernel() { if [ -z "${IS_DOCKERIZED}" ] && [ -z "${IS_LXC_CONTAINER}" ]; then echo '<<>>' get_epoch cat /proc/vmstat /proc/stat fi } section_ipmitool() { if inpath ipmitool; then _run_cached_internal "ipmi" 300 300 900 600 "echo '<<>>'; waitmax 300 ipmitool sensor list | grep -v 'command failed' | grep -v -E '^[^ ]+ na ' | grep -v ' discrete '" # readable discrete sensor states _run_cached_internal "ipmi_discrete" 300 300 900 600 "echo '<<>>'; waitmax 300 ipmitool sdr elist compact" fi } section_ipmisensors() { inpath ipmi-sensors && ls /dev/ipmi* >/dev/null || return ${MK_RUN_SYNC_PARTS} && echo '<<>>' # Newer ipmi-sensors version have new output format; Legacy format can be used if ipmi-sensors --help | grep -q legacy-output; then IPMI_FORMAT="--legacy-output" else IPMI_FORMAT="" fi if ipmi-sensors --help | grep -q " \-\-groups"; then IPMI_GROUP_OPT="-g" else IPMI_GROUP_OPT="-t" fi # At least with ipmi-sensors 0.7.16 this group is Power_Unit instead of "Power Unit" _run_cached_internal "ipmi_sensors" 300 300 900 600 "echo '<<>>'; for class in Temperature Power_Unit Fan; do ipmi-sensors ${IPMI_FORMAT} --sdr-cache-directory /var/cache ${IPMI_GROUP_OPT} \"\${class}\" | sed -e 's/ /_/g' -e 's/:_\?/ /g' -e 's@ \([^(]*\)_(\([^)]*\))@ \2_\1@' # In case of a timeout immediately leave loop. if [ $? = 255 ]; then break; fi done" } section_md() { echo '<<>>' cat /proc/mdstat } section_dm_raid() { if inpath dmraid && DMSTATUS=$(waitmax 3 dmraid -r); then echo '<<>>' # Output name and status waitmax 20 dmraid -s | grep -e ^name -e ^status # Output disk names of the RAID disks DISKS=$(echo "${DMSTATUS}" | cut -f1 -d":") for disk in ${DISKS}; do device=$(cat /sys/block/"$(basename "${disk}")"/device/model) status=$(echo "${DMSTATUS}" | grep "^${disk}") echo "${status} Model: ${device}" done fi } section_cfggen() { if inpath cfggen; then echo '<<>>' cfggen 0 DISPLAY | grep -E '(Target ID|State|Volume ID|Status of volume)[[:space:]]*:' | sed -e 's/ *//g' -e 's/:/ /' fi } section_storcli() { if inpath storcli; then _storcli() { storcli "$@"; } elif inpath storcli64; then _storcli() { storcli64 "$@"; } else return 1 fi echo '<<>>' _storcli /call/eall/sall show all echo '<<>>' _storcli /call/vall show all echo '<<>>' _storcli /call/cv show all # exit successfully, because storcli was in the path. return 0 } section_megaraid() { section_storcli && return if inpath MegaCli; then MegaCli_bin="MegaCli" elif inpath MegaCli64; then MegaCli_bin="MegaCli64" elif inpath megacli; then MegaCli_bin="megacli" else return 1 fi echo '<<>>' for part in $(${MegaCli_bin} -EncInfo -aALL -NoLog >>' ${MegaCli_bin} -LDInfo -Lall -aALL -NoLog >>' ${MegaCli_bin} -AdpBbuCmd -GetBbuStatus -aALL -NoLog >>' tw_cli "/${C}" show all | grep -E 'Model =|Firmware|Serial' echo '<<<3ware_disks>>>' tw_cli "/${C}" show drivestatus | grep -E 'p[0-9]' | sed "s/^/${C}\//" echo '<<<3ware_units>>>' tw_cli "/${C}" show unitstatus | grep -E 'u[0-9]' | sed "s/^/${C}\//" done fi } section_areca_raid() { if inpath cli64; then _run_cached_internal "arc_raid_status" 300 300 900 600 "echo '<<>>'; cli64 rsf info | tail -n +3 | head -n -2" fi } section_vbox_guest() { echo '<<>>' if inpath VBoxControl && lsmod | grep vboxguest >/dev/null 2>&1; then (VBoxControl -nologo guestproperty enumerate || echo "ERROR") | cut -d, -f1,2 fi } section_openvpn() { if [ -e /etc/openvpn/openvpn-status.log ]; then echo '<<>>' sed -n -e '/CLIENT LIST/,/ROUTING TABLE/p' >>' for var in GPUErrors GPUCoreTemp; do DISPLAY=:0 waitmax 2 nvidia-settings -t -q ${var} | sed "s/^/${var}: /" done fi } section_drbd() { if [ -z "${IS_DOCKERIZED}" ] && [ -z "${IS_LXC_CONTAINER}" ] && [ -e /proc/drbd ]; then echo '<<>>' cat /proc/drbd cat /sys/kernel/debug/drbd/resources/*/connections/*/0/proc_drbd 2>/dev/null fi } section_heartbeat() { if [ -S /var/run/heartbeat/crm/cib_ro ] || [ -S /var/run/crm/cib_ro ] || pgrep "^(crmd|pacemaker-contr)$" >/dev/null 2>&1; then echo '<<>>' TZ=UTC crm_mon -1 -r | grep -v ^$ | sed 's/^ //; /^\sResource Group:/,$ s/^\s//; s/^\s/_/g' fi if inpath cl_status; then echo '<<>>' cl_status rscstatus echo '<<>>' for NODE in $(cl_status listnodes); do if [ "${NODE}" != "$(uname -n | tr '[:upper:]' '[:lower:]')" ]; then STATUS=$(cl_status nodestatus "${NODE}") printf "%s %s" "${NODE}" "${STATUS}" for LINK in $(cl_status listhblinks "${NODE}" 2>/dev/null); do printf " %s %s" "${LINK}" "$(cl_status hblinkstatus "${NODE}" "${LINK}")" done echo fi done fi } ## Postfix mailqueue monitoring ## Determine the number of mails and their size in several postfix mail queues read_postfix_queue_dirs() { postfix_queue_dir=${1} if [ -n "${postfix_queue_dir}" ]; then echo '<<>>' echo "[[[${2}]]]" for queue in deferred active; do count=$(find "${postfix_queue_dir}/${queue}" -type f | wc -l) size=$(du -s "${postfix_queue_dir}/${queue}" | awk '{print $1 }') if [ -z "${size}" ]; then size=0 fi if [ -z "${count}" ]; then echo "Mail queue is empty" else echo "QUEUE_${queue} ${size} ${count}" fi done fi } ## Postfix status monitoring read_postfix_master_pid() { postfix_queue_dir=${1} postfix_instance_name=${2:-postfix} echo "<<>>" if [ -e "${postfix_queue_dir}/pid/master.pid" ]; then if [ -r "${postfix_queue_dir}/pid/master.pid" ]; then postfix_pid=$(sed 's/ //g' <"${postfix_queue_dir}/pid/master.pid") # handle possible spaces in output if readlink -- "/proc/${postfix_pid}/exe" | grep -q ".*postfix/\(s\?bin/\)\?master.*"; then echo "${postfix_instance_name}:the Postfix mail system is running:PID:${postfix_pid}" else echo "${postfix_instance_name}:PID file exists but instance is not running!" fi else echo "${postfix_instance_name}:PID file exists but is not readable" fi else echo "${postfix_instance_name}:the Postfix mail system is not running" fi } ## Postfix mailqueue monitoring ## Determine the number of mails and their size in several postfix mail queue section_mailqueue() { if inpath postconf; then # Check if multi_instance_directories exists in main.cf and is not empty # always takes the last entry, multiple entries possible multi_instances_dirs=$(postconf -c /etc/postfix 2>/dev/null | grep ^multi_instance_directories | sed 's/.*=[[:space:]]*//g') if [ -n "${multi_instances_dirs}" ]; then for queue_dir in ${multi_instances_dirs}; do if [ -n "${queue_dir}" ]; then postfix_queue_dir=$(postconf -c "${queue_dir}" 2>/dev/null | grep ^queue_directory | sed 's/.*=[[:space:]]*//g') read_postfix_queue_dirs "${postfix_queue_dir}" "${queue_dir}" postfix_instance_name=$(postconf -c "${queue_dir}" -h multi_instance_name 2>/dev/null) read_postfix_master_pid "${postfix_queue_dir}" "${postfix_instance_name}" fi done fi # Always check for the default queue. It can exist even if multiple instances are configured read_postfix_queue_dirs "$(postconf -h queue_directory 2>/dev/null)" read_postfix_master_pid "$(postconf -h queue_directory 2>/dev/null)" elif [ -x /usr/sbin/ssmtp ]; then echo '<<>>' mailq 2>&1 | sed 's/^[^:]*: \(.*\)/\1/' | tail -n 6 fi # Check status of qmail mailqueue if inpath qmail-qstat; then echo "<<>>" qmail-qstat fi # Nullmailer queue monitoring if inpath nullmailer-send; then echo '<<>>' if [ -d /var/spool/nullmailer/queue ]; then COUNT=$(find /var/spool/nullmailer/queue -type f | wc -l) SIZE=$(du -s /var/spool/nullmailer/queue | awk '{print $1 }') echo "${SIZE} ${COUNT} deferred" fi if [ -d /var/spool/nullmailer/failed ]; then COUNT=$(find /var/spool/nullmailer/failed -type f | wc -l) SIZE=$(du -s /var/spool/nullmailer/failed | awk '{print $1 }') echo "${SIZE} ${COUNT} failed" fi fi } section_omd() { if inpath omd; then # 60 is _probably_ the agents polling interval. Why would you use that?? _run_cached_internal "omd_status" 60 60 180 120 "echo '<<>>'; omd status --bare || true" ${MK_RUN_SYNC_PARTS} || return echo '<<>>' get_epoch for statefile in /omd/sites/*/var/log/mknotifyd.state; do if [ -e "${statefile}" ]; then site=${statefile%/var/log*} site=${site#/omd/sites/} echo "[${site}]" grep -v '^#' <"${statefile}" fi done echo '<<>>' for statsfile in /omd/sites/*/var/log/apache/stats; do if [ -e "${statsfile}" ]; then site=${statsfile%/var/log*} site=${site#/omd/sites/} echo "[${site}]" cat "${statsfile}" : >"${statsfile}" # prevent next section to fail caused by a missing newline at the end of the statsfile echo fi done _du_no_errors() { if [ -e "${1}" ]; then output=$(du -bs "$1") && printf "%s\n" "${output}" else printf "0 %s\n" "${1}" fi } echo '<<>>' for sitedir in /omd/sites/*; do site=${sitedir#/omd/sites/} echo "[site ${site}]" _du_no_errors "$sitedir" _du_no_errors "$sitedir/var/log" _du_no_errors "$sitedir/var/check_mk/rrd" _du_no_errors "$sitedir/var/pnp4nagios/" _du_no_errors "$sitedir/tmp/" _du_no_errors "$sitedir/local/" _du_no_errors "$sitedir/var/check_mk/agents/" _du_no_errors "$sitedir/var/mkeventd/history/" _du_no_errors "$sitedir/var/check_mk/core/" _du_no_errors "$sitedir/var/check_mk/inventory_archive/" done echo '<<>>' echo '[versions]' echo 'version;number;edition;demo' for versiondir in /omd/versions/*; do version=${versiondir#/omd/versions/} # filter out special directory 'default' if [ "${version}" = "default" ]; then continue fi number=${version} demo="0" if [ "${version##*.}" = "demo" ]; then number=${version%.demo} demo="1" fi edition=${number##*.} number=${number%.*} echo "${version};${number};${edition};${demo}" done echo '[sites]' echo 'site;used_version;autostart' for sitedir in /omd/sites/*; do site=${sitedir#/omd/sites/} used_version=$(readlink "${sitedir}"/version) used_version=${used_version##*/} autostart="0" if grep -q "CONFIG_AUTOSTART[[:blank:]]*=[[:blank:]]*'on'" "${sitedir}"/etc/omd/site.conf; then autostart="1" fi echo "${site};${used_version};${autostart}" done fi } section_zpool() { if inpath zpool; then echo "<<>>" zpool status -x echo "<<>>" zpool list fi } section_veritas_cluster() { if [ -x /opt/VRTSvcs/bin/haclus ]; then echo "<<>>" vcshost=$(hostname | cut -d. -f1) waitmax -s 9 2 /opt/VRTSvcs/bin/haclus -display -localclus | grep -e ClusterName -e ClusState waitmax -s 9 2 /opt/VRTSvcs/bin/hasys -display -attribute SysState waitmax -s 9 2 /opt/VRTSvcs/bin/hagrp -display -sys "${vcshost}" -attribute State -localclus waitmax -s 9 2 /opt/VRTSvcs/bin/hares -display -sys "${vcshost}" -attribute State -localclus waitmax -s 9 2 /opt/VRTSvcs/bin/hagrp -display -attribute TFrozen -attribute Frozen fi } section_omd_core() { ( cd /omd/sites || return # The files within a site are site-user writable! Therefore we must not use them! # The version files are only root writable so we can use them instead. site_version() { printf "%s" "$(realpath "${1}/version" | sed 's|.*/||')" } site_cmd() { # DO NOT ACCESS /omd/sites/${site}/bin/cmd directly. # bin might point anywhere -> priv escalation. printf "/omd/versions/%s/bin/%s" "$(site_version "${1}")" "${2}" } site_lib() { printf "/omd/versions/%s/lib" "$(site_version "${1}")" } waitmax_for_unixcat_with_site_ld_library_path() { LD_LIBRARY_PATH="$(site_lib "${2}"):${LD_LIBRARY_PATH}" waitmax "${1}" "$(site_cmd "${2}" unixcat)" "/omd/sites/${2}/tmp/run/${3}" } echo '<<>>' for site in *; do if [ -S "/omd/sites/${site}/tmp/run/live" ]; then echo "[${site}]" echo "GET status" | waitmax_for_unixcat_with_site_ld_library_path 3 "${site}" "live" fi done echo '<<>>' for site in *; do echo "[${site}]" for PEM_PATH in "/omd/sites/${site}/etc/ssl/ca.pem" "/omd/sites/${site}/etc/ssl/sites/${site}.pem"; do if [ -f "${PEM_PATH}" ]; then CERT_DATE=$(openssl x509 -enddate -noout -in "${PEM_PATH}" | sed 's/notAfter=//') echo "${PEM_PATH}|$(date --date="${CERT_DATE}" --utc +%s)" fi done done echo '<<>>' for site in *; do if [ -S "/omd/sites/${site}/tmp/run/mkeventd/status" ]; then echo "[\"${site}\"]" (echo "GET status" && echo "OutputFormat: json") | waitmax_for_unixcat_with_site_ld_library_path 3 "${site}" "mkeventd/status" fi done echo '<<>>' for site in *; do if [ -S "/omd/sites/${site}/tmp/run/live" ]; then echo "[${site}]" waitmax_for_unixcat_with_site_ld_library_path 5 "${site}" "live" < 0 Filter: custom_variable_names < _REALNAME LimitString waitmax_for_unixcat_with_site_ld_library_path 5 "${site}" "live" < 0 Stats: host_scheduled_downtime_depth > 0 StatsOr: 2 Stats: scheduled_downtime_depth = 0 Stats: host_scheduled_downtime_depth = 0 Stats: host_state != 0 StatsAnd: 3 Stats: state = 1 Stats: scheduled_downtime_depth = 0 Stats: host_scheduled_downtime_depth = 0 Stats: host_state = 0 Stats: host_has_been_checked = 1 StatsAnd: 5 Stats: state = 3 Stats: scheduled_downtime_depth = 0 Stats: host_scheduled_downtime_depth = 0 Stats: host_state = 0 Stats: host_has_been_checked = 1 StatsAnd: 5 Stats: state = 2 Stats: scheduled_downtime_depth = 0 Stats: host_scheduled_downtime_depth = 0 Stats: host_state = 0 Stats: host_has_been_checked = 1 StatsAnd: 5 Filter: host_custom_variable_names < _REALNAME LimitString fi done ) } section_mkbackup() { if ls /omd/sites/*/var/check_mk/backup/*.state >/dev/null 2>&1; then echo "<<>>" for F in /omd/sites/*/var/check_mk/backup/*.state; do SITE=${F#/*/*/*} SITE=${SITE%%/*} JOB_IDENT=${F%.state} JOB_IDENT=${JOB_IDENT##*/} if [ "${JOB_IDENT}" != "restore" ]; then echo "[[[site:${SITE}:${JOB_IDENT}]]]" cat "${F}" echo fi done fi # Collect states of configured CMA backup jobs if inpath mkbackup && ls /var/lib/mkbackup/*.state >/dev/null 2>&1; then echo "<<>>" for F in /var/lib/mkbackup/*.state; do JOB_IDENT=${F%.state} JOB_IDENT=${JOB_IDENT##*/} if [ "${JOB_IDENT}" != "restore" ]; then echo "[[[system:${JOB_IDENT}]]]" cat "${F}" echo fi done fi } section_thermal() { if [ -z "${IS_DOCKERIZED}" ] && [ -z "${IS_LXC_CONTAINER}" ] && ls /sys/class/thermal/thermal_zone* >/dev/null 2>&1; then echo '<<>>' for F in /sys/class/thermal/thermal_zone*; do line="${F##*/}" if [ ! -e "${F}/mode" ]; then line="${line}|-" else line="${line}|$(cat "${F}"/mode)" fi line="${line}|$(cat "${F}/type")|$(cat "${F}/temp")" for G in "${F}"/trip_point_*_temp; do line="${line}|$(cat "$G")|$(cat "${G/%temp/type}")" done echo "$line" done fi } section_libelle() { if inpath trd; then echo "<<>>" trd -s fi } section_http_accelerator() { if inpath varnishstat; then echo "<<>>" varnishstat -1 fi } section_proxmox() { if inpath pvecm; then echo "<<>>" pvecm status echo "<<>>" pvecm nodes fi } section_haproxy() { for HAPROXY_SOCK in /run/haproxy/admin.sock /var/lib/haproxy/stats; do if [ -r "${HAPROXY_SOCK}" ] && inpath socat; then echo "<<>>" echo "show stat" | socat - "UNIX-CONNECT:${HAPROXY_SOCK}" fi done } # # BEGIN COMMON AGENT CODE # section_job() { # Get statistics about monitored jobs. _cat_files() { # read file names from stdin and write like `head -n -0 -v file` while read -r file; do printf "==> %s <==\n" "${file##./}" cat "${file}" done } ( cd "${MK_VARDIR}/job" 2>/dev/null || return printf "<<>>\n" for user in *; do ( cd "${user}" 2>/dev/null || return # return from subshell only # This folder is owned (and thus writable) by the user that ran the jobs. # The agent (root) must not read files that are not owned by the user. # This prevents symlink or hardlink attacks. find -L . -type f -user "${user}" | _cat_files ) done ) } section_fileinfo() { # fileinfo check: put patterns for files into /etc/check_mk/fileinfo.cfg perl -e ' use File::Glob "bsd_glob"; my @patterns = (); foreach (bsd_glob("$ARGV[0]/fileinfo.cfg"), bsd_glob("$ARGV[0]/fileinfo.d/*")) { open my $handle, "<", $_ or next; while (<$handle>) { chomp; next if /^\s*(#|$)/; my $pattern = $_; $pattern =~ s/\$DATE:(.*?)\$/substr(`date +"$1"`, 0, -1)/eg; push @patterns, $pattern; } warn "error while reading $_: $!\n" if $!; close $handle; } exit if ! @patterns; my $file_stats = ""; foreach (@patterns) { foreach (bsd_glob("$_")) { if (! -f) { $file_stats .= "$_|missing\n" if ! -d; } elsif (my @infos = stat) { $file_stats .= "$_|ok|$infos[7]|$infos[9]\n"; } else { $file_stats .= "$_|stat failed: $!\n"; } } } print "<<>>\n", time, "\n[[[header]]]\nname|status|size|time\n[[[content]]]\n$file_stats"; ' -- "${MK_CONFDIR}" } # # END COMMON AGENT CODE # # ntpq helper function get_ntpq() { inpath ntpq || return 1 _run_cached_internal "ntp" 30 120 200 20 "echo '<<>>'; waitmax 5 ntpq -np | sed -e 1,2d -e 's/^\(.\)/\1 /' -e 's/^ /%/' || true" } section_timesyncd() { if [ -n "${IS_DOCKERIZED}" ] || [ -n "${IS_LXC_CONTAINER}" ]; then return 0 fi inpath systemctl || return 1 inpath timedatectl || return 1 systemctl is-enabled systemd-timesyncd.service >/dev/null 2>&1 || return 1 # debian 10.8 uses ConditionFileIsExecutable to "disable" systemd-timedatectl when ntp is installed. # The service is still enabled, but does not start timesyncd as the condition is not met. (inpath ntpd || inpath openntpd || inpath chronyd || inpath VBoxService) && return 1 # we check the same condition as the systemd condition timedatectl timesync-status >/dev/null 2>&1 || return 1 ${MK_RUN_SYNC_PARTS} || return 0 echo "<<>>" timedatectl timesync-status # /run/systemd/timesync/synchronized is a more reliable/the correct file to look at for > systemd v250 if [ -f "/run/systemd/timesync/synchronized" ]; then get_file_mtime /run/systemd/timesync/synchronized | awk '{print "[[["$1"]]]"}' else get_file_mtime /var/lib/systemd/timesync/clock | awk '{print "[[["$1"]]]"}' fi echo "<<>>" timedatectl show-timesync | awk '/NTPMessage/{print $0}' timedatectl show | awk '/Timezone/{print $0}' return 0 # intended not to execute section_ntp even in the case where get_file_mtime fails } section_ntp() { if [ -n "${IS_DOCKERIZED}" ] || [ -n "${IS_LXC_CONTAINER}" ]; then return 0 fi # First we try to identify if we're beholden to systemd if inpath systemctl; then # shellcheck disable=SC2016 if [ "$(systemctl | awk '/ntp.service|ntpd.service|ntpsec.service/{print $3; exit}')" = "active" ]; then # remove heading, make first column space separated get_ntpq return fi fi # If we get to this point, we attempt via classic ntp daemons (ntpq required) if inpath ntpq; then # Try to determine status via /etc/init.d # This might also be appropriate for AIX, Solaris and others for _ntp_daemon in ntp ntpd openntpd; do # Check for a service script if [ -x /etc/init.d/"${_ntp_daemon}" ]; then # If the status returns 0, we assume we have a running service if /etc/init.d/"${_ntp_daemon}" status >/dev/null 2>&1; then get_ntpq return fi fi done unset -v _ntp_daemon # For other systems such as Slackware if [ -x "/etc/rc.d/rc.ntpd" ]; then get_ntpq return fi fi } run_real_time_checks() { RTC_PLUGINS="" # shellcheck source=agents/cfg_examples/real_time_checks.cfg . "${MK_CONFDIR}/real_time_checks.cfg" 2>/dev/null || return if [ -z "${RTC_SECRET}" ]; then define_optionally_encrypt "no" else inpath openssl || { echo "ERROR: openssl command is missing, but encryption is requested. Not sending real-time data." >&2 return } define_optionally_encrypt "yes" fi for trigger in "${MK_VARDIR}/rtc_remotes/"?*; do # no such file => no expansion of ?* => nothing to do # braces are needed so run_real_time_checks_for_remote can be forked away # otherwise async execution of check plugins is held off by activated realtime checks. [ -e "${trigger}" ] && { run_real_time_checks_for_remote "${trigger}" "${RTC_SECRET}" >/dev/null & } done } _rt_pidfile_is_mine() { [ "$(cat "${1}" 2>/dev/null)" = "$$" ] } _rt_pidfile_is_alive() { [ "$(("$(get_epoch)" - "$(get_file_atime "${1}")"))" -le "${RTC_TIMEOUT}" ] } _rt_timestamp() { get_epoch | tr -d '\n' } _rt_sendudp() { # concatenate the output of all commands to a single udp packet dd bs=9999 iflag=fullblock 2>/dev/null >"/dev/udp/${1}/${2}" } # Implements Real-Time Check feature of the Checkmk agent which can send # some section data in 1 second resolution. Useful for fast notifications and # detailed graphing (if you configure your RRDs to this resolution). # 2 bytes: protocol version # 10 bytes: timestamp # rest: encrypted data # Be aware of maximum packet size. run_real_time_checks_for_remote() { pidfile="${1}" secret="${2}" remote="${pidfile##*/rtc_remotes/}" # have I already started for this remote? _rt_pidfile_is_mine "${pidfile}" && return echo $$ >"${pidfile}" while true; do _rt_pidfile_is_mine "${pidfile}" || return _rt_pidfile_is_alive "${pidfile}" || { rm "${pidfile}" return } for section in ${RTC_SECTIONS}; do section_"${section}" | optionally_encrypt "${secret}" "$(_rt_timestamp)" | _rt_sendudp "${remote}" "${RTC_PORT}" done # Plugins cd "${PLUGINSDIR}" || continue for script in ${RTC_PLUGINS}; do is_valid_plugin "${script}" || continue plugin_interpreter=$(get_plugin_interpreter "${script}") || continue "${plugin_interpreter}" "${script}" | optionally_encrypt "${secret}" "$(_rt_timestamp)" | _rt_sendudp "${remote}" "${RTC_PORT}" done sleep 1 done } # # BEGIN COMMON AGENT CODE # run_cached() { # Compatibility wrapper for plugins that might use run_cached. # We should have never exposed this as quasi API. NAME="${1}" MAXAGE="${2}" REFRESH_INTERVAL="${3}" shift 3 OUTPUT_TIMEOUT=$((MAXAGE * 3)) CREATION_TIMEOUT=$((MAXAGE * 2)) _run_cached_internal "${NAME}" "${REFRESH_INTERVAL}" "${MAXAGE}" "${OUTPUT_TIMEOUT}" "${CREATION_TIMEOUT}" "$@" } _run_cached_internal() { # Run a command asynchronous by use of a cache file. # Usage: _run_cached_internal NAME REFRESH_INTERVAL MAXAGE OUTPUT_TIMEOUT OUTPUT_TIMEOUT CREATION_TIMEOUT [COMMAND ...] # Note that while multiple COMMAND arguments are considered, they are evaluated in a string. # This means that extra escaping is required. # For example: # To run a cat command every two minutes, considering the created data valid for one three minutes, # send the created data for four minutes and allowing the command to run for 12 minutes, you'll have to call # # _run_cached_interal "my_file_content" 120 180 240 720 "cat \"My File\"" # # Mind the escaping... NAME="${1}" # name of the section (also used as cache file name) REFRESH_INTERVAL="${2}" # threshold in seconds when the cache file needs to be regenerated MAXAGE="${3}" # maximum cache livetime in seconds OUTPUT_TIMEOUT="${4}" # threshold in seconds for how long the cache file will be output (regardless of whether it is outdated) CREATION_TIMEOUT="${5}" # threshold in seconds for how long the process is allowed to be running before it is killed (see below for details) shift 5 # $* is now the command to run if ${DISABLE_CACHING:-false}; then # We need the re-splitting to be compatible with the caching case, so: # shellcheck disable=SC2068 $@ return fi [ -d "${MK_VARDIR}/cache" ] || mkdir -p "${MK_VARDIR}/cache" CACHEFILE="${MK_VARDIR}/cache/${NAME}.cache" FAIL_REPORT_FILE="${SPOOLDIR}/${NAME}.cachefail" NOW="$(get_epoch)" MTIME="$(get_file_mtime "${CACHEFILE}" 2>/dev/null)" || MTIME=0 if ${MK_RUN_SYNC_PARTS}; then if [ -s "${CACHEFILE}" ] && [ $((NOW - MTIME)) -le "${OUTPUT_TIMEOUT}" ]; then # Output the file (if it is not too outdated) CACHE_INFO="cached(${MTIME},${MAXAGE})" # prefix or insert cache info, unless already present. # WATCH OUT: AIX does not allow us to pass this as a single '-e' option! if [ "${NAME%%_*}" = "local" ] || [ "${NAME%%_*}" = "mrpe" ]; then sed -e '/^<<<.*>>>/{p;d;}' -e '/^cached([0-9]*,[0-9]*) /{p;d;}' -e "s/^/${CACHE_INFO} /" "${CACHEFILE}" else sed -e '/^<<<.*\(:cached(\).*>>>/{p;d;}' -e 's/^<<<\([^>]*\)>>>$/<<<\1:'"${CACHE_INFO}"'>>>/' "${CACHEFILE}" fi fi fi if ${MK_RUN_ASYNC_PARTS}; then # Kill the process if it is running too long (cache file not accessed for more than CREATION_TIMEOUT seconds). # If killing succeeds, remove CACHFILE.new.PID. # Write info about the timed out process and the kill attempt to the SPOOLDIR. # It will be reported to the server in the next (synchronous) agent execution. # The file will be deleted as soon as the plugin/local check is functional again. # Do not output the file here, it will interrupt the local and mrpe sections, as well as any other # partially cached section. for cfile in "${CACHEFILE}.new."*; do [ -e "${cfile}" ] || break # no match TRYING_SINCE="$(get_file_atime "${cfile}")" [ -n "${TRYING_SINCE}" ] || break # race condition: file vanished if [ $((NOW - TRYING_SINCE)) -ge "${CREATION_TIMEOUT}" ]; then { printf "<<>>\n" pid="${cfile##*.new.}" printf "timeout|%s|%s|%s\n" "${NAME}" "${CREATION_TIMEOUT}" "${pid}" kill -9 "${pid}" >/dev/null 2>&1 && sleep 2 # TODO: what about child processes? if [ -n "$(ps -o args= -p "${pid}")" ]; then printf "killfailed|%s|%s|%s\n" "${NAME}" "${CREATION_TIMEOUT}" "${pid}" else rm -f "${cfile}" fi } >"${FAIL_REPORT_FILE}" 2>&1 fi done # This does the right thing, regardless whether the pattern matches! _cfile_in_use() { for cfile in "${CACHEFILE}.new."*; do printf "%s\n" "${cfile}" break done } # Time to refresh cache file and new job not yet running? if [ $((NOW - MTIME)) -gt "${REFRESH_INTERVAL}" ] && [ ! -e "$(_cfile_in_use)" ]; then # Start it. If the command fails the output is thrown away cat </dev/null 2>&1 & eval '${MK_DEFINE_LOG_SECTION_TIME}' exec > "${CACHEFILE}.new.\$\$" || exit 1 $* && mv -f "${CACHEFILE}.new.\$\$" "${CACHEFILE}" && rm -f "${FAIL_REPORT_FILE}" || rm -f "${CACHEFILE}.new.\$\$" HERE fi fi unset NAME MAXAGE CREATION_TIMEOUT REFRESH_INTERVAL CACHEFILE NOW MTIME CACHE_INFO TRYING_SINCE OUTPUT_TIMEOUT } run_local_checks() { cd "${LOCALDIR}" || return if ${MK_RUN_SYNC_PARTS}; then echo '<<>>' for script in ./*; do if is_valid_plugin "${script}"; then _log_section_time "${script}" fi done fi # Call some local checks only every X'th second for script in [1-9]*/*; do if is_valid_plugin "${script}"; then interval="${script%/*}" _run_cached_internal "local_${script##*/}" "${interval}" "${interval}" $((interval * 3)) $((interval * 2)) "_log_section_time '${script}'" fi done } run_spooler() { ( cd "${SPOOLDIR}" 2>/dev/null || return now=$(get_epoch) for file in *; do [ "${file}" != "*" ] || return # If prefixed with a number, then that is the maximum age in seconds. # If the file is older than that, it is ignored. maxage="${file%%[^0-9]*}" if [ "${maxage}" ]; then mtime=$(get_file_mtime "${file}") [ $((now - mtime)) -le "${maxage}" ] || continue fi cat "${file}" done ) } get_plugin_interpreter() { # Return the interpreter (or "") for the plugin file (or fail). # We return the interpreter instead of wrapping the call, so we don't # have to export the function (which is not portable). # normalize input agent_plugin="${1#./}" extension="${agent_plugin##*.}" filename="${agent_plugin%.*}" # Execute all non python plugins with ./foo if [ "${extension}" != "py" ]; then return 0 fi if [ "${filename#"${filename%??}"}" != "_2" ]; then if [ -n "${NO_PYTHON}" ] || [ -n "${WRONG_PYTHON_COMMAND}" ]; then section_checkmk_failed_plugin "${agent_plugin}" return 1 fi if [ -n "${PYTHON3}" ]; then echo "${PYTHON3}" return 0 fi if [ ! -e "${filename}_2.py" ]; then section_checkmk_failed_plugin "${agent_plugin} (Missing Python 3 installation)" return 1 fi # no python3 found, but python2 plugin file present return 1 fi if [ -x "${filename%??}.py" ] && [ -n "${PYTHON3}" ]; then return 1 fi if [ -n "${PYTHON2}" ]; then echo "${PYTHON2}" return 0 fi section_checkmk_failed_plugin "${agent_plugin} (missing Python 2 installation)" return 1 } run_plugins() { cd "${PLUGINSDIR}" || return if ${MK_RUN_SYNC_PARTS}; then for script in ./*; do if is_valid_plugin "${script}"; then if plugin_interpreter=$(get_plugin_interpreter "${script}"); then # SC2086: We don't want to quote, interpreter is "nothing" if empty, not "''" # shellcheck disable=SC2086 _log_section_time ${plugin_interpreter} "${script}" fi fi done fi # Call some plugins only every X'th second for script in [1-9]*/*; do if is_valid_plugin "${script}"; then if plugin_interpreter=$(get_plugin_interpreter "${script}"); then interval="${script%/*}" # shellcheck disable=SC2086 _run_cached_internal "plugins_${script##*/}" "${interval}" "${interval}" $((interval * 3)) $((interval * 2)) _log_section_time ${plugin_interpreter} "${script}" fi fi done } _non_comment_lines() { grep -Ev '^[[:space:]]*($|#)' "${1}" } _mrpe_get_interval() { echo "${1}" | grep -E '^\([^)]*\)' | sed -n 's/^.*interval=\([^:)]*\).*$/\1/p' } _mrpe_normalize_spaces() { # watch out: # * [:blank:] does not include \t on AIX # * [:space:] does include \n on Linux tr -s '\t' ' ' } run_remote_plugins() { configfile="${1}" prefix="${2}" [ -f "${configfile}" ] || return _non_comment_lines "${configfile}" | _mrpe_normalize_spaces | while read -r descr rest; do interval="$(_mrpe_get_interval "${rest}")" cmdline="${rest#\(*\) }" if [ -n "${prefix}" ]; then cmdline="${prefix} '${cmdline}'" fi if [ -z "${interval}" ]; then ${MK_RUN_SYNC_PARTS} && run_mrpe "${descr}" "${cmdline}" else # Sourcing the agent here is not very performant, but we need 'run_mrpe', and not all shells support exporting of functions. _run_cached_internal "mrpe_${descr}" "${interval}" "${interval}" $((interval * 3)) $((interval * 2)) "MK_SOURCE_AGENT=yes . '${0}'; run_mrpe \"${descr}\" \"${cmdline}\"" fi done } run_mrpe() { descr="${1}" shift PLUGIN="${1%% *}" OUTPUT="$(eval "${MK_DEFINE_LOG_SECTION_TIME}; _log_section_time $*")" STATUS="$?" printf "<<>>\n" printf "(%s) %s %s %s" "${PLUGIN##*/}" "${descr}" "${STATUS}" "${OUTPUT}" | tr \\n \\1 printf "\n" unset descr PLUGIN OUTPUT STATUS } # # END COMMON AGENT CODE # run_runas_executor() { [ -f "${MK_CONFDIR}/runas.cfg" ] || return _non_comment_lines "${MK_CONFDIR}/runas.cfg" | while read -r type user configfile; do prefix="" if [ "${user}" != "-" ]; then prefix="su ${user} -c" fi # mrpe includes if [ "${type}" = "mrpe" ]; then run_remote_plugins "${configfile}" "${prefix}" # local and plugin includes elif [ "${type}" = "local" ] || [ "${type}" = "plugin" ]; then if ${MK_RUN_SYNC_PARTS}; then if [ "${type}" = "local" ]; then echo "<<>>" fi find "${configfile}" -executable -type f | while read -r filename; do if [ -n "${prefix}" ]; then # SC2086: We don't want to quote since prefix should not be treated as a # single string but as multiple arguments passed to _log_section_time # shellcheck disable=SC2086 _log_section_time ${prefix} "${filename}" else _log_section_time "${filename}" fi done fi fi done } run_purely_synchronous_sections() { _log_section_time section_checkmk _log_section_time section_cmk_agent_ctl_status [ -z "${MK_SKIP_CHECKMK_AGENT_PLUGINS}" ] && _log_section_time section_checkmk_agent_plugins [ -z "${MK_SKIP_LABELS}" ] && _log_section_time section_labels [ -z "${MK_SKIP_DF}" ] && _log_section_time section_df [ -z "${MK_SKIP_SYSTEMD}" ] && _log_section_time section_systemd # Filesystem usage for ZFS [ -z "${MK_SKIP_ZFS}" ] && _log_section_time section_zfs # Check NFS mounts by accessing them with stat -f (System # call statfs()). If this lasts more then 2 seconds we # consider it as hanging. We need waitmax. [ -z "${MK_SKIP_NFS_MOUNTS}" ] && _log_section_time section_nfs_mounts # Check mount options. Filesystems may switch to 'ro' in case # of a read error. [ -z "${MK_SKIP_MOUNTS}" ] && _log_section_time section_mounts [ -z "${MK_SKIP_PS}" ] && _log_section_time section_ps # Memory usage [ -z "${MK_SKIP_MEM}" ] && _log_section_time section_mem # Load and number of processes [ -z "${MK_SKIP_CPU}" ] && _log_section_time section_cpu # Uptime [ -z "${MK_SKIP_UPTIME}" ] && _log_section_time section_uptime # New variant: Information about speed and state in one section [ -z "${MK_SKIP_LNX_IF}" ] && _log_section_time section_lnx_if # Current state of bonding interfaces [ -z "${MK_SKIP_BONDING_IF}" ] && _log_section_time section_bonding_interfaces # Same for Open vSwitch bonding [ -z "${MK_SKIP_VSWITCH_BONDING}" ] && _log_section_time section_vswitch_bonding # Number of TCP connections in the various states [ -z "${MK_SKIP_TCP}" ] && _log_section_time section_tcp # Linux Multipathing [ -z "${MK_SKIP_MULTIPATHING}" ] && _log_section_time section_multipathing # Performancecounter Platten [ -z "${MK_SKIP_DISKSTAT}" ] && _log_section_time section_diskstat # Performancecounter Kernel [ -z "${MK_SKIP_KERNEL}" ] && _log_section_time section_kernel # RAID status of Linux software RAID [ -z "${MK_SKIP_MD}" ] && _log_section_time section_md # RAID status of Linux RAID via device mapper [ -z "${MK_SKIP_DM_RAID}" ] && _log_section_time section_dm_raid # RAID status of LSI controllers via cfggen [ -z "${MK_SKIP_CFGGEN}" ] && _log_section_time section_cfggen # RAID status of LSI MegaRAID controller via StorCLI or MegaCli. You can download that tool from: # https://docs.broadcom.com/docs/007.2007.0000.0000_Unified_StorCLI.zip [ -z "${MK_SKIP_MEGARAID}" ] && _log_section_time section_megaraid # RAID status of 3WARE disk controller (by Radoslaw Bak) [ -z "${MK_SKIP_THREE_WARE_RAID}" ] && _log_section_time section_3ware_raid # VirtualBox Guests. Section must always been output. Otherwise the # check would not be executed in case no guest additions are installed. # And that is something the check wants to detect [ -z "${MK_SKIP_VBOX_GUEST}" ] && _log_section_time section_vbox_guest # OpenVPN Clients. Currently we assume that the configuration # is in # /etc/openvpn. We might find a safer way to find the configuration later. [ -z "${MK_SKIP_OPENVPN}" ] && _log_section_time section_openvpn [ -z "${MK_SKIP_NVIDIA}" ] && _log_section_time section_nvidia [ -z "${MK_SKIP_DRBD}" ] && _log_section_time section_drbd # Heartbeat monitoring # Different handling for heartbeat clusters with and without CRM # for the resource state [ -z "${MK_SKIP_HEARTBEAT}" ] && _log_section_time section_heartbeat [ -z "${MK_SKIP_MAILQUEUE}" ] && _log_section_time section_mailqueue ## Check status of OMD sites and Checkmk Notification spooler # Welcome the ZFS check on Linux # We do not endorse running ZFS on linux if your vendor doesnt support it ;) # check zpool status [ -z "${MK_SKIP_ZPOOL}" ] && _log_section_time section_zpool # Veritas Cluster Server # Software is always installed in /opt/VRTSvcs. # Secure mode must be off to allow root to execute commands [ -z "${MK_SKIP_VERITAS}" ] && _log_section_time section_veritas_cluster ## Fileinfo-Check: put patterns for files into /etc/check_mk/fileinfo.cfg [ -z "${MK_SKIP_FILEINFO}" ] && _log_section_time section_fileinfo # Get stats about OMD monitoring cores running on this machine. # Since cd is a shell builtin the check does not affect the performance # on non-OMD machines. [ -z "${MK_SKIP_OMD_CORES}" ] && _log_section_time section_omd_core # Collect states of configured Checkmk site backup jobs _log_section_time section_mkbackup # Get statistics about monitored jobs. Below the job directory there # is a sub directory per user that ran a job. That directory must be # owned by the user so that a symlink or hardlink attack for reading # arbitrary files can be avoided. [ -z "${MK_SKIP_JOB}" ] && _log_section_time section_job # Gather thermal information provided e.g. by acpi # At the moment only supporting thermal sensors [ -z "${MK_SKIP_THERMAL}" ] && _log_section_time section_thermal # Libelle Business Shadow [ -z "${MK_SKIP_LIBELLE}" ] && _log_section_time section_libelle # HTTP Accelerator Statistics [ -z "${MK_SKIP_HTTP_ACCELERATOR}" ] && _log_section_time section_http_accelerator # Proxmox Cluster [ -z "${MK_SKIP_PROXMOX}" ] && _log_section_time section_proxmox [ -z "${MK_SKIP_HAPROXY}" ] && _log_section_time section_haproxy } run_partially_asynchronous_sections() { # Time synchronization with Chrony [ -z "${MK_SKIP_CHRONY}" ] && _log_section_time section_chrony # Hardware sensors via IPMI (need ipmitool) [ -z "${MK_SKIP_IPMITOOL}" ] && _log_section_time section_ipmitool # IPMI data via ipmi-sensors (of freeipmi). Please make sure, that if you # have installed freeipmi that IPMI is really support by your hardware. [ -z "${MK_SKIP_IPMISENSORS}" ] && _log_section_time section_ipmisensors # RAID controllers from areca (Taiwan) # cli64 can be found at ftp://ftp.areca.com.tw/RaidCards/AP_Drivers/Linux/CLI/ [ -z "${MK_SKIP_ARECA}" ] && _log_section_time section_areca_raid ## Check status of OMD sites and Checkmk Notification spooler [ -z "${MK_SKIP_OMD}" ] && _log_section_time section_omd if [ -z "${MK_SKIP_TIMESYNCHRONISATION}" ]; then _log_section_time section_timesyncd || _log_section_time section_ntp fi } main_setup() { set_up_remote # close stdin exec /dev/null fi set_up_get_epoch set_up_current_shell set_variable_defaults announce_remote # needs MK_VARDIR! PATH="$(set_up_path "${PATH}")" set_up_profiling unset_locale detect_python detect_container_environment set_up_encryption set_up_disabled_sections export_utility_functions } main_sync_parts() { run_purely_synchronous_sections _log_section_time run_spooler } main_mixed_parts() { run_partially_asynchronous_sections run_remote_plugins "${MK_CONFDIR}/mrpe.cfg" "" run_runas_executor run_local_checks run_plugins } main_async_parts() { # Start new liveupdate process in background # Starting a new live update process will terminate the old one automatically after # max. 1 sec. run_real_time_checks } main_finalize_sync() { finalize_profiling } # # BEGIN COMMON AGENT CODE # main() { while true; do main_setup "$@" ( ${MK_RUN_SYNC_PARTS} && main_sync_parts (${MK_RUN_ASYNC_PARTS} || ${MK_RUN_SYNC_PARTS}) && main_mixed_parts ${MK_RUN_ASYNC_PARTS} && main_async_parts ${MK_RUN_SYNC_PARTS} && main_finalize_sync ) | { if ${MK_RUN_SYNC_PARTS}; then optionally_encrypt "${PASSPHRASE}" ""; else cat; fi; } [ "${MK_LOOP_INTERVAL}" -gt 0 ] 2>/dev/null || return 0 sleep "${MK_LOOP_INTERVAL}" done } [ -z "${MK_SOURCE_AGENT}" ] && main "$@"