#!/usr/bin/env python3
"""
Nagios plugin to check Ceph RBD snapshot-based mirroring health.

The script connects to a Ceph cluster and inspects all pools with RBD
mirroring enabled in IMAGE mode. For each mirrored image, it evaluates the
mirror state (replaying or stopped) and, when stopped, parses peer status
metadata to detect stale or desynchronized snapshots based on a
configurable age cut-off.

The plugin reports OK when all images are synchronized, or CRITICAL when
unsynchronized, stale, or misconfigured images are detected. With the -w
option it reports WARNING when an excluded image is nevertheless up.
Unexpected internal failures are reported as UNKNOWN.

An optional YAML configuration file can be used to exclude images:
 - poolExclude: list of pool names to skip.
 - excludeRegex: list of regular expressions matched against "pool/image".

Images without mirroring that were created less than --age seconds ago are
also treated as excluded.

Performance data emitted by the plugin includes:
 - Number of mirrored images currently synchronized and up
 - Number of images in replaying state
 - Number of images in stopped state
 - Number of images excluded by configuration
 - Number of excluded images that are currently up
 - Number of images with errors (unsynchronized, stale, or misconfigured)
 - Total number of images checked (excluding excluded images)

When invoked with the -m option, the plugin prints the rbd commands
required to enable snapshot-based mirroring for images where mirroring is
currently disabled.

Required Ceph permissions for the client used (e.g., [client.nagios]):
 - mon = "allow r"
 - osd = "allow r, allow class-read"

Use the -h option to display all available command-line parameters.
"""

__author__ = 'Juan Ferrer Toribio '
__copyright__ = 'Copyright (c) 2025 Verdnatura Levante SL'
__license__ = 'GPLv3'
__version__ = '1.0.1'

import argparse
import json
import os
import re
import sys
import time
from dataclasses import dataclass, field
from enum import IntEnum

import rados
import rbd
import yaml


class Status(IntEnum):
    """Exit codes understood by Nagios."""
    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3


class PoolModes(IntEnum):
    """Pool mirror modes, compared against rbd.RBD().mirror_mode_get()."""
    NONE = 0
    IMAGE = 1


class ImageState(IntEnum):
    """Image mirror states, compared against the 'state' status field."""
    UP_REPLAYING = 4
    UP_STOPPED = 6


@dataclass
class ScanResult:
    """Counters and per-image results accumulated while scanning pools."""
    n_up: int = 0
    n_excluded_up: int = 0
    n_stopped: int = 0
    n_replaying: int = 0
    n_total: int = 0
    # Maps "pool/image" to its check status string.
    check_statuses: dict = field(default_factory=dict)
    # "pool/image" names that are neither excluded nor up.
    errors: set = field(default_factory=set)
    # "pool/image" names excluded by configuration or by creation age.
    excluded: set = field(default_factory=set)


def parse_args(argv=None):
    """Parse the command line (*argv*, or sys.argv[1:] when None)."""
    parser = argparse.ArgumentParser(
        description="Ceph RBD mirror health Nagios plugin."
    )
    parser.add_argument(
        '-c', '--conf',
        help="Alternative ceph conf file (default: %(default)s)",
        default='/etc/ceph/ceph.conf',
    )
    parser.add_argument(
        '-k', '--keyring',
        help="Ceph client keyring file",
    )
    parser.add_argument(
        '-n', '--name',
        help="Ceph client name",
    )
    parser.add_argument(
        '-i', '--id',
        help="Ceph client ID (default: %(default)s)",
        default='admin',
    )
    parser.add_argument(
        '-t', '--cutoff',
        help="Snapshot age cut-off seconds (default: %(default)s)",
        default=129600,  # 36 hours
        type=int,
    )
    parser.add_argument(
        '-a', '--age',
        help="Exclude mirror disabled images newer than age "
             "(default: %(default)s)",
        default=10800,  # 3 hours
        type=int,
    )
    parser.add_argument(
        '-w', '--warn-excluded',
        help="Show warning when excluded image is up",
        action='store_true',
    )
    parser.add_argument(
        '-s', '--show-excluded',
        help="Show excluded image statuses",
        action='store_true',
    )
    parser.add_argument(
        '-m', '--commands',
        help="Display commands to enable mirroring",
        action='store_true',
    )
    parser.add_argument(
        '-f', '--cconf',
        help="Check configuration file (default: %(default)s)",
        default='/etc/nagios/check_rbd_mirror.yaml',
    )
    return parser.parse_args(argv)


def load_config(path):
    """Read the optional YAML check configuration.

    Returns a ``(pool_exclude, exclude_regex)`` tuple: a set of pool names
    and a list of compiled patterns. Both are empty when the file does not
    exist or does not define the corresponding key.

    Raises ValueError when the file's top-level element is not a mapping.
    """
    conf = {
        'poolExclude': [],
        'excludeRegex': [],
    }
    if os.path.isfile(path):
        with open(path, encoding='utf-8') as f:
            file_conf = yaml.safe_load(f) or {}
        if not isinstance(file_conf, dict):
            raise ValueError(f"{path}: top-level element must be a mapping")
        conf = {**conf, **file_conf}

    # Keys may be present but empty (None) in the YAML file.
    pool_exclude = set(conf['poolExclude'] or [])
    exclude_regex = [re.compile(rx) for rx in conf['excludeRegex'] or []]
    return pool_exclude, exclude_regex


def connect_cluster(args):
    """Connect to the Ceph cluster; exit with CRITICAL on failure.

    The keyring path and client name default to values derived from
    ``args.id`` when not given explicitly.
    """
    try:
        keyring = args.keyring or f'/etc/ceph/ceph.client.{args.id}.keyring'
        name = args.name or f'client.{args.id}'
        cluster = rados.Rados(
            conffile=args.conf,
            name=name,
            conf=dict(
                keyring=keyring,
            )
        )
        cluster.connect()
    except Exception as e:
        print(f"CRITICAL - Cannot connect to Ceph cluster: {e}")
        sys.exit(Status.CRITICAL)
    return cluster


def classify_mirror_status(img_status, snap_cutoff):
    """Classify one mirror status entry.

    *img_status* is a status dict for a mirrored image; *snap_cutoff* is
    the oldest acceptable snapshot timestamp (epoch seconds).

    Returns ``(check_status, is_up)`` where *check_status* is one of
    'up+replaying', 'up+stopped', 'snap+desync', 'snap+old',
    'json+invalid', 'peer+empty' or 'state+unknown'.
    """
    state = img_status.get('state')
    if state == ImageState.UP_REPLAYING:
        return 'up+replaying', True
    if state != ImageState.UP_STOPPED:
        return 'state+unknown', False

    peers = img_status.get('remote_statuses', [])
    if not peers:
        return 'peer+empty', False

    # The JSON payload is whatever follows the first comma of the first
    # peer's description; without a comma the whole text is tried as JSON.
    description = peers[0].get('description') or ''
    desc_json_str = description.split(',', 1)[-1].strip()
    try:
        desc_json = json.loads(desc_json_str)
    except json.JSONDecodeError:
        return 'json+invalid', False
    # Valid JSON that is not an object carries no timestamps to inspect.
    if not isinstance(desc_json, dict):
        return 'json+invalid', False

    # Missing or null timestamps count as 0, i.e. older than any cut-off.
    local_stamp = desc_json.get('local_snapshot_timestamp') or 0
    remote_stamp = desc_json.get('remote_snapshot_timestamp') or 0

    if remote_stamp < snap_cutoff:
        return 'snap+desync', False
    if local_stamp < snap_cutoff:
        return 'snap+old', False
    return 'up+stopped', True


def created_since(ioctx, img_name, min_created):
    """Return True if the image was created at or after *min_created*."""
    image = rbd.Image(ioctx, img_name, read_only=True)
    try:
        return int(image.create_timestamp().timestamp()) >= min_created
    finally:
        image.close()


def scan_pool(cluster, pool, exclude_regex, now, args, result):
    """Check every image of *pool* and accumulate the outcome in *result*.

    Pools whose mirror mode is not IMAGE are skipped. Exits with CRITICAL
    when the pool cannot be opened or its images cannot be listed.
    """
    ioctx = None
    try:
        try:
            ioctx = cluster.open_ioctx(pool)
            rbd_inst = rbd.RBD()
            images = rbd_inst.list(ioctx)
        except Exception as e:
            print(f"CRITICAL - Error opening pool {pool}: {e}")
            sys.exit(Status.CRITICAL)

        if rbd_inst.mirror_mode_get(ioctx) != PoolModes.IMAGE:
            return

        img_statuses = {
            status['name']: status
            for status in rbd_inst.mirror_image_status_list(ioctx)
        }
        snap_cutoff = now - args.cutoff
        min_created = now - args.age

        for img_name in images:
            img_full_name = f'{pool}/{img_name}'
            excluded = any(rx.match(img_full_name) for rx in exclude_regex)

            state = None
            img_status = img_statuses.get(img_name)
            if img_status:
                state = img_status.get('state')
                check_status, is_up = classify_mirror_status(
                    img_status, snap_cutoff
                )
            else:
                # No status entry: mirroring is not enabled for this image.
                # Recently created images are tolerated (treated as
                # excluded) rather than reported as errors.
                check_status, is_up = 'mirror+disabled', False
                if not excluded:
                    excluded = created_since(ioctx, img_name, min_created)

            if excluded:
                result.excluded.add(img_full_name)
                if is_up:
                    result.n_excluded_up += 1
            else:
                result.n_total += 1
                if not is_up:
                    result.errors.add(img_full_name)
                else:
                    result.n_up += 1
                    if state == ImageState.UP_STOPPED:
                        result.n_stopped += 1
                    elif state == ImageState.UP_REPLAYING:
                        result.n_replaying += 1

            result.check_statuses[img_full_name] = check_status
    finally:
        # Runs on normal return, on exceptions and on sys.exit() alike.
        if ioctx is not None:
            ioctx.close()


def print_report(result, args):
    """Print the Nagios status line and details; return the exit status."""
    n_errors = len(result.errors)
    n_excluded = len(result.excluded)

    # Always emit the same labels in the same order so that perfdata
    # consumers see a stable set of metrics regardless of the check state.
    perf_data = (
        f'n_errors={n_errors} n_total={result.n_total} '
        f'n_up={result.n_up} n_replaying={result.n_replaying} '
        f'n_stopped={result.n_stopped} '
        f'n_excluded={n_excluded} n_excluded_up={result.n_excluded_up}'
    )

    if n_errors:
        status_text = f"{n_errors} of {result.n_total} images unsynchronized"
        exit_status = Status.CRITICAL
    else:
        status_text = f"All {result.n_up} images up"
        if result.n_excluded_up:
            status_text = (
                f"{status_text}, {result.n_excluded_up} excluded but up"
            )
        if result.n_excluded_up and args.warn_excluded:
            exit_status = Status.WARNING
        else:
            exit_status = Status.OK

    print(f"{exit_status.name} - {status_text} | {perf_data}")

    if n_errors:
        print('errors:')
        for img in sorted(result.errors):
            print(f"- {img} => {result.check_statuses[img]}")

    if n_excluded and (args.show_excluded or args.warn_excluded):
        print('excluded:')
        for img in sorted(result.excluded):
            print(f"- {img} => {result.check_statuses[img]}")

    return exit_status


def run(args):
    """Scan the cluster and report; return the process exit status."""
    pool_exclude, exclude_regex = load_config(args.cconf)
    now = int(time.time())

    cluster = connect_cluster(args)
    result = ScanResult()
    try:
        for pool in cluster.list_pools():
            if pool in pool_exclude:
                continue
            scan_pool(cluster, pool, exclude_regex, now, args, result)
    finally:
        cluster.shutdown()

    if args.commands:
        for img in sorted(result.errors):
            if result.check_statuses[img] == 'mirror+disabled':
                print(f'rbd mirror image enable "{img}" snapshot')
        return 0

    return print_report(result, args)


def main(argv=None):
    """Plugin entry point; returns the exit status.

    Any unexpected exception is turned into an UNKNOWN result: an uncaught
    traceback would make Python exit with code 1, which Nagios interprets
    as WARNING. sys.exit() calls made deeper in the call chain are not
    affected because SystemExit does not derive from Exception.
    """
    args = parse_args(argv)
    try:
        return run(args)
    except Exception as e:
        print(f"UNKNOWN - {type(e).__name__}: {e}")
        return Status.UNKNOWN


if __name__ == '__main__':
    sys.exit(int(main()))