#!/usr/bin/env python

'''
This Python Nagios plugin checks for any failed VM backups on SimpliVity OmniCube.
There's also a second mode in this plugin to display all VMs with a specific backup policy.

Python 2 is required with use of the libraries sys, os, optparse, time, datetime, pxssh
Normally you just need to install "sudo yum install pexpect.noarch" or "sudo apt-get install python-pexpect"

Copyright (c) 2015 www.usolved.net
Published under https://github.com/usolved/check_usolved_omnicube_backup

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

------------------------

v1.4
The XML output has an extra line when too much results are found with the svt-backup-show.
Added --max-results parameter to fix this.

v1.3
Added enhanced argument status:notstarted to check for not started backups
(OmniCube command svt-backup-show just includes started backups)
Bugfix for parameter -D. The --until argument is now being used for the OmniCube command
Timeout parameter is now also being used for the OmniCube commands and not just the SSH connect

v1.2
Changed policy check that you can list hosts with a specific policy name

v1.1
Bugfix for the backup status. When the retried backup succeeded you'll get an OK status
and an extended info which hosts needed more than one try

v1.0
Initial release
'''

import sys
import os
import optparse
import time
import datetime

# Older pexpect releases install a stand-alone "pxssh" module, newer ones only
# ship it as "pexpect.pxssh". If neither can be imported the failure is
# reported by ssh_connect() as a regular Nagios UNKNOWN result.
try:
    import pxssh
except ImportError:
    try:
        from pexpect import pxssh
    except ImportError:
        pxssh = None

try:
    import xml.etree.ElementTree as et
except ImportError:
    import elementtree.ElementTree as et


######################################################################
# Definitions of variables

#max results for svt-backup-show command
max_results = 10000

# Arrays for return codes and return message
return_code = {
    'OK': 0,
    'WARNING': 1,
    'CRITICAL': 2,
    'UNKNOWN': 3
}

return_msg = ''
return_perfdata = ''
hosts_excluded = []

# Runtime settings; main() overwrites them with the parsed command line values
arg_hostname = None
arg_username = None
arg_password = None
arg_mode = None
arg_policyname = None
arg_exclude = ''
arg_backupdate = 'yesterday'
arg_timeout = 45

# pxssh session shared by the ssh_*/get_* functions; set by main()
ssh = None


######################################################################
# Parse Arguments

parser = optparse.OptionParser()
parser.add_option('-H', '--hostname',
                  help='IP or hostname of the OmniCube host',
                  dest='arg_hostname', type='string')
parser.add_option('-U', '--username',
                  help=r'OmniCube SSH username (remember to escape the backslash in the username. For example domain\myuser would be "domain\\\myuser" or domain\\\\myuser as argument)',
                  dest='arg_username', type='string')
parser.add_option('-P', '--password',
                  help='OmniCube SSH password',
                  dest='arg_password', type='string')
parser.add_option('-M', '--mode',
                  help='Plugins mode (-M status (to check if all backups were successful), -M status:notstarted (to check if all backups were successful including not started backups) or -M policy (to check if all VMs have policies assigned))',
                  dest='arg_mode', type='string')
parser.add_option('-N', '--policyname',
                  help='Backup policy name',
                  dest='arg_policyname', type='string')
parser.add_option('-E', '--exclude',
                  help='Exclude comma separated hosts for policy check',
                  dest='arg_exclude', type='string', default='')
parser.add_option('-D', '--backupdate',
                  help='Without an argument the backups status check gets the status from yesterday. If you wish to check for other days, give the argument -D YYYY-MM-DD',
                  dest='arg_backupdate', type='string', default='yesterday')
parser.add_option('-T', '--timeout',
                  help='SSH and OmniCube command timeout in seconds',
                  dest='arg_timeout', type='int', default=45)


######################################################################
# Functions

def output_nagios(return_msg, return_perfdata, return_code):
    '''
    Print the plugin message and terminate with the given exit code.

    return_msg      -- text printed to stdout
    return_perfdata -- accepted for interface compatibility, not printed
    return_code     -- process exit code (see the return_code table above)
    '''
    print(return_msg)
    sys.exit(return_code)

#--------------------------------------------------------------------

def ssh_connect(hostname, username, password, timeout):
    '''
    Open a pxssh session and return it.

    If the pxssh module is not available or the login fails, the plugin
    exits with UNKNOWN through output_nagios() instead of returning.
    '''
    global return_msg

    if pxssh is None:
        return_msg = 'Unknown - Python module pxssh (part of pexpect) is not installed'
        output_nagios(return_msg, '', return_code['UNKNOWN'])

    try:
        session = pxssh.pxssh(timeout=timeout)
        session.login(hostname, username, password)
        return session
    except pxssh.ExceptionPxssh as e:
        return_msg = "Unknown - pxssh failed on login. "
        return_msg += str(e)
        output_nagios(return_msg, '', return_code['UNKNOWN'])

#--------------------------------------------------------------------

def ssh_logout():
    '''Close the shared pxssh session stored in the global "ssh".'''
    ssh.logout()

#--------------------------------------------------------------------

def _parse_excluded_hosts(exclude_arg):
    '''
    Turn the comma separated -E argument into a list of host patterns.

    Surrounding whitespace is removed and empty entries are dropped. An empty
    entry (for example from a trailing comma) would otherwise match every VM
    in the substring check of the status:notstarted mode.
    '''
    if not exclude_arg:
        return []
    return [entry.strip() for entry in exclude_arg.split(',') if entry.strip()]

#--------------------------------------------------------------------

def _backup_time_window(backupdate, today=None):
    '''
    Build the " --since ..." and " --until ..." command line fragments.

    backupdate -- "yesterday" or a date in the form YYYY-MM-DD
    today      -- datetime.date used as reference for "yesterday"
                  (defaults to the current local date)

    Returns a tuple (since_fragment, until_fragment). For "yesterday" the
    until fragment is an empty string. Raises ValueError when backupdate is
    neither "yesterday" nor a valid YYYY-MM-DD date.

    Calendar arithmetic is used instead of adding/subtracting 86400 seconds,
    because a local day is not always 24 hours long (daylight saving changes).
    '''
    one_day = datetime.timedelta(days=1)

    if backupdate == "yesterday":
        if today is None:
            today = datetime.date.today()
        return ' --since ' + (today - one_day).strftime('%Y-%m-%d'), ''

    requested_day = datetime.datetime.strptime(backupdate, "%Y-%m-%d").date()
    return (' --since ' + backupdate,
            ' --until ' + (requested_day + one_day).strftime('%Y-%m-%d'))

#--------------------------------------------------------------------

def _strip_echoed_command(data):
    '''
    Drop the first line of the captured output (the echoed command itself).

    Returns an empty string when the output contains no line break.
    '''
    return data.partition('\n')[2]

#--------------------------------------------------------------------

def get_failed_backups():
    '''
    Run svt-backup-show over the shared SSH session for the date selected
    with -D and return its XML output without the echoed command line.
    '''
    time_since_date, time_until_date = _backup_time_window(arg_backupdate)

    #strip out unnecessary xml tags to shrink the size
    # NOTE(review): in this copy of the file the sed alternation has nothing
    # between its \| separators; the XML element names it was meant to filter
    # appear to be missing. The literal is kept exactly as found - restore the
    # names from the upstream project before relying on the filter.
    ssh.sendline('svt-backup-show --output xml --max-results ' + str(max_results)
                 + ' --timeout ' + str(arg_timeout)
                 + time_since_date + time_until_date
                 + r' | sed "/\(\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\|\)/d"')
    ssh.prompt()

    return _strip_echoed_command(ssh.before)

#--------------------------------------------------------------------

def get_hosts_all():
    '''
    Run svt-vm-show over the shared SSH session and return its XML output
    without the echoed command line.
    '''
    ssh.sendline('svt-vm-show --timeout ' + str(arg_timeout) + ' --output xml')
    ssh.prompt()

    return _strip_echoed_command(ssh.before)

#--------------------------------------------------------------------

def get_failed_backups_status(hosts_failed_backups):
    '''
    Evaluate the svt-backup-show XML and store the result text in return_msg.

    hosts_failed_backups -- XML string as returned by get_failed_backups()

    Returns 0 when no backup failure is reported, 2 when at least one host
    has a failed (or, in mode status:notstarted, a not started) backup and
    3 when the XML data could not be evaluated.

    A failed entry (state 3) only counts as failure if no successful entry
    (state 4) for the same host was seen earlier in the XML; otherwise the
    host is only listed as "needed more than one try".
    '''
    global return_msg

    return_hosts = ''
    return_hosts_retried = ''
    backup_status = 0
    backup_success = set()
    hosts_backup_started = set()

    #--------------------------------------------------------------
    #list failed backups
    try:
        hosts_failed_backups_xml = et.fromstring(hosts_failed_backups)

        for child in hosts_failed_backups_xml.findall('Backup'):
            backup_state = int(child.find('state').text)
            backup_host = child.find('hiveName').text

            hosts_backup_started.add(backup_host)

            # if the state was successful, remember the host
            if backup_state == 4:
                backup_success.add(backup_host)

            # if backup failed go to this tree
            elif backup_state == 3:

                # if current host has not succeeded then mark it critical
                if backup_host not in backup_success:
                    failed_timestamp = int(child.find('timestamp').text)
                    failed_timestamp = datetime.datetime.fromtimestamp(failed_timestamp).strftime('%Y-%m-%d %H:%M')

                    return_hosts += backup_host + " (" + str(failed_timestamp) + "), "
                    backup_status = 2

                # just add hosts for informational reasons
                else:
                    return_hosts_retried += backup_host + ", "

    # Exception instead of a bare except, so that SystemExit and
    # KeyboardInterrupt are not swallowed
    except Exception:
        return_msg = 'Unknown - Returned XML data for backup VMs is not valid. For example a missing root element or too much data.'
        return 3

    #--------------------------------------------------------------
    #Get all VMs because svt-backup-show doesn't list not started backups
    if arg_mode == "status:notstarted":
        try:
            hosts_all_xml = et.fromstring(get_hosts_all())

            for child in hosts_all_xml.findall('VM'):
                current_host = child.find('platformName').text

                # substring match: the pattern 'restore' also excludes
                # a vm named 'host_restore_01'
                if any(pattern in current_host for pattern in hosts_excluded):
                    continue

                if current_host not in hosts_backup_started:
                    return_hosts += current_host + " (not started), "
                    backup_status = 2

        except Exception:
            return_msg = 'Unknown - Returned XML data for all VMs is not valid. For example a missing root element or too much data.'
            return 3

    #--------------------------------------------------------------
    #evaluate return values for output
    if backup_status == 0:
        return_msg = 'OK - All backups were successful'

        if return_hosts_retried:
            return_msg += '\nHosts with more than one try for successful backup:\n' + return_hosts_retried[:-2]

    else:
        return_hosts = return_hosts[:-2]  #delete the trailing ", "
        return_msg = 'Critical - Backup failed for ' + return_hosts

        # split long messages after 250 characters into a first line and
        # an extended part on the following line
        if len(return_msg) > 250:
            return_msg_normal = return_msg[:250]
            return_msg_extended = return_msg[250:]
            return_msg = return_msg_normal + '...\n...' + return_msg_extended + '\n'

        if return_hosts_retried:
            return_msg += '\nHosts with more than one try for successful backup:\n' + return_hosts_retried[:-2]

    return backup_status

#--------------------------------------------------------------------

def get_hosts_with_policy_status(hosts_with_policy):
    '''
    List all VMs that use the backup policy given with -N and store the
    result text in return_msg.

    hosts_with_policy -- XML string as returned by get_hosts_all()

    Hosts named in the exclude list are skipped (exact name match).
    Returns 0 when the XML could be evaluated and 3 when no policy name
    was given or the XML data is not valid.
    '''
    global return_msg

    if not arg_policyname:
        return_msg = 'Unknown - No policy name given. Please add argument -N'
        return 3

    try:
        hosts_with_policy_xml = et.fromstring(hosts_with_policy)

        hosts_found = []
        for child in hosts_with_policy_xml.findall('VM'):
            if child.find('policy').text != arg_policyname:
                continue

            current_host = child.find('platformName').text
            if current_host not in hosts_excluded:
                hosts_found.append(current_host)

        if not hosts_found:
            return_msg = 'No hosts found with backup policy "' + arg_policyname + '"'
        else:
            return_msg = 'Hosts with backup policy "' + arg_policyname + '": ' + ', '.join(hosts_found)

        return 0

    except Exception:
        return_msg = 'Unknown - Returned XML data is not valid'
        return 3

#--------------------------------------------------------------------

def _exit_with_status(status):
    '''Map a status of 0 / 2 / anything else to OK / CRITICAL / UNKNOWN and exit.'''
    if status == 0:
        output_nagios(return_msg, '', return_code['OK'])
    elif status == 2:
        output_nagios(return_msg, '', return_code['CRITICAL'])
    else:
        output_nagios(return_msg, '', return_code['UNKNOWN'])

#--------------------------------------------------------------------

def main():
    '''Parse the command line, run the selected mode and exit with its Nagios code.'''
    global arg_hostname, arg_username, arg_password, arg_mode
    global arg_policyname, arg_exclude, arg_backupdate, arg_timeout
    global hosts_excluded, return_msg, ssh

    (opts, args) = parser.parse_args()

    arg_hostname = opts.arg_hostname
    arg_username = opts.arg_username
    arg_password = opts.arg_password
    arg_mode = opts.arg_mode
    arg_policyname = opts.arg_policyname
    arg_exclude = opts.arg_exclude
    arg_backupdate = opts.arg_backupdate
    arg_timeout = opts.arg_timeout

    hosts_excluded = _parse_excluded_hosts(arg_exclude)

    if arg_mode == "status" or arg_mode == "status:notstarted":

        # validate -D before opening the SSH session
        try:
            _backup_time_window(arg_backupdate)
        except ValueError:
            return_msg = 'Unknown - Invalid backup date "' + arg_backupdate + '". Please use -D YYYY-MM-DD'
            output_nagios(return_msg, '', return_code['UNKNOWN'])

        ssh = ssh_connect(arg_hostname, arg_username, arg_password, arg_timeout)
        hosts_failed_backups = get_failed_backups()
        backup_status = get_failed_backups_status(hosts_failed_backups)
        ssh_logout()

        _exit_with_status(backup_status)

    elif arg_mode == "policy":

        ssh = ssh_connect(arg_hostname, arg_username, arg_password, arg_timeout)
        hosts_with_policy = get_hosts_all()
        no_policy_status = get_hosts_with_policy_status(hosts_with_policy)
        ssh_logout()

        _exit_with_status(no_policy_status)

    else:
        return_msg = 'Unknown - Please select a mode.\nType ./' + os.path.basename(__file__) + ' --help for all options.'
        output_nagios(return_msg, '', return_code['UNKNOWN'])


######################################################################
# General

if __name__ == '__main__':
    try:
        main()
    # An uncaught exception would end the process with exit code 1, which
    # Nagios interprets as WARNING; report it as UNKNOWN instead.
    # sys.exit() raises SystemExit, which is not caught here.
    except Exception as e:
        print('Unknown - Unexpected error: ' + str(e))
        sys.exit(return_code['UNKNOWN'])