# vim: tabstop=4 expandtab shiftwidth=4 autoindent import INN import datetime import logging import logging.handlers import os import os.path import re import shelve import sys import time import traceback # In Python 2.4, utils was called Utils. try: from email.utils import parseaddr except ImportError: from email.Utils import parseaddr # Python 2.4 doesn't have hashlib. try: from hashlib import md5 except ImportError: from md5 import md5 try: import configparser except ImportError: import ConfigParser configparser=ConfigParser try: from sys import intern except ImportError: pass # Keep compatibility with both Python 2 and 3, as they handle encoding in # unicode strings differently. def mem2str(string): if string is None: return '' if sys.version_info[0] > 2: return string.tobytes().decode('utf-8', 'backslashreplace') else: return str(string) def decodedStr(string): if sys.version_info[0] > 2: return string.decode('utf-8', 'backslashreplace') else: return string def encodedStr(string): if sys.version_info[0] > 2: return string.encode('utf-8', 'backslashreplace') else: return string # Define some high-level date/time functions. def now(): return datetime.datetime.utcnow() # return datetime.datetime.now() def timestamp(stamp): return stamp.strftime("%Y-%m-%d %H:%M:%S") def dateobj(datestr): """Take a string formated date (yyyymmdd) and return a datetime object. """ # Python 2.4 compatibility. return datetime.datetime(*(time.strptime(datestr, '%Y%m%d')[0:6])) # return datetime.datetime.strptime(datestr, '%Y%m%d') def nowstamp(): """A shortcut function to return a textual representation of now. """ return timestamp(now()) def last_midnight(): return now().replace(hour=0, minute=0, second=0, microsecond=0) def next_midnight(): """Return a datetime object relating to the next midnight. """ return last_midnight() + datetime.timedelta(days=1) def future(days=0, hours=0, mins=0, secs=0): return now() + datetime.timedelta(days=days, hours=hours, minutes=mins, seconds=secs) # ----- This section is concerned with setting up a default configuration. def makedir(d): """Check if a given directory exists. If it doesn't, check if the parent exists. If it does, then the new directory will be created. If not, then sensible options are exhausted and the program aborts. """ if not os.path.isdir(d): parent = os.path.dirname(d) if os.path.isdir(parent): os.mkdir(d, 0o700) sys.stdout.write("%s: Directory created.\n" % d) else: msg = "%s: Unable to make directory. Aborting.\n" % d sys.stdout.write(msg) sys.exit(1) def init_config(): # Configure the Config Parser. config = configparser.RawConfigParser() # Logging config.add_section('logging') config.set('logging', 'level', 'info') config.set('logging', 'format', '%(asctime)s %(levelname)s %(message)s') config.set('logging', 'datefmt', '%Y-%m-%d %H:%M:%S') config.set('logging', 'retain', 7) config.set('logging', 'logart_maxlines', 20) # Binary config.add_section('binary') config.set('binary', 'lines_allowed', 15) config.set('binary', 'allow_pgp', 'true') config.set('binary', 'reject_suspected', 'false') config.set('binary', 'fasttrack_references', 'true') # EMP config.add_section('emp') config.set('emp', 'ph_coarse', 'true') config.set('emp', 'body_threshold', 5) config.set('emp', 'body_ceiling', 85) config.set('emp', 'body_maxentries', 5000) config.set('emp', 'body_timed_trim', 3600) config.set('emp', 'body_fuzzy', 'true') config.set('emp', 'phn_threshold', 100) config.set('emp', 'phn_ceiling', 150) config.set('emp', 'phn_maxentries', 5000) config.set('emp', 'phn_timed_trim', 1800) config.set('emp', 'lphn_threshold', 20) config.set('emp', 'lphn_ceiling', 30) config.set('emp', 'lphn_maxentries', 200) config.set('emp', 'lphn_timed_trim', 1800) config.set('emp', 'phl_threshold', 20) config.set('emp', 'phl_ceiling', 80) config.set('emp', 'phl_maxentries', 5000) config.set('emp', 'phl_timed_trim', 3600) config.set('emp', 'fsl_threshold', 20) config.set('emp', 'fsl_ceiling', 40) config.set('emp', 'fsl_maxentries', 5000) config.set('emp', 'fsl_timed_trim', 3600) config.set('emp', 'ihn_threshold', 10) config.set('emp', 'ihn_ceiling', 15) config.set('emp', 'ihn_maxentries', 1000) config.set('emp', 'ihn_timed_trim', 7200) config.add_section('groups') config.set('groups', 'max_crosspost', 10) config.set('groups', 'max_low_crosspost', 3) config.add_section('control') config.set('control', 'reject_cancels', 'false') config.set('control', 'reject_redundant', 'true') config.add_section('filters') config.set('filters', 'newsguy', 'true') config.set('filters', 'reject_html', 'true') config.set('filters', 'reject_multipart', 'false') config.add_section('hostnames') config.set('hostnames', 'path_hostname', 'true') # The path section is a bit tricky. First off, we try to read a default # config file. This can define the path to everything, including the # pyclean.cfg config file. For this reason, all the path entries that # could generate directories need to come after the config files have # been read. config.add_section('paths') # In accordance with Debian standards, we'll look for # /etc/default/pyclean. This file can define the path for pyclean's # etc, which includes the pyclean.cfg file. The location of the # default file can be overridden by setting the 'PYCLEAN' environment # variable. if 'PYCLEAN' in os.environ: default = os.environ['PYCLEAN'] else: default = os.path.join('/', 'etc', 'default', 'pyclean') if os.path.isfile(default): config.read(default) # By default, all the paths are subdirectories of the homedir. homedir = os.path.expanduser('~') # Define the basedir for pyclean. By default this will be ~/pyclean. basedir = os.path.join(homedir, 'pyclean') # If the default file hasn't specified an etc path, we need to assume a # default. Usually /usr/local/news/pyclean/etc. if not config.has_option('paths', 'etc'): config.set('paths', 'etc', os.path.join(basedir, 'etc')) # At this point, we know the basedir is going to be required so we # attempt to create it. makedir(basedir) makedir(config.get('paths', 'etc')) # Under all circumstances, we now have an etc path. Now to check # if the config file exists and if so, read it. configfile = os.path.join(config.get('paths', 'etc'), 'pyclean.cfg') if os.path.isfile(configfile): config.read(configfile) if not config.has_option('paths', 'log'): config.set('paths', 'log', os.path.join(basedir, 'log')) # As with the etc section above, we know basedir is required now. # No harm in trying to create it multiple times. makedir(basedir) makedir(config.get('paths', 'log')) if not config.has_option('paths', 'logart'): config.set('paths', 'logart', os.path.join(basedir, 'articles')) makedir(config.get('paths', 'logart')) if not config.has_option('paths', 'lib'): config.set('paths', 'lib', os.path.join(basedir, 'lib')) makedir(config.get('paths', 'lib')) # The following lines can be uncommented in order to write a config # file. This is useful for creating an example file. # with open('example.cfg', 'wb') as configfile: # config.write(configfile) return config class InndFilter: """Provide filtering callbacks to innd. """ def __init__(self): """This runs every time the filter is loaded or reloaded. This is a good place to initialize variables and precompile regular expressions, or maybe reload stats from disk. """ self.traceback_loop = 0 try: self.pyfilter = Filter() except: fn = os.path.join(config.get('paths', 'log'), 'init_traceback') f = open(fn, 'a') traceback.print_exc(file=f) f.close() def filter_before_reload(self): """Runs just before the filter gets reloaded. You can use this method to save state information to be restored by the __init__() method or down in the main module. """ try: self.pyfilter.closetasks() logging.info("Re-reading config file") global config config = init_config() except: fn = os.path.join(config.get('paths', 'log'), 'close_traceback') f = open(fn, 'a') traceback.print_exc(file=f) f.close() return "" def filter_close(self): """Runs when innd exits. You can use this method to save state information to be restored by the __init__() method or down in the main module. """ try: self.pyfilter.closetasks() except: fn = os.path.join(config.get('paths', 'log'), 'close_traceback') f = open(fn, 'a') traceback.print_exc(file=f) f.close() return "" def filter_messageid(self, msgid): """Filter articles just by their Message-IDs. This method interacts with the CHECK, IHAVE and TAKETHIS NNTP commands. If you return a non-empty string here, the offered article will be refused before you ever have to waste any bandwidth looking at it (unless TAKETHIS is used before an earlier CHECK). Make sure that such a message is properly encoded in UTF-8 so as to comply with the NNTP protocol. """ return "" def filter_art(self, art): """Decide whether to keep offered articles. art is a dictionary with a bunch of header fields, the article's body, and innd's reckoning of the line count. Items not in the article will have a value of None. The available header fields are the ones listed near the top of innd/art.c. At this writing, they are: Also-Control, Approved, Archive, Archived-At, Bytes, Cancel-Key, Cancel-Lock, Comments, Content-Base, Content-Disposition, Content-Transfer-Encoding, Content-Type, Control, Date, Date-Received, Distribution, Expires, Face, Followup-To, From, In-Reply-To, Injection-Date, Injection-Info, Jabber-ID, Keywords, Lines, List-ID, Message-ID, MIME-Version, Newsgroups, NNTP-Posting-Date, NNTP-Posting-Host, NNTP-Posting-Path, Organization, Original-Sender, Originator, Path, Posted, Posting-Version, Received, References, Relay-Version, Reply-To, Sender, Subject, Summary, Supersedes, User-Agent, X-Auth, X-Auth-Sender, X-Canceled-By, X-Cancelled-By, X-Complaints-To, X-Face, X-HTTP-UserAgent, X-HTTP-Via, X-Mailer, X-Modbot, X-Modtrace, X-Newsposter, X-Newsreader, X-No-Archive, X-Original-Message-ID, X-Original-NNTP-Posting-Host, X-Original-Trace, X-Originating-IP, X-PGP-Key, X-PGP-Sig, X-Poster-Trace, X-Postfilter, X-Proxy-User, X-Submissions-To, X-Trace, X-Usenet-Provider, X-User-ID, Xref. The body is the buffer in art[__BODY__] and the INN-reckoned line count is held as an integer in art[__LINES__]. (The Lines header field is often generated by the poster, and large differences can be a good indication of a corrupt article.) If you want to keep an article, return None or "". If you want to reject, return a non-empty string. The rejection string will appear in transfer and posting response banners, and local posters will see them if their messages are rejected (make sure that such a response is properly encoded in UTF-8 so as to comply with the NNTP protocol). """ try: return self.pyfilter.filter(art) except: if not self.traceback_loop: fn = os.path.join(config.get('paths', 'log'), 'traceback') f = open(fn, 'a') traceback.print_exc(file=f) f.close() self.traceback_loop = 1 return "" def filter_mode(self, oldmode, newmode, reason): """Capture server events and do something useful. When the admin throttles or pauses innd (and lets it go again), this method will be called. oldmode is the state we just left, and newmode is where we are going. reason is usually just a comment string. The possible values of newmode and oldmode are the five strings "running", "paused", "throttled", "shutdown" and "unknown". Actually "unknown" shouldn't happen; it's there in case feeping creatures invade innd. """ INN.syslog('n', 'state change from %s to %s - %s' % (oldmode, newmode, reason)) class Binary: """Perform binary content checking of articles. """ def __init__(self): # Binaries self.regex_yenc = re.compile('^=ybegin.*', re.M) self.regex_uuenc = re.compile('^begin[ \t]+\d{3,4}[ \t]+\w+\.\w', re.M) self.regex_base64 = re.compile('[a-zA-Z0-9+/]{59}') self.regex_numeric = re.compile('[0-9]{59}') self.regex_binary = re.compile('[ \t]*\S{40}') # Feedhosts keeps a tally of how many binary articles are received # from each upstream peer. self.feedhosts = {} self.tagged = 0 def increment(self, pathhost): """Increment feedhosts. """ if pathhost in self.feedhosts: self.feedhosts[pathhost] += 1 else: self.feedhosts[pathhost] = 1 def report(self): fn = os.path.join(config.get('paths', 'log'), 'binfeeds') f = open(fn, 'w') f.write('# Binary feeders report - %s\n\n' % nowstamp()) for e in self.feedhosts.keys(): f.write('%s: %s\n' % (e, self.feedhosts[e])) f.close() self.feedhosts = {} def isbin(self, art): """The primary function of the Binary class. An article's body is compared against a number of checks. If the conclusion is that the payload is binary, the type of binary is returned. Non-binary content will return False. """ # Ignore base64 encoded content. if (art[Content_Transfer_Encoding] is not None and 'base64' in mem2str(art[Content_Transfer_Encoding]).lower()): return False if self.regex_uuenc.search(mem2str(art[__BODY__])): return 'uuEnc' yenc = self.regex_yenc.search(mem2str(art[__BODY__])) if yenc: # Extract the matching line l = yenc.group(0) if 'line=' in l and 'size=' in l and 'name=' in l: return 'yEnc' # Avoid costly checks where articles are shorter than the allowed # number of binary lines. if int(art[__LINES__]) < config.getint('binary', 'lines_allowed'): return False # Also avoid these costly checks where a References header field # is present. skip_refs = (art[References] is not None and mem2str(art[References]).startswith('<') and config.getboolean('binary', 'fasttrack_references') and int(art[__LINES__]) > 500) if skip_refs: return False # Base64 and suspect binary matching. b64match = 0 suspect = 0 for line in mem2str(art[__BODY__]).split('\n'): skip_pgp = (line.startswith('-----BEGIN PGP') and config.getboolean('binary', 'allow_pgp')) if skip_pgp: break if line == "-- ": # Don't include signatures in binary testing. break # Resetting the next counter to zero on a non-matching line # dictates the counted binary lines must be consecutive. We also # test that a numeric line doesn't trigger a Base64 match. if (self.regex_base64.match(line) and not self.regex_numeric.match(line)): b64match += 1 else: b64match = 0 if self.regex_binary.match(line): suspect += 1 else: suspect = 0 if b64match > config.get('binary', 'lines_allowed'): return 'base64' if suspect > config.get('binary', 'lines_allowed'): return 'binary' return False class Filter: def __init__(self): """This runs every time the filter is loaded or reloaded. This is a good place to initialize variables and precompile regular expressions, or maybe reload stats from disk. """ # Initialize Group Analyzer. self.groups = Groups() # Initialize Binary Filters. self.binary = Binary() # Posting Host and Posting Account. self.regex_ph = re.compile('posting-host *= *"?([^";]+)') self.regex_pa = re.compile('posting-account *= *"?([^";]+)') # Match lines in regex_files formated /regex/ timestamp(YYYYMMDD). self.regex_fmt = re.compile('/(.+)/[ \t]+(\d{8})') # A dictionary of files containing regexs that need to be reloaded and # compiled if the timestamp on them changes. The dict content is the # timestamp (initially zeroed). regex_file_list = [ 'bad_body', 'bad_cp_groups', 'bad_crosspost_host', 'bad_from', 'bad_groups', 'bad_groups_dizum', 'bad_posthost', 'bad_subject', 'good_posthost', 'ihn_hosts', 'local_bad_body', 'local_bad_cp_groups', 'local_bad_from', 'local_bad_groups', 'local_bad_subject', 'local_hosts', 'log_from'] # Each regex_files key contains a timestamp of last-modified time. # Setting all keys to zero ensures they are processed on first run. regex_files = dict((f, 0) for f in regex_file_list) # Python >= 2.7 has dict comprehension but not earlier versions # regex_files = {f: 0 for f in regex_file_list} self.regex_files = regex_files # A dict of the regexs compiled from the regex_files defined above. self.etc_re = {} # Hostname - Not a 100% perfect regex but probably good enough. self.regex_hostname = re.compile('([a-zA-Z0-9]|[a-zA-Z0-9]' '[a-zA-Z0-9\-]+[a-zA-Z0-9])' '(\.[a-zA-Z0-9\-]+)+') # Path replacement regexs. self.regex_pathhost = re.compile('(![^\.]+)+$') # Strip RH non-FQDNs. # Match email addresses. self.regex_email = \ re.compile('([\w\-][\w\-\.]*)@[\w\-][\w\-\.]+[a-zA-Z]{1,4}') # Colon/Space separated fields. self.regex_fields = re.compile('[ \t]*([^:]+):[ \t]+(\S+)') # Content-Type: text/plain; charset=utf-8 self.regex_ct = re.compile("\s*([^;]+)") self.regex_ctcs = re.compile('charset="?([^"\s;]+)') # Symbol matching for ratio-based rejects. self.regex_symbols = re.compile("\\_/ |_\|_|[a-z]\.{2,}[a-z]") # Match lines that start with a CR. self.regex_crspace = re.compile("^\r ", re.MULTILINE) # Redundant control message types. self.redundant_controls = ['sendsys', 'senduuname', 'version', 'whogets'] # Set up the EMP filters. self.emp_body = EMP(name='emp_body', threshold=config.getint('emp', 'body_threshold'), ceiling=config.getint('emp', 'body_ceiling'), maxentries=config.getint('emp', 'body_maxentries'), timedtrim=config.getint('emp', 'body_timed_trim'), dofuzzy=config.getboolean('emp', 'body_fuzzy')) self.emp_phn = EMP(name='emp_phn', threshold=config.getint('emp', 'phn_threshold'), ceiling=config.getint('emp', 'phn_ceiling'), maxentries=config.getint('emp', 'phn_maxentries'), timedtrim=config.getint('emp', 'phn_timed_trim')) self.emp_lphn = EMP(name='emp_lphn', threshold=config.getint('emp', 'lphn_threshold'), ceiling=config.getint('emp', 'lphn_ceiling'), maxentries=config.getint('emp', 'lphn_maxentries'), timedtrim=config.getint('emp', 'lphn_timed_trim')) self.emp_phl = EMP(name='emp_phl', threshold=config.getint('emp', 'phl_threshold'), ceiling=config.getint('emp', 'phl_ceiling'), maxentries=config.getint('emp', 'phl_maxentries'), timedtrim=config.getint('emp', 'phl_timed_trim')) self.emp_fsl = EMP(name='emp_fsl', threshold=config.getint('emp', 'fsl_threshold'), ceiling=config.getint('emp', 'fsl_ceiling'), maxentries=config.getint('emp', 'fsl_maxentries'), timedtrim=config.getint('emp', 'fsl_timed_trim')) self.emp_ihn = EMP(name='emp_ihn', threshold=config.getint('emp', 'ihn_threshold'), ceiling=config.getint('emp', 'ihn_ceiling'), maxentries=config.getint('emp', 'ihn_maxentries'), timedtrim=config.getint('emp', 'ihn_timed_trim')) # Initialize timed events. self.hourly_events(startup=True) # Set a datetime object for next midnight. self.midnight_trigger = next_midnight() def filter(self, art): # Initialize the posting info dict. post = {} # Trigger timed reloads. if now() > self.hourly_trigger: self.hourly_events() if now() > self.midnight_trigger: self.midnight_events() # Attempt to split the From address into component parts. if art[From] is not None: post['from_name'], \ post['from_email'] = self.addressParse(mem2str(art[From])) if art[Content_Type] is not None: ct = self.regex_ct.match(mem2str(art[Content_Type])) if ct: post['content_type'] = ct.group(1).lower() ctcs = self.regex_ctcs.search(mem2str(art[Content_Type])) if ctcs: post['charset'] = ctcs.group(1).lower() # Try to establish the injection-host, posting-host and # posting-account. if art[Injection_Info] is not None: # Establish Posting Account. ispa = self.regex_pa.search(mem2str(art[Injection_Info])) if ispa: post['posting-account'] = ispa.group(1) # Establish Posting Host. isph = self.regex_ph.search(mem2str(art[Injection_Info])) if isph: post['posting-host'] = isph.group(1) # Establish Injection Host. isih = self.regex_hostname.match(mem2str(art[Injection_Info])) if isih: post['injection-host'] = isih.group(0) # posting-host might be obtainable from NNTP-Posting-Host. if 'posting-host' not in post and art[NNTP_Posting_Host] is not None: post['posting-host'] = mem2str(art[NNTP_Posting_Host]) # If the injection-host wasn't found in Injection-Info, try the X-Trace # header field. We only look for a hostname as the first field in # X-Trace, otherwise it's regex hell. if 'injection-host' not in post and art[X_Trace] is not None: isih = self.regex_hostname.match(mem2str(art[X_Trace])) if isih: post['injection-host'] = isih.group(0) # Try to extract a hostname from the Path header field. if config.getboolean('hostnames', 'path_hostname'): # First, check for a !.POSTED tag, as per RFC5537. if 'injection-host' not in post and "!.POSTED" in mem2str( art[Path] ): postsplit = mem2str(art[Path]).split("!.POSTED", 1) pathhost = postsplit[0].split("!")[-1] if pathhost: post['injection-host'] = pathhost # Last resort, try the right-most entry in the Path header field. if 'injection-host' not in post: subhost = re.sub(self.regex_pathhost, '', mem2str(art[Path])) pathhost = subhost.split("!")[-1] if pathhost: post['injection-host'] = pathhost # Some services (like Google) use dozens of Injection Hostnames. # This section looks for substring matches and replaces the entire # Injection-Host with the substring. if 'injection-host' in post: for ihsub in self.ihsubs: if ihsub in post['injection-host']: logging.debug("Injection-Host: Replacing %s with %s", post['injection-host'], ihsub) post['injection-host'] = ihsub # Ascertain if the posting-host is meaningful. if 'posting-host' in post: isbad_ph = self.groups.regex.bad_ph.search(post['posting-host']) if isbad_ph: post['bad-posting-host'] = isbad_ph.group(0) logging.debug('Bad posting host: %s', post['bad-posting-host']) # Dizum deserves a scalar all to itself! dizum = False if ('injection-host' in post and post['injection-host'] == 'sewer.dizum.com'): dizum = True # The host that fed us this article is first in the Path header field. post['feed-host'] = mem2str(art[Path]).split('!', 1)[0] # Analyze the Newsgroups header field. self.groups.analyze(art[Newsgroups], art[Followup_To]) # Is the source of the post considered local? local = False if ('injection-host' in post and 'local_hosts' in self.etc_re and self.etc_re['local_hosts'].search(post['injection-host'])): local = True # --- Everything below is accept / reject code --- # Reject any messages that don't have a Message-ID. if art[Message_ID] is None: logging.warn("Wot no Message-ID! Rejecting message because the " "implications of accepting it are unpredictable.") return self.reject(art, post, "No Message-ID header") # We use Message-ID strings so much, it's useful to have a shortcut. mid = mem2str(art[Message_ID]) # Now we're convinced we have a MID, log it for local posts. if local: logging.debug("Local post: %s", mid) # Control message handling. if art[Control] is not None: ctrltype = mem2str(art[Control]).split(" ", 1)[0] # Reject control messages with a Supersedes header field. if art[Supersedes] is not None: return self.reject( art, post, 'Control %s with Supersedes header field' % ctrltype) if (ctrltype == 'cancel' and config.getboolean('control', 'reject_cancels')): return self.reject(art, post, "Control cancel") elif (ctrltype in self.redundant_controls and config.getboolean('control', 'reject_redundant')): return self.reject( art, post, "Redundant Control Type: %s" % ctrltype) else: logging.info('Control: %s, mid=%s' % (mem2str(art[Control]), mid)) return '' # No followups is a reject as is more than 2 groups. if self.groups['futcount'] < 1 or self.groups['futcount'] > 2: # Max-crosspost check. if self.groups['count'] > config.get('groups', 'max_crosspost'): return self.reject(art, post, "Crosspost Limit Exceeded") # Max low crosspost check. if self.groups['count'] > config.get('groups', 'max_low_crosspost'): if self.groups['lowcp'] > 0: return self.reject(art, post, "Crosspost Low Limit Exceeded") # Lines check. if art[Lines] and int(art[Lines]) != int(art[__LINES__]): logmes = "Lines Mismatch: Header=%s, INN=%s, mid=%s" if art[User_Agent] is not None: logmes += ", Agent=%s" logging.debug(logmes % (mem2str(art[Lines]), str(art[__LINES__]), mid, mem2str(art[User_Agent]))) else: logging.debug(logmes % (mem2str(art[Lines]), str(art[__LINES__]), mid)) # Newsguy are evil sex spammers. if ('newsguy.com' in mid and config.getboolean('filters', 'newsguy') and 'sex_groups' in self.groups and self.groups['sex_groups'] > 0): return self.reject(art, post, "Newsguy Sex") # For some reason, this OS2 group has become kook central. if ('comp.os.os2.advocacy' in self.groups['groups'] and self.groups['count'] > 1): return self.reject(art, post, "OS2 Crosspost") if (art[Followup_To] and 'comp.os.os2.advocacy' in mem2str(art[Followup_To])): return self.reject(art, post, "OS2 Followup") # Poor snipe is getting the Greg Hall treatment. if 'injection-host' in post: if post['injection-host'].startswith( "snipe.eternal-september.org"): pass # self.logart("Snipe Post", art, post, 'log_snipe') else: if ("sn!pe" in post['from_name'] or "snipeco" in post['from_email']): return self.reject(art, post, "Snipe Forge") # Compare headers against regex files. # Check if posting-host is whitelisted. gph = False if('posting-host' in post and 'good_posthost' in self.etc_re): gph = self.etc_re['good_posthost'].search(post['posting-host']) if gph: logging.info("Whitelisted posting. host=%s, msgid=%s", post['posting-host'], mem2str(art[Message_ID])) # Reject these posting-hosts. if ('posting-host' in post and not gph and 'bad_posting-host' not in post and 'bad_posthost' in self.etc_re): bph = self.etc_re['bad_posthost'].search(post['posting-host']) if bph: return self.reject( art, post, "Bad Posting-Host (%s)" % bph.group(0)) # Test posting-hosts that are not allowed to crosspost. if ('posting-host' in post and not gph and self.groups['count'] > 1 and 'bad_crosspost_host' in self.etc_re): ph = post['posting-host'] bph = self.etc_re['bad_crosspost_host'].search(ph) if bph: return self.reject( art, post, "Bad Crosspost Host (%s)" % bph.group(0)) # Groups where crossposting is not allowed. if (self.groups['count'] > 1 and not gph and 'bad_cp_groups' in self.etc_re): bcg = self.etc_re['bad_cp_groups'].search(mem2str(art[Newsgroups])) if bcg: return self.reject( art, post, "Bad Crosspost Group (%s)" % bcg.group(0)) if 'log_from' in self.etc_re: lf_result = self.etc_re['log_from'].search(mem2str(art[From])) if lf_result: self.logart(lf_result.group(0), art, post, 'log_from', trim=False) if 'bad_groups' in self.etc_re and not gph: bg_result = self.etc_re['bad_groups'].search( mem2str(art[Newsgroups]) ) if bg_result: return self.reject( art, post, "Bad Group (%s)" % bg_result.group(0)) if dizum and 'bad_groups_dizum' in self.etc_re: bgd = self.etc_re['bad_groups_dizum'].search( mem2str(art[Newsgroups]) ) if bgd: return self.reject( art, post, "Bad Dizum Group (%s)" % bgd.group(0)) # AUK bad crossposts. #if self.groups['kooks'] > 0: # if ('alt.free.newsservers' in self.groups['groups'] or # 'alt.privacy.anon-server' in self.groups['groups']): # return self.reject(art, post, "AUK Bad Crosspost") if 'bad_from' in self.etc_re and not gph: bf_result = self.etc_re['bad_from'].search(mem2str(art[From])) if bf_result: return self.reject( art, post, "Bad From (%s)" % bf_result.group(0)) # Bad subject checking (Currently only on Dizum posts). if dizum and 'bad_subject' in self.etc_re and not gph: bs_result = self.etc_re['bad_subject'].search( mem2str(art[Subject]) ) if bs_result: return self.reject( art, post, "Bad Subject (%s)" % bs_result.group(0)) if 'bad_body' in self.etc_re and not gph: bb_result = self.etc_re['bad_body'].search(mem2str(art[__BODY__])) if bb_result: return self.reject( art, post, "Bad Body (%s)" % bb_result.group(0), "Bad Body") # The following checks are for locally posted articles. # Groups where crossposting is not allowed. if (local and not gph and self.groups['count'] > 1 and 'local_bad_cp_groups' in self.etc_re): b = self.etc_re['local_bad_cp_groups'].search( mem2str(art[Newsgroups]) ) if b: return self.reject( art, post, "Local Bad Crosspost Group (%s)" % b.group(0)) # Local Bad From. if local and not gph and 'local_bad_from' in self.etc_re: reg = self.etc_re['local_bad_from'] bf_result = reg.search(mem2str(art[From])) if bf_result: return self.reject( art, post, "Local Bad From (%s)" % bf_result.group(0), "Local Reject") # Local Bad Subject. if local and not gph and 'local_bad_subject' in self.etc_re: reg = self.etc_re['local_bad_subject'] bs_result = reg.search(mem2str(art[Subject])) if bs_result: return self.reject( art, post, "Local Bad Subject (%s)" % bs_result.group(0), "Local Reject") # Local Bad Groups. if local and not gph and 'local_bad_groups' in self.etc_re: reg = self.etc_re['local_bad_groups'] bg_result = reg.search(mem2str(art[Newsgroups])) if bg_result: return self.reject( art, post, "Local Bad Group (%s)" % bg_result.group(0)) # Local Bad Body. if local and not gph and 'local_bad_body' in self.etc_re: reg = self.etc_re['local_bad_body'] bb_result = reg.search(mem2str(art[__BODY__])) if bb_result: return self.reject( art, post, "Local Bad Body (%s)" % bb_result.group(0), "Local Reject") # Misplaced binary check. if self.groups['bin_allowed_bool']: # All groups in the post match bin_allowed groups. isbin = False else: # Potentially expensive check if article contains binary. isbin = self.binary.isbin(art) # Generic 'binary' means it looks binary-like but doesn't match any # known encoding method. if isbin == 'binary': if config.getboolean('binary', 'reject_suspected'): return self.reject( art, post, "Binary (%s)" % isbin) else: self.logart("Binary Suspect", art, post, "bin_suspect", trim=False) elif isbin: self.binary.increment(post['feed-host']) return self.reject( art, post, "Binary (%s)" % isbin) # Misplaced HTML check. if (not self.groups['html_allowed_bool'] and config.getboolean('filters', 'reject_html') and 'content_type' in post): if 'text/html' in post['content_type']: return self.reject(art, post, "HTML Misplaced") if 'multipart' in post['content_type']: if config.getboolean('filters', 'reject_multipart'): return self.reject(art, post, "MIME Multpart") else: logging.debug('Multipart: %s' % mid) # Symbol ratio test. symlen = len(self.regex_symbols.findall(mem2str(art[__BODY__]))) if symlen > 100: return self.reject(art, post, "Symbols (%s)" % symlen) # Start of EMP checks. if (not self.groups['emp_exclude_bool'] and not self.groups['test_bool']): ngs = ','.join(self.groups['groups']) # If a substring matches the Newsgroups header field, use just that # substring as EMP fodder where a Newsgroups name is normally used. for ngsub in self.ngsubs: if ngsub in ngs: logging.debug("Newsgroup substring match: %s", ngsub) ngs = ngsub break # Start of posting-host based checks. # First try and seed some filter fodder. if 'posting-account' in post: # If a Posting-Account is known, it makes better filter fodder # than the hostname/address which could be dynamic. fodder = post['posting-account'] elif 'bad-posting-host' in post: # If we can't trust the info in posting-host, use the # injection-host. This is a worst-case scenario. if ('injection-host' in post and config.getboolean('emp', 'ph_coarse')): fodder = post['injection-host'] else: fodder = None elif 'posting-host' in post: fodder = post['posting-host'] else: fodder = None if fodder: # Beginning of PHN filters. if 'moderated' in self.groups and self.groups['moderated']: logging.debug("Bypassing PHN filter due to moderated " "group in distribution") elif local: # Beginning of PHN_Local filter. do_lphn = True if self.groups['phn_exclude_bool']: do_lphn = False if (not do_lphn and art[References] is None and art[Subject] is not None and mem2str(art[Subject]).startswith("Re:")): logging.info("emp_lphn: Exclude overridden - " "Subject Re but no Reference") do_lphn = True if (not do_lphn and self.regex_crspace.search(mem2str(art[__BODY__]))): logging.info("emp_lphn: Exclude overridden - " "Carriage Return starts line") do_lphn = True if do_lphn and self.emp_lphn.add(fodder + ngs): return self.reject(art, post, "EMP Local PHN Reject") else: # Beginning of standard PHN filter. if self.groups['phn_exclude_bool']: logging.debug("emp_phn exclude for: %s", mem2str(art[Newsgroups])) elif self.emp_phn.add(fodder + ngs): return self.reject(art, post, "EMP PHN Reject") # Beginning of PHL filter. if self.emp_phl.add(fodder + str(art[__LINES__])): return self.reject(art, post, "EMP PHL Reject") # Beginning of FSL filter. fsl = mem2str(art[From]) + mem2str(art[Subject]) \ + str(art[__LINES__]) if self.emp_fsl.add(fsl): return self.reject(art, post, "EMP FSL Reject") # Beginning of IHN filter. if ('injection-host' in post and 'ihn_hosts' in self.etc_re and not self.groups['ihn_exclude_bool']): ihn_result = self.etc_re['ihn_hosts']. \ search(post['injection-host']) if ihn_result: logging.debug("emp_ihn hit: %s", ihn_result.group(0)) if self.emp_ihn.add(post['injection-host'] + ngs): return self.reject(art, post, "EMP IHN Reject") # Beginning of EMP Body filter. Do this last, it's most # expensive in terms of processing. if art[__BODY__] is not None: if self.emp_body.add(mem2str(art[__BODY__])): return self.reject(art, post, "EMP Body Reject") if local: # All tests passed. Log the locally posted message. logging.info("post: mid=%s, from=%s, groups=%s", mem2str(art[Message_ID]), mem2str(art[From]), mem2str(art[Newsgroups])) self.logart('Local Post', art, post, 'local_post') # The article passed all checks. Return an empty string. return "" def addressParse(self, addr): name, email = parseaddr(addr) return name.lower(), email.lower() def xreject(self, reason, art, post): for logrule in self.log_rules.keys(): if reason.startswith(logrule): self.logart(reason, art, post, self.log_rules[logrule]) break logging.info("reject: mid=%s, reason=%s" % (mem2str(art[Message_ID]), reason)) return reason def reject(self, art, post, reason, short_reason=None): rulehit = False for logrule in self.log_rules.keys(): if reason.startswith(logrule): self.logart(reason, art, post, self.log_rules[logrule]) rulehit = True break if rulehit: logging.info("reject: mid=%s, reason=%s" % ( mem2str(art[Message_ID]), reason)) else: msg = "reject: No matched logging rule: mid={}, reason={}" logging.warn(msg.format(mem2str(art[Message_ID]), reason)) if short_reason is None: # Sometimes we don't want to provide the source with a detailed # reason of why a message was rejected. They could then just # tweak their payload to circumvent the cause. return reason return short_reason def logart(self, reason, art, post, filename, trim=True): fullname = os.path.join(config.get('paths', 'logart'), filename) if sys.version_info[0] > 2: f = open(fullname, 'a', encoding='utf-8', errors='backslashreplace') else: f = open(fullname, 'a') f.write('From foo@bar Thu Jan 1 00:00:01 1970\n') f.write('Info: %s\n' % reason) for hdr in art.keys(): if hdr == '__BODY__' or hdr == '__LINES__' or art[hdr] is None: continue f.write('%s: %s\n' % (hdr, mem2str(art[hdr]).replace('\r', ''))) for hdr in post.keys(): f.write('%s: %s\n' % (hdr, post[hdr])) f.write('\n') if (not trim or int(art[__LINES__]) <= config.get('logging', 'logart_maxlines')): f.write(mem2str(art[__BODY__]).replace('\r', '')) else: maxlines = config.get('logging', 'logart_maxlines') loglines = 0 for line in mem2str(art[__BODY__]).split('\n', 1000)[:-1]: # Ignore quoted lines. if line.startswith(">"): continue line = line.replace('\r', '') f.write(line + "\n") loglines += 1 if loglines >= maxlines: f.write('[snip]') break f.write('\n\n') f.close def hourly_events(self, startup=False): """Carry out hourly events. Some of these events may be to check if it's time to do other, less frequent events. Timed events are also triggered on startup. The "startup" flag enables special handling of this instance. """ if startup: logging.info('Performing startup tasks') else: logging.info('Performing hourly tasks') self.emp_body.statlog() self.emp_fsl.statlog() self.emp_phl.statlog() self.emp_phn.statlog() self.emp_lphn.statlog() self.emp_ihn.statlog() # Reload logging directives. self.log_rules = self.file2dict('log_rules') logging.info('Reloaded %s logging directives', len(self.log_rules)) # Reload Injection-Host substrings. self.ihsubs = self.file2list('ih_substrings') logging.info('Reloaded %s Injection-Host substrings', len(self.ihsubs)) self.ngsubs = self.file2list('ng_emp_subst') logging.info('Reloaded %s Newsgroup substrings', len(self.ngsubs)) # Set up Regular Expressions. for fn in self.regex_files.keys(): new_regex = self.regex_file(fn) if new_regex: self.etc_re[fn] = new_regex if not startup: # Re-read the config file. configfile = os.path.join(config.get('paths', 'etc'), 'pyclean.cfg') logging.info("Reloading config file: %s" % configfile) if os.path.isfile(configfile): config.read(configfile) else: logging.info("%s: File not found. Using defaults settings." % configfile) # Reset the next timed trigger. self.hourly_trigger = future(hours=1) def midnight_events(self): """Events that need to occur at midnight each day. """ logging.info('Performing midnight tasks') self.binary.report() self.emp_body.reset() self.emp_fsl.reset() self.emp_phl.reset() self.emp_phn.reset() self.emp_lphn.reset() self.emp_ihn.reset() # Set the midnight trigger for next day. self.midnight_trigger = next_midnight() def regex_file(self, filename): """Read a given file and return a regular expression composed of individual regex's on each line that have not yet expired. """ logging.debug('Testing %s regex condition', filename) fqfn = os.path.join(config.get('paths', 'etc'), filename) if not os.path.isfile(fqfn): logging.info('%s: Regex file not found' % filename) if filename in self.etc_re: logging.info("%s: Regex file has been deleted", filename) # The file has been deleted so delete the regex. self.etc_re.pop(filename, None) # Reset the last_modified date to zero. self.regex_files[filename] = 0 return False current_mod_stamp = os.path.getmtime(fqfn) recorded_mod_stamp = self.regex_files[filename] if current_mod_stamp <= recorded_mod_stamp: logging.info('%s: File not modified so not recompiling', filename) return False # The file has been modified: recompile the regex. logging.info('%s: Recompiling Regular Expression.', filename) # Reset the file's modstamp. self.regex_files[filename] = current_mod_stamp # Make a local datetime object for now, just to save setting now in # the coming loop. bad_items = [] n = now() f = open(fqfn, 'r') for line in f: if len(line.strip()) == 0: # Ignore blank lines continue if line.lstrip().startswith('#'): # Ignore comments continue valid = self.regex_fmt.match(line) if not valid: logging.warn("{}: Invalid line: {}".format(filename, line)) continue try: expire = dateobj(valid.group(2)) except ValueError: # Invalid expiry timestamp logging.warn("{}: Invalid timestamp in: {}".format(filename, line)) continue # Is current time beyond that of the datestamp? If it is, the # entry is considered expired. if n > expire: logging.debug("{}: Expired entry: {}".format(filename, valid.group(1))) continue # If processing gets here, the entry is a valid, unexpired regex. bad_items.append(valid.group(1)) f.close() num_bad_items = len(bad_items) if num_bad_items == 0: # No valid entries exist in the file. logging.debug("{}: No valid entries found".format(filename)) return False regex_string = '|'.join(bad_items) # This should never happen but best to check as || will match everything. regex_string = regex_string.replace('||', '|') try: regex = re.compile(regex_string) except re.error as e: logging.warn("{}: Regular Expression compilation failed with: {}".format(filename, e)) return False logging.info("{}: Compiled {} rules".format(filename, num_bad_items)) return regex def file2list(self, filename): fqfn = os.path.join(config.get('paths', 'etc'), filename) if not os.path.isfile(fqfn): logging.info('%s: File not found' % filename) return [] f = open(fqfn, 'r') lines = f.readlines() f.close() valid = [] for line in lines: # Strip comments (including inline). content = line.split('#', 1)[0].strip() # Ignore empty lines. if len(content) > 0: valid.append(content) return valid def file2dict(self, filename, numeric=False): """Read a file and split each line at the first space encountered. The first element is the key, the rest is the content. If numeric is True, then only integer values will be acccepted. """ d = {} for line in self.file2list(filename): valid = self.regex_fields.match(line) if valid: k = valid.group(1) c = valid.group(2) if numeric: try: c = int(c) except ValueError: c = 0 d[k] = c return d def closetasks(self): """Things to do on filter closing. """ logging.info("Running shutdown tasks") # Write to file any entries in the stack self.emp_body.dump() self.emp_fsl.dump() self.emp_phl.dump() self.emp_phn.dump() self.emp_lphn.dump() self.emp_ihn.dump() class Groups: def __init__(self): self.regex = Regex() # List of tests (that will become zeroed dict items). self.grps = [ 'bin_allowed', 'emp_exclude', 'html_allowed', 'ihn_exclude', 'kooks', 'lowcp', 'moderated', 'phn_exclude', 'sex_groups', 'test' ] def __getitem__(self, grptest): return self.grp[grptest] def __contains__(self, item): if item in self.grp: return True return False def analyze(self, newsgroups, followupto): newsgroups = mem2str(newsgroups) if followupto: followupto = mem2str(followupto) # Zero all dict items we'll use in this post. grp = dict((f, 0) for f in self.grps) # This will become a list of newsgroups. nglist = [] for ng in newsgroups.lower().split(','): # Strip whitespace from individual Newsgroups. ng = ng.strip() # Populate a list of newsgroups after stripping spaces. nglist.append(ng) if self.regex.test.search(ng): grp['test'] += 1 if self.regex.bin_allowed.search(ng): grp['bin_allowed'] += 1 if self.regex.emp_exclude.search(ng): grp['emp_exclude'] += 1 if self.regex.ihn_exclude.search(ng): grp['ihn_exclude'] += 1 if self.regex.html_allowed.search(ng): grp['html_allowed'] += 1 if self.regex.kooks.search(ng): grp['kooks'] += 1 if self.regex.lowcp.search(ng): grp['lowcp'] += 1 if self.regex.phn_exclude.search(ng): grp['phn_exclude'] += 1 if self.regex.sex_groups.search(ng): grp['sex_groups'] += 1 if INN.newsgroup(ng) == 'm': grp['moderated'] += 1 # Not all bools will be meaningful but it's easier to create them # generically then specifically. count = len(nglist) for ngelement in list(grp.keys()): ngbool = '%s_bool' % ngelement grp[ngbool] = grp[ngelement] == count grp['groups'] = sorted(nglist) grp['count'] = count # Create a list of Followup-To groups and count them. grp['futcount'] = 0 if followupto is not None: futlist = followupto.lower().split(',') grp['futcount'] = len(futlist) self.grp = grp class Regex: def __init__(self): # Test groups. test = ['\.test(ing)?(?:$|\.)', '^es\.pruebas', '^borland\.public\.test2', '^cern\.testnews'] self.test = self.regex_compile(test) # Binary groups. bin_allowed = ['^bin[a.]', '\.bin[aei.]', '\.bin$', '^fur\.artwork', '^alt\.anonymous\.messages$', '^de\.alt\.dateien', '^rec\.games\.bolo$', '^comp\.security\.pgp\.test$', '^sfnet\.tiedostot', '^fido\.', '^unidata\.', '^alt\.security\.keydist', '^mailing\.', '^linux\.', '^lucky\.freebsd', '^gnus\.', '\.lists\.freebsd\.'] self.bin_allowed = self.regex_compile(bin_allowed) html_allowed = ['^pgsql\.', '^relcom\.', '^gmane\.', '^microsoft\.', '^mailing\.', '^gnus\.'] self.html_allowed = self.regex_compile(html_allowed) # Exclude from all EMP filters. emp_exclude = ['^alt\.anonymous\.messages', '^free\.', '^local\.', '^relcom\.', '^mailing\.', '^fa\.', '\.cvs\.', '^gnu\.', 'lists\.freebsd\.ports\.bugs'] self.emp_exclude = self.regex_compile(emp_exclude) # Exclude groups from IHN filter. ihn_exclude = ['^alt\.anonymous', '^alt\.privacy', '^alt\.prophecies\.nostradamus'] self.ihn_exclude = self.regex_compile(ihn_exclude) # Exclude groups from PHN filter. phn_exclude = ['^alt\.privacy\.', '^news\.lists\.filters'] self.phn_exclude = self.regex_compile(phn_exclude) # Bad posting-hosts. bad_ph = ['newsguy\.com', 'tornevall\.net'] self.bad_ph = self.regex_compile(bad_ph) # Sex groups. sex_groups = ['^alt\.sex'] self.sex_groups = self.regex_compile(sex_groups) # Kook groups. kooks = ['^alt\.usenet\.kooks', '^alt\.checkmate', '^alt\.fan\.cyberchicken', '^alt\.fan\.karl-malden\.nose'] self.kooks = self.regex_compile(kooks) # Low cross post groups. lowcp = ['^alt\.free\.newsservers'] self.lowcp = self.regex_compile(lowcp) def regex_compile(self, regexlist): textual = '|'.join(regexlist).replace('||', '|') return re.compile(textual) class EMP: def __init__(self, threshold=3, ceiling=100, maxentries=5000, timedtrim=3600, dofuzzy=False, name=False): # Statistics relating to this EMP instance. if threshold > ceiling: raise ValueError('Threshold cannot exceed ceiling') # The hash table itself. Keyed by MD5 hash and containing a hit # count. self.table = {} # Attempt to restore a previous EMP dump. self.restore(name) self.fuzzy_15char = re.compile('\S{15,}') self.fuzzy_notletters = re.compile('[^a-zA-Z]') # Initialize some defaults. self.stats = {'name': name, 'nexttrim': future(secs=timedtrim), 'lasttrim': now(), 'processed': 0, 'accepted': 0, 'rejected': 0, 'threshold': threshold, 'ceiling': ceiling, 'maxentries': maxentries, 'timedtrim': timedtrim, 'dofuzzy': dofuzzy} logmes = '%(name)s initialized. ' logmes += 'threshold=%(threshold)s, ' logmes += 'ceiling=%(ceiling)s, ' logmes += 'maxentries=%(maxentries)s, ' logmes += 'timedtrim=%(timedtrim)s' logging.info(logmes % self.stats) def add(self, content): """The content, in this context, is any string we want to hash and check for EMP collisions. In various places we refer to it as 'hash fodder'. """ self.stats['processed'] += 1 if self.stats['dofuzzy']: # Strip long strings. content = re.sub(self.fuzzy_15char, '', content) # Remove everything except a-zA-Z. content = re.sub(self.fuzzy_notletters, '', content).lower() # Bail out if the byte length of the content isn't sufficient for # generating an effective, unique hash. if len(content) < 1: logging.debug("Null content in %s hashing fodder.", self.stats['name']) return False # See if it's time to perform a trim. n = now() if n > self.stats['nexttrim']: secs_since_lasttrim = (n - self.stats['lasttrim']).seconds decrement = int(secs_since_lasttrim / self.stats['timedtrim']) logmes = "%s: Trim decrement factor=%s" logging.debug("{}: Trim decrement factor={}".format(self.stats['name'], decrement)) if decrement > 0: self._trim(decrement) else: logmes = "%s: Invalid attempt to trim by less than 1" logging.error(logmes % (self.stats['name'], decrement)) elif len(self.table) > self.stats['maxentries']: logmes = '%(name)s: Exceeded maxentries of %(maxentries)s' logging.warn(logmes % self.stats) self._trim(1) # MD5 is weak in cryptographic terms, but do I care for the purpose # of EMP collision checking? Obviously not or I'd use something else. h = md5(encodedStr(content)).digest() if h in self.table: # When the ceiling is reached, stop incrementing the count. if self.table[h] < self.stats['ceiling']: self.table[h] += 1 else: logging.debug("%s hash ceiling hit. Not incrementing counter.", self.stats['name']) else: # Initialize the md5 entry. self.table[h] = 1 if self.table[h] > self.stats['threshold']: # Houston, we have an EMP reject. self.stats['rejected'] += 1 return True self.stats['accepted'] += 1 return False def _trim(self, decrement): """Decrement the counter against each hash. If the counter reaches zero, delete the hash entry. """ # As the EMP table is about to be modified, oldsize records it prior # to doing any changes. This is only used for reporting purposes. self.stats['oldsize'] = len(self.table) # Keep a running check of the largest count against a key. self.stats['high'] = 0 for h in list(self.table.keys()): self.table[h] -= decrement if self.table[h] > self.stats['high']: self.stats['high'] = self.table[h] if self.table[h] <= 0: del self.table[h] self.stats['size'] = len(self.table) self.stats['decrement'] = decrement logging.info("%(name)s: Trim complete. was=%(oldsize)s, now=%(size)s, " "high=%(high)s, decrement=%(decrement)s", self.stats) self.stats['nexttrim'] = \ future(secs=self.stats['timedtrim']) self.stats['lasttrim'] = now() def statlog(self): """Log details of the EMP hash.""" self.stats['size'] = len(self.table) logging.info("%(name)s: size=%(size)s, processed=%(processed)s, " "accepted=%(accepted)s, rejected=%(rejected)s", self.stats) def dump(self): """Dump the EMP table to disk so we can reload it after a restart. """ dumpfile = os.path.join(config.get('paths', 'lib'), self.stats['name'] + ".db") dump = shelve.open(dumpfile, flag='n') for k in self.table: dump[decodedStr(k)] = self.table[k] dump.close() def restore(self, name): """Restore an EMP dump from disk. """ dumpfile = os.path.join(config.get('paths', 'lib'), name + ".db") if os.path.isfile(dumpfile): logging.info("Attempting restore of %s dump", name) dump = shelve.open(dumpfile, flag='r') # We seem unable to use copy functions between shelves and dicts # so we do it per record. Speed is not essential at these times. for k in dump: self.table[k] = dump[k] dump.close() logging.info("Restored %s records to %s", len(self.table), name) else: logging.debug("%s: Dump file does not exist. Doing a clean " "initialization.", dumpfile) def reset(self): """Reset counters for this emp filter. """ self.stats['processed'] = 0 self.stats['accepted'] = 0 self.stats['rejected'] = 0 """ Okay, that's the end of our class definition. What follows is the stuff you need to do to get it all working inside innd. """ if 'python_filter' not in dir(): python_version = sys.version_info config = init_config() logfmt = config.get('logging', 'format') datefmt = config.get('logging', 'datefmt') loglevels = {'debug': logging.DEBUG, 'info': logging.INFO, 'warn': logging.WARN, 'error': logging.ERROR} logging.getLogger().setLevel(logging.DEBUG) logfile = logging.handlers.TimedRotatingFileHandler( os.path.join(config.get('paths', 'log'), 'pyclean.log'), when='midnight', interval=1, backupCount=config.getint('logging', 'retain')) logfile.setLevel(loglevels[config.get('logging', 'level')]) logfile.setFormatter(logging.Formatter(logfmt, datefmt=datefmt)) logging.getLogger().addHandler(logfile) python_filter = InndFilter() try: INN.set_filter_hook(python_filter) INN.syslog('n', "pyclean successfully hooked into INN") except Exception: # Syntax valid in both Python 2.x and 3.x. e = sys.exc_info()[1] INN.syslog('e', "Cannot obtain INN hook for pyclean: %s" % e.args[0]) # This looks weird, but creating and interning these strings should let us get # faster access to header keys (which innd also interns) by losing some strcmps # under the covers. Also_Control = intern("Also-Control") Approved = intern("Approved") Archive = intern("Archive") Archived_At = intern("Archived-At") Bytes = intern("Bytes") Cancel_Key = intern("Cancel-Key") Cancel_Lock = intern("Cancel-Lock") Comments = intern("Comments") Content_Base = intern("Content-Base") Content_Disposition = intern("Content-Disposition") Content_Transfer_Encoding = intern("Content-Transfer-Encoding") Content_Type = intern("Content-Type") Control = intern("Control") Date = intern("Date") Date_Received = intern("Date-Received") Distribution = intern("Distribution") Expires = intern("Expires") Face = intern("Face") Followup_To = intern("Followup-To") From = intern("From") In_Reply_To = intern("In-Reply-To") Injection_Date = intern("Injection-Date") Injection_Info = intern("Injection-Info") Jabber_ID = intern("Jabber-ID") Keywords = intern("Keywords") Lines = intern("Lines") List_ID = intern("List-ID") Message_ID = intern("Message-ID") MIME_Version = intern("MIME-Version") Newsgroups = intern("Newsgroups") NNTP_Posting_Date = intern("NNTP-Posting-Date") NNTP_Posting_Host = intern("NNTP-Posting-Host") NNTP_Posting_Path = intern("NNTP-Posting-Path") Organization = intern("Organization") Original_Sender = intern("Original-Sender") Originator = intern("Originator") Path = intern("Path") Posted = intern("Posted") Posting_Version = intern("Posting-Version") Received = intern("Received") References = intern("References") Relay_Version = intern("Relay-Version") Reply_To = intern("Reply-To") Sender = intern("Sender") Subject = intern("Subject") Summary = intern("Summary") Supersedes = intern("Supersedes") User_Agent = intern("User-Agent") X_Auth = intern("X-Auth") X_Auth_Sender = intern("X-Auth-Sender") X_Canceled_By = intern("X-Canceled-By") X_Cancelled_By = intern("X-Cancelled-By") X_Complaints_To = intern("X-Complaints-To") X_Face = intern("X-Face") X_HTTP_UserAgent = intern("X-HTTP-UserAgent") X_HTTP_Via = intern("X-HTTP-Via") X_Mailer = intern("X-Mailer") X_Modbot = intern("X-Modbot") X_Modtrace = intern("X-Modtrace") X_Newsposter = intern("X-Newsposter") X_Newsreader = intern("X-Newsreader") X_No_Archive = intern("X-No-Archive") X_Original_Message_ID = intern("X-Original-Message-ID") X_Original_NNTP_Posting_Host = intern("X-Original-NNTP-Posting-Host") X_Original_Trace = intern("X-Original-Trace") X_Originating_IP = intern("X-Originating-IP") X_PGP_Key = intern("X-PGP-Key") X_PGP_Sig = intern("X-PGP-Sig") X_Poster_Trace = intern("X-Poster-Trace") X_Postfilter = intern("X-Postfilter") X_Proxy_User = intern("X-Proxy-User") X_Submissions_To = intern("X-Submissions-To") X_Trace = intern("X-Trace") X_Usenet_Provider = intern("X-Usenet-Provider") X_User_ID = intern("X-User-ID") Xref = intern("Xref") __BODY__ = intern("__BODY__") __LINES__ = intern("__LINES__")