#Licensed to the Apache Software Foundation (ASF) under one
#or more contributor license agreements.  See the NOTICE file
#distributed with this work for additional information
#regarding copyright ownership.  The ASF licenses this file
#to you under the Apache License, Version 2.0 (the
#"License"); you may not use this file except in compliance
#with the License.  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing,
#software distributed under the License is distributed on an
#"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#KIND, either express or implied.  See the License for the
#specific language governing permissions and limitations
#under the License.

import re
import string

#Flags shared by every search that has to look across line breaks.
_MULTILINE_FLAGS = re.IGNORECASE | re.DOTALL

#A double-quoted statement; non-greedy so each quotation is matched separately.
_QUOTE_RE = re.compile(r'(?P<quote>".*?")', _MULTILINE_FLAGS)

#automated_notation could be any labels or sections in the email giving special
#notation for human readers of the email text. For example, email_text may start
#with "A message from your customer:". Each pattern must expose the real message
#in a group named "email_message".
_AUTOMATED_NOTATIONS = [
    r"Hi, there has been a new enquiry from\..*?Enquiry:(?P<email_message>.*)",
]

#Notes on regex
#Search for classic prefix from GMail and other mail clients "On May 16, 2011, Dave wrote:"
#Search for prefix from outlook clients From: Some Person [some.person@domain.tld]
#Search for prefix from outlook clients when used for sending to recipients in
#the same domain From: Some Person\nSent: 16/05/2011 22:42\nTo: Some Other Person
#Search for prefix when message has been forwarded
#Search for From:\nTo:\nDate:
#Search for From:\nTo:\nSent:
#Search for From:\nTo:\nSubject:
#Some clients use -*Original Message-*
#NOTE(review): the first alternative ("On ... wrote:") was reconstructed from a
#damaged copy of this file - confirm the character class against upstream.
_REPLY_TEXT_RE = re.compile(
    r'(?P<reply_text>'
    r'On ([a-zA-Z0-9, :/<>@\."\[\]]* wrote:.*)|'
    r'From: [\w@ \.]* \[mailto:[\w\.]*@[\w\.]*\].*|'
    r'From: [\w@ \.]*(\n|\r\n)+Sent: [\*\w@ \.,:/]*(\n|\r\n)+To:.*(\n|\r\n)+.*|'
    r'[- ]*Forwarded by [\w@ \.,:/]*.*|'
    r'From: [\w@ \.<>\-]*(\n|\r\n)To: [\w@ \.<>\-]*(\n|\r\n)Date: [\w@ \.<>\-:,]*\n.*|'
    r'From: [\w@ \.<>\-]*(\n|\r\n)To: [\w@ \.<>\-]*(\n|\r\n)Sent: [\*\w@ \.,:/]*(\n|\r\n).*|'
    r'From: [\w@ \.<>\-]*(\n|\r\n)To: [\w@ \.<>\-]*(\n|\r\n)Subject:.*|'
    r'(-| )*Original Message(-| )*.*)',
    _MULTILINE_FLAGS)

#note - these opening statements *must* be in lower case for
#sig within sig searching to work in get_signature
_SIG_OPENING_STATEMENTS = [
    "warm regards",
    "kind regards",
    "regards",
    "cheers",
    "many thanks",
    "thanks",
    "sincerely",
    "ciao",
    "best",
    "bgif",
    "thank you",
    "thankyou",
    "talk soon",
    "cordially",
    "yours truly",
    "thanking you",
    "sent from my iphone",
]

_SIGNATURE_RE = re.compile(
    "(?P<signature>(" + "|".join(_SIG_OPENING_STATEMENTS) + ")(.)*)",
    _MULTILINE_FLAGS)

_SALUTATION_OPENING_STATEMENTS = [
    "hi",
    "dear",
    "to",
    "hey",
    "hello",
    "thanks",
    "good morning",
    "good afternoon",
    "good evening",
    "thankyou",
    "thank you",
]

#Notes on regex:
#Max of 5 words succeeding first Hi/To etc, otherwise is probably an entire sentence
_SALUTATION_RE = re.compile(
    r"\s*(?P<salutation>(" + "|".join(_SALUTATION_OPENING_STATEMENTS) + r")+"
    r"(\s*\w*)(\s*\w*)(\s*\w*)(\s*\w*)(\s*\w*)[\.,\xe2:]+\s*)",
    re.IGNORECASE)


def parse(email_text, remove_quoted_statements=True):
    """Split an email message into its salutation, body, signature and reply text.

    email_text is stripped of surrounding whitespace and of any automated
    notation first. When remove_quoted_statements is true, every double-quoted
    statement is replaced with the placeholder "[quote]" (still in quotes)
    before the parts are extracted.

    Returns a dict with the keys "salutation", "body", "signature" and
    "reply_text"; a part that could not be found is None (the body is always
    a string).
    """
    email_text = email_text.strip()
    email_text = strip_automated_notation(email_text)
    if remove_quoted_statements:
        for quoted in _QUOTE_RE.findall(email_text):
            email_text = email_text.replace(quoted, '"[quote]"')
    return {
        "salutation": get_salutation(email_text),
        "body": get_body(email_text),
        "signature": get_signature(email_text),
        "reply_text": get_reply_text(email_text),
    }


def strip_automated_notation(email_text):
    """Return email_text without any known automated wrapper around the message.

    Each known notation is matched against the start of email_text; when one
    matches, only its "email_message" group is kept. Text that matches no
    notation is returned unchanged.
    """
    #Use a parameter named email_text to indicate text in the actual email message
    for notation in _AUTOMATED_NOTATIONS:
        match = re.match(notation, email_text, _MULTILINE_FLAGS)
        if match is not None and "email_message" in match.groupdict():
            email_text = match.group("email_message")
    return email_text


def get_reply_text(email_text):
    """Return the quoted/forwarded earlier message in email_text, or None.

    The result starts at the first recognised reply or forward header and runs
    to the end of email_text.
    """
    match = _REPLY_TEXT_RE.search(email_text)
    if match is None:
        return None
    return match.group("reply_text")


def get_signature(email_text):
    """Return the signature found in email_text, or None.

    A signature starts at the first recognised sign-off phrase. Reply text is
    removed from it, and a sign-off nested inside the first hit is preferred.
    None is also returned when the signature would be the entire body.
    """
    #try not to have the signature be the very start of the message if we can avoid it
    salutation = get_salutation(email_text)
    if salutation:
        email_text = email_text[len(salutation):]

    signature = None
    match = _SIGNATURE_RE.search(email_text)
    if match:
        signature = match.group("signature")
        reply_text = get_reply_text(email_text[email_text.find(signature):])
        if reply_text:
            signature = signature.replace(reply_text, "")

        #search for a sig within current sig to lessen chance of accidentally
        #stealing words from body
        tmp_sig = signature
        for opening in _SIG_OPENING_STATEMENTS:
            if tmp_sig.lower().startswith(opening):
                tmp_sig = tmp_sig[len(opening):]
        inner = _SIGNATURE_RE.search(tmp_sig)
        if inner:
            signature = inner.group("signature")

    #todo: if no standard formatting has been provided (e.g. Regards, <name>),
    #try a probabilistic approach by looking for phone numbers, names etc. to derive sig

    #check to see if the entire body of the message has been 'stolen' by the
    #signature. If so, return no sig so body can have it.
    body_without_sig = get_body(email_text, check_signature=False)
    if signature == body_without_sig:
        signature = None

    return signature


#todo: complete this out (I bit off a bit more than I could chew with this
#function. Will probably take a bunch of bayesian stuff)
def is_word_likely_in_signature(word, text_before="", text_after=""):
    """Return True when word is capitalised: ASCII uppercase then ASCII lowercase.

    text_before and text_after are accepted but not used yet.
    """
    #Does it look like a phone number? (not implemented)
    #is it capitalized? Need two real characters: an empty slice is "in" any
    #string, which would wrongly accept "" and one-letter words.
    if len(word) < 2:
        return False
    return word[0] in string.ascii_uppercase and word[1] in string.ascii_lowercase


#check_ args provided so that other functions can call get_body without causing infinite recursion
def get_body(email_text, check_salutation=True, check_signature=True, check_reply_text=True):
    """Return email_text with the salutation, signature and reply text removed.

    Each check_ flag turns the corresponding removal on or off.
    """
    if check_salutation:
        sal = get_salutation(email_text)
        if sal:
            email_text = email_text[len(sal):]

    if check_signature:
        sig = get_signature(email_text)
        if sig:
            sig_start = email_text.find(sig)
            #find() gives -1 when sig is not a literal substring; slicing with
            #-1 would silently drop the last character instead
            if sig_start != -1:
                email_text = email_text[:sig_start]

    if check_reply_text:
        reply_text = get_reply_text(email_text)
        if reply_text:
            email_text = email_text[:email_text.find(reply_text)]

    return email_text


def get_salutation(email_text):
    """Return the greeting at the start of email_text, or None.

    The result includes the trailing punctuation and whitespace but not any
    whitespace that precedes the greeting.
    """
    #remove reply text first (e.g. Thanks\nFrom: email@domain.tld causes
    #salutation to consume start of reply_text)
    reply_text = get_reply_text(email_text)
    if reply_text:
        email_text = email_text[:email_text.find(reply_text)]

    match = _SALUTATION_RE.match(email_text)
    if match is None:
        return None
    return match.group("salutation")