#! /usr/bin/env python
# Take an mbox HTML message (e.g. from mutt), split it
# and rewrite it so it can be viewed in an external browser.
# Can be run from within a mailer like mutt, or independently
# on a single message file.
#
# Usage: viewhtmlmail.py email_message_file
#
# Inspired by John Eikenberry's view_html_mail.sh
# which sadly no longer works, at least with mail from current Apple Mail.
#
# Copyright 2013-2022 by Akkana Peck. Share and enjoy under the GPL v2 or later.
# Contributions:
# Holger Klawitter 2014: create a secure temp file and avoid temp mbox
# Antonio Terceiro 2018: Allow piping directly from mutt.
# To use it from mutt, install it somewhere in your path,
# then put the following lines in your .muttrc:
# (<F10> is only an example; bind whatever key you like)
# macro index <F10> "<pipe-message>~/bin/viewhtmlmail.py\n" "View HTML email in browser"
# macro pager <F10> "<pipe-message>~/bin/viewhtmlmail.py\n" "View HTML email in browser"
# TESTING: Use the email file in test/files/htmlmail.eml.
import os, sys
import re
import time
import shutil
import email, mimetypes
from email.parser import BytesParser
from email.policy import default as default_policy
import subprocess
from collections import OrderedDict # for python < 3.7
################################################
# Some prefs:

# Set to True for lots of debugging output.
DEBUG = False

# Program used to show image attachments that weren't embedded in the
# HTML. When this is set, those images go to the image viewer instead of
# a browser window. To use a browser, set IMAGE_VIEWER = None.
IMAGE_VIEWER = "pho"
# IMAGE_VIEWER = None

# Arguments inserted between IMAGE_VIEWER and the image file names.
IMAGE_VIEWER_ARGS = ["-P"]    # For pho, don't use presentation mode

# If True, Word attachments are converted with wvHtml rather than unoconv.
USE_WVHTML_FOR_DOC = False

# How many seconds do we need to wait for unoconv? (Passed as its -T
# argument, hence a string.)
# It defaults to 6, but on a 64-bit machine that's not nearly enough.
# Even 10 often isn't enough.
UNOCONV_STARTUP_TIME = "14"

# Supported browsers, in order of preference. For each one:
#   ARGS_FIRST           arguments used while first_browser is still True
#   ARGS                 arguments used after that
#   BACKGROUND           NOTE(review): not read by run_browser, which
#                        always launches in the background -- confirm
#   CONVERT_PDF_TO_HTML  run pdftohtml on PDFs before showing them
BROWSERS = OrderedDict()

BROWSERS['qutebrowser'] = dict(
    ARGS_FIRST=[
        "--target", "private-window",
        "--basedir", "/tmp/mailattachments",
        "-s", "content.dns_prefetch", "false",
        "-s", "content.javascript.enabled", "false",
    ],
    # If using PDFJS, add: "-s", "content.javascript.enabled", "false"
    ARGS=[
        "--target", "tab-bg",
        "--basedir", "/tmp/mailattachments",
        # Don't need to specify privacy, prefetch or JS
        # because it's being opened in a window that
        # already has those settings, using the same configdir.
    ],
    BACKGROUND=True,
    # qutebrowser can display PDF natively if you use pdf.js.
    # On debian, apt install libjs-pdf.
    # But that also gives hundreds of lines of errors like
    # ERROR: NotFoundError while handling qute://* URL: Can't find pdfjs resource 'web/images/toolbarButton-viewAttachments.svg'
    # Also, it creates two tabs for each PDF file, reproducible with:
    # qutebrowser --target private-window --basedir /tmp/mailattachments -s content.dns_prefetch false -s content.pdfjs true somefile.html
    # qutebrowser --target tab-bg --basedir /tmp/mailattachments -s content.pdfjs true somefile.pdf
    # so for now, disable it and convert to html instead:
    CONVERT_PDF_TO_HTML=True,
)

BROWSERS['quickbrowse'] = dict(
    ARGS_FIRST=[],
    ARGS=["--new-tab"],
    BACKGROUND=False,
    CONVERT_PDF_TO_HTML=True,
)

BROWSERS['firefox'] = dict(
    ARGS_FIRST=["--new-tab", "--private-window"],
    ARGS=["--private-window"],
    BACKGROUND=True,
    CONVERT_PDF_TO_HTML=False,
)

# Name of the browser that has been launched successfully, once known.
WORKING_BROWSER = None

# True until some browser has been launched successfully.
first_browser = True
def run_browser(browser, htmlfile):
    """Launch one particular browser on a local file, in the background.

    browser must be a key of BROWSERS. The command line is the browser
    name, then that browser's 'ARGS_FIRST' list while the global
    first_browser is true (its 'ARGS' list otherwise), then a file://
    URL built from htmlfile.

    Nothing is caught here: any error from the lookup or the launch
    is passed on to the caller.
    """
    which_args = 'ARGS_FIRST' if first_browser else 'ARGS'
    cmd = [browser, *BROWSERS[browser][which_args], "file://" + htmlfile]

    if DEBUG:
        print("Calling in background: %s" % ' '.join(cmd))

    mysubprocess.call_bg(cmd)
def call_some_browser(htmlfile):
    """Show htmlfile in whichever known browser can be started.

    If a browser has already worked (WORKING_BROWSER is set), it is
    reused. Otherwise each entry of BROWSERS is tried in order; the
    first one that starts without raising is remembered in
    WORKING_BROWSER and first_browser is cleared.
    If none can be started, the collected errors are printed
    and the program exits with status 1.
    """
    global WORKING_BROWSER
    global first_browser

    if DEBUG:
        print("Calling browser for file://%s" % htmlfile)

    if WORKING_BROWSER:
        run_browser(WORKING_BROWSER, htmlfile)
        return

    failures = []
    for candidate in BROWSERS:
        try:
            run_browser(candidate, htmlfile)
        except Exception as e:
            # Remember why this one failed, then move on to the next.
            failure = "\n**** Couldn't run %s! %s" % (candidate, e)
            failures.append(failure)
            if DEBUG:
                print(failure)
                # traceback.print_exc()
        else:
            # It worked: no need to try the rest.
            WORKING_BROWSER = candidate
            first_browser = False
            break

    if not WORKING_BROWSER:
        print("Couldn't use any known browser: bailing.")
        print("Errors:", ''.join(failures))
        print("Run with -d (debug) to see more details.")
        sys.exit(1)
# Seconds to wait between refreshes when waiting for translated content.
# NOTE(review): nothing in the visible part of this file reads this
# value -- confirm whether it is still needed.
REDIRECT_TIMEOUT = 2
# End global prefs
################################################
def find_first_maildir_file(maildir):
    """Return the full path of the first non-dot file found under maildir.

    mutt puts the message a level or more below the maildir it creates
    (e.g. /tmp/mutttmpbox/cur/1), so the whole tree is walked with
    os.walk(), in whatever order it yields entries.
    Returns None if there is no file whose name doesn't start with '.'.
    """
    candidates = (os.path.join(dirpath, name)
                  for dirpath, _subdirs, names in os.walk(maildir)
                  for name in names
                  if not name.startswith('.'))
    return next(candidates, None)
# Longest filename sanitize_filename() will return.
MAX_FILENAME_LENGTH = 225


def sanitize_filename(badstr):
    """Turn an untrusted filename into a safe, single path component.

    Only letters, digits and the characters - _ . are kept, so no path
    separator survives. Leading dots are then removed, so the result can
    never be "." or ".." (or a hidden file). If nothing usable is left,
    "unnamed" is returned rather than an empty string, because an empty
    name joined to a directory is the directory itself.

    Results longer than MAX_FILENAME_LENGTH are shortened by keeping
    the beginning and the end, which preserves the extension.
    """
    filename = ''.join(x for x in badstr
                       if x.isalpha() or x.isdigit() or x in '-_.')
    filename = filename.lstrip('.')

    if len(filename) > MAX_FILENAME_LENGTH:
        half = MAX_FILENAME_LENGTH // 2
        filename = filename[:half] + filename[-half:]

    return filename or "unnamed"
def _print_part(part):
    """Debugging aid: print the identifying headers of one message part."""
    print("*** part:")
    print("  content-type:", part.get_content_type())
    print("  content-disposition:", part.get_content_disposition())
    print("  content-id:", part.get('Content-ID'))
    print("  filename:", part.get_filename())
    print("  is_multipart?", part.is_multipart())


def _print_structure(msg, indent=0):
    """Debugging aid: recursively print the structure of an EmailMessage."""
    indentstr = ' ' * indent
    for part in msg.iter_parts():
        print("%scontent-type:" % indentstr, part.get_content_type())
        print("%scontent-subtype:" % indentstr, part.get_content_subtype())
        print("%scontent-id:" % indentstr, part.get('Content-ID'))
        print("%scontent-disposition:" % indentstr,
              part.get_content_disposition())
        print("%sfilename:" % indentstr, part.get_filename())
        print("%sis_multipart?" % indentstr, part.is_multipart())
        _print_structure(part, indent=indent+2)
        print()


def _load_message(f):
    """Parse the message in file or maildir f, or on stdin if f is empty.

    Raises FileNotFoundError if f is a directory with no message file.
    """
    # Note: the obvious way to read a message is
    #   with open(f) as fp: msg = email.message_from_file(fp)
    # but that gives an email.message.Message rather than the
    # email.message.EmailMessage the rest of this code relies on
    # (iter_parts(), get_content_disposition() ...).
    # BytesParser only produces an EmailMessage if it's given a policy.
    parser = BytesParser(policy=default_policy)

    if not f:
        return parser.parsebytes(sys.stdin.buffer.read())

    if os.path.isdir(f):
        # Maildir: f is a maildir like /tmp/mutttmpbox, and inside it
        # mutt creates another level of directory named either cur or new
        # depending on whether the message is already marked read.
        mailfile = find_first_maildir_file(f)
        if not mailfile:
            raise FileNotFoundError("No message file found in maildir %s" % f)
    else:
        # Mbox format: we assume there's only one message in the mbox.
        mailfile = f

    with open(mailfile, 'rb') as fp:
        return parser.parse(fp)


def _save_parts(msg, tmpdir):
    """Save every non-container part of msg as a file inside tmpdir.

    Returns (subfiles, html_parts):
    subfiles maps a content-id (or, for parts without one, the saved
    file's basename) to [full path, part];
    html_parts is the list of parts whose content subtype is html.
    """
    counter = 1
    filenames = set()     # basenames already used in tmpdir
    subfiles = {}
    html_parts = []

    for part in msg.walk():
        if DEBUG:
            print()
            _print_part(part)

        # multipart/* and message/rfc822 are just containers
        if part.is_multipart() or part.get_content_type() == 'message/rfc822':
            continue

        # Get the content id. Header lookup in the email module is
        # case-insensitive, so this also finds Content-Id etc.
        content_id = part.get('Content-ID')
        if content_id is not None:
            # Copy to a plain string and remove angle brackets, if present.
            content_id = str(content_id)
            if content_id.startswith('<') and content_id.endswith('>'):
                content_id = content_id[1:-1]
            counter += 1

        if part.get_content_subtype() == 'html':
            if DEBUG:
                print("Found an html part")
            if html_parts:
                print("Eek, more than one html part!")
            html_parts.append(part)
        elif not content_id and DEBUG:
            print("No Content-Id")

        # Use the filename provided if possible, otherwise make one up.
        filename = part.get_filename()
        if filename:
            # Dropping leading dots guarantees the name can't be "." or ".."
            filename = sanitize_filename(filename).lstrip('.')
        if not filename:
            # No name was given, or nothing usable was left of it.
            # Fall back to a generic bag-of-bits extension if need be.
            ext = mimetypes.guess_extension(part.get_content_type()) or '.bin'
            if content_id:
                filename = sanitize_filename('cid%s%s' % (content_id, ext))
            else:
                # counter only advances for parts with a Content-ID,
                # so these names can repeat; they're uniquified below.
                filename = 'part-%03d%s' % (counter, ext)

        # Some mailers, like gmail, will attach multiple images to
        # the same email all with the same filename, like "image.png".
        # So check whether we have to uniquify the names.
        if filename in filenames:
            orig_basename, orig_ext = os.path.splitext(filename)
            dedup_counter = 0
            while filename in filenames:
                dedup_counter += 1
                filename = "%s-%d%s" % (orig_basename, dedup_counter,
                                        orig_ext)
        filenames.add(filename)

        # If there's no content_id, use the uniquified filename, sans path.
        if not content_id:
            content_id = filename

        # Now save content to the file, and remember it in subfiles.
        filepath = os.path.join(tmpdir, filename)
        subfiles[content_id] = [filepath, part]
        with open(filepath, 'wb') as fp:
            fp.write(part.get_payload(decode=True))

        if DEBUG:
            print("wrote", filepath)
            print("\nsubfiles now:", subfiles)
            print()

    return subfiles, html_parts


def _write_html_parts(html_parts, subfiles, tmpdir):
    """Write each HTML part to tmpdir with cid: references pointing at
    the saved files, and open it in a browser.

    Returns the list of subfiles keys that were referenced by some HTML part.
    """
    embedded_parts = []

    for i, html_part in enumerate(html_parts):
        htmlfile = os.path.join(tmpdir, "viewhtml%02d.html" % i)

        # get_payload(decode=True) returns bytes with the transfer
        # encoding (e.g. base64) undone but no charset decoding applied,
        # so all the substitution below is done on bytes.
        htmlsrc = html_part.get_payload(decode=True)

        # Substitute all the filenames for content_ids.
        # Yes, yes, I know:
        # https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/
        # and this should be changed to use BeautifulSoup.
        for sf_cid, (partfile, _part) in subfiles.items():
            if DEBUG:
                print("Replacing cid", sf_cid, "with", partfile)
            # Content-ids routinely contain regex metacharacters
            # (. + $ ...), so they must be escaped; and the replacement
            # is passed as a function so it is inserted literally.
            fileurl = b'file://' + partfile.encode()
            newhtmlsrc = re.sub(b'cid: ?' + re.escape(sf_cid.encode()),
                                lambda m, fileurl=fileurl: fileurl,
                                htmlsrc, flags=re.IGNORECASE)
            if sf_cid not in embedded_parts and newhtmlsrc != htmlsrc:
                embedded_parts.append(sf_cid)
            htmlsrc = newhtmlsrc

        with open(htmlfile, 'wb') as fp:
            fp.write(htmlsrc)
        if DEBUG:
            print("Wrote", htmlfile)

        # Now we have the file. Call a browser on it.
        call_some_browser(htmlfile)

    return embedded_parts


def _view_attachments(subfiles, embedded_parts):
    """Show the saved parts that weren't embedded in an HTML part:
    office documents and PDFs (converted where needed) and images.
    """
    image_files = []

    for sfid, (partfile, part) in subfiles.items():
        if DEBUG:
            print("\nPart:", partfile)

        basepath = os.path.splitext(partfile)[0]     # full path, sans ext

        if sfid in embedded_parts:
            if DEBUG:
                print(partfile, "was already embedded in html")
            continue

        maintype = part.get_content_maintype()
        if maintype == "image":
            image_files.append(partfile)
            continue
        if maintype != "application":
            continue

        htmlfilename = basepath + ".html"
        subtype = part.get_content_subtype()
        if DEBUG:
            print("Application subtype:", subtype)

        is_word = ("msword" in subtype or "ms-word" in subtype)
        if is_word and USE_WVHTML_FOR_DOC:
            mysubprocess.call(["wvHtml", partfile, htmlfilename])
            call_some_browser(htmlfilename)
            continue

        # Unfortunately, unoconv can't convert excel files:
        # it hangs forever trying.
        if (is_word or subtype ==
            "vnd.openxmlformats-officedocument.wordprocessingml.document"
            or subtype == "vnd.oasis.opendocument.text"):
            mysubprocess.call(["unoconv", "-f", "html",
                               "-T", UNOCONV_STARTUP_TIME,
                               "-o", htmlfilename, partfile])
            call_some_browser(htmlfilename)
            continue

        # unoconv conversions from powerpoint to HTML drop all images.
        # Try converting to PDF instead:
        if subtype in (
                "vnd.ms-powerpoint",
                "vnd.openxmlformats-officedocument.presentationml.presentation"):
            pdffile = basepath + ".pdf"
            mysubprocess.call(["unoconv", "-f", "pdf",
                               "-o", pdffile, partfile])
            partfile = pdffile

        if subtype == "pdf" or partfile.endswith("pdf"):
            if WORKING_BROWSER and \
               BROWSERS[WORKING_BROWSER]['CONVERT_PDF_TO_HTML']:
                print("Calling pdftohtml and delaying browser")
                mysubprocess.call(["pdftohtml", "-s", partfile])
                print("pdftohtml exited. Did it work?")
                # pdftohtml is idiotic about output filename
                # and won't let you override it:
                converted = basepath + "-html.html"
                # List the result without going through a shell.
                mysubprocess.call(["ls", "-l", converted])
                call_some_browser(converted)
            else:
                call_some_browser(partfile)

    if not image_files:
        return

    if IMAGE_VIEWER:
        if DEBUG:
            print("Calling", IMAGE_VIEWER, "on", image_files)
        try:
            mysubprocess.call_bg([IMAGE_VIEWER] + IMAGE_VIEWER_ARGS
                                 + image_files)
            return
        except OSError as e:
            # e.g. the viewer isn't installed: use the browser instead.
            print("Couldn't run %s (%s); showing images in the browser"
                  % (IMAGE_VIEWER, e))

    for img in image_files:
        call_some_browser(img)


def view_html_message(f, tmpdir):
    """Split an email message into files in tmpdir and display it.

    f is the path of a file holding a single message, or of a maildir
    containing one; if f is None or empty the message is read from
    standard input. tmpdir is an existing directory to write into.

    Every non-container part is saved in tmpdir. Each HTML part is
    rewritten so its cid: references point at the saved files and is
    opened in a browser; remaining attachments (office documents, PDF,
    images) are then converted as needed and shown as well.

    Raises FileNotFoundError if f is a maildir with no message in it.
    """
    msg = _load_message(f)

    if DEBUG:
        _print_structure(msg)

    subfiles, html_parts = _save_parts(msg, tmpdir)

    # We're done saving the parts. It's time to save the HTML part(s),
    # with img tags rewritten to refer to the files we just saved.
    embedded_parts = _write_html_parts(html_parts, subfiles, tmpdir)

    # Done with htmlparts.
    # Now handle any parts that aren't embedded inside HTML parts.
    # This includes conversions from Word or PDF, but also image attachments.
    if DEBUG:
        print()
        print("subfiles:", subfiles)
        print("Parts already embedded:", embedded_parts)
        print("\n************************************\n")

    _view_attachments(subfiles, embedded_parts)
# For debugging:
class mysubprocess:
    """Small wrappers around subprocess that announce each command
    when DEBUG is on.
    """

    @staticmethod
    def call(arr):
        """Run the command list arr and wait for it to finish.
        The exit status is not returned.
        """
        if DEBUG:
            announcement = "\n========= Calling: %s" % (arr,)
            print(announcement)
        subprocess.call(arr)

    @staticmethod
    def call_bg(arr):
        """Start the command list arr without waiting for it.
        No shell is used, and stdin/stdout/stderr are not redirected.
        """
        if DEBUG:
            announcement = "\n========= Calling in background: %s" % (arr,)
            print(announcement)
        popen_options = dict(shell=False,
                             stdin=None, stdout=None, stderr=None)
        subprocess.Popen(arr, **popen_options)
if __name__ == '__main__':
    import tempfile

    # Deal with -d before viewing anything, so that it applies to every
    # message wherever it appears on the command line, and so that
    # "-d" with no file names still reads the message from stdin
    # (the case when a mailer pipes the message to us).
    args = sys.argv[1:]
    if '-d' in args:
        DEBUG = True
    msgfiles = [arg for arg in args if arg != '-d']

    # All messages share one temp directory. It is not removed here:
    # the viewers are started in the background and still need the files.
    tmpdir = tempfile.mkdtemp()

    if msgfiles:
        for f in msgfiles:
            view_html_message(f, tmpdir)
    else:
        view_html_message(None, tmpdir)