import imaplib
from email.parser import BytesParser
from pprint import pprint
import email.header
import time
import json
import csv

# By Gerry Jenkins, see my channel and subscribe (please) @ youtube.com/gjenkinslbcc
# See the blog article https://teklern.blogspot.com/2017/11/download-all-your-email-information.html for instructions.
# Just add your username and password, configure ouput_filename (path) and output_type ('json' or 'csv') below, and run.

# Settings
username = 'put your email username here i.e. user@gmail.com'
password = 'password here'

# path to the output file name (leave off the extension)
ouput_filename = '/Users/you/Desktop/all_email'  # output of all data; the extension depends on the next variable
output_type = 'json'  # any other setting will output csv with a .csv extension
imapAddress = 'imap.gmail.com'
column_names = ['n', 'From', 'To', 'Subject', 'Date', 'Received',
                'Rfc822msgid', 'Size', 'uid', 'Attachments',
                'text/plain', 'text/html', ]
chunk = 1000  # number of emails to ask for in each fetch from the imap server
start = 0     # start from first message
endAt = 50    # None  # set to the last message number, or None to get the whole email account


def f_recieved(s):
    return {'Rfc822msgid': f'Rfc822msgid:{s}'}  # just paste this into the gmail search box to find the message


# specify the header parts to save and any conversion functions on them
key_map = {'From': None, 'To': None, 'Subject': None, 'Date': None,
           'Received': None, 'Message-ID': f_recieved, }


def parse_parts(msg, key_map):
    '''return {key: msg[header_key]} or {parse_fun(msg[header_key])} as instructed in key_map'''
    parts = {}
    for hkey in key_map:
        raw = msg[hkey]
        if raw:
            if isinstance(raw, email.header.Header):
                raw = str(raw)  # to fix non-ascii parts
            f = key_map[hkey]
            if f:
                fparts = f(raw)
                for k in fparts:
                    parts[k] = fparts[k]
            else:
                parts[hkey] = raw
    return parts


def decode_part(part, mime_type):
    # decode a part using the correct character encoding. This was tricky.
    charset = part.get_content_charset()
    if part.get_content_type() == mime_type:
        part_str = part.get_payload(decode=1)
        if charset is None:   # this is when the coding is not in the email data
            charset = 'utf-8'  # assume utf-8 then
        try:
            return part_str.decode(charset, 'replace')  # and try with replacement
        except:
            # on fail, output the message id in a form that works with the gmail search box
            # (pos and parts here refer to the module-level values set in the main loop)
            print(f"** pos {pos} {parts['Rfc822msgid']}: Decode Error, {mime_type} part skipped")
            pprint(part_str)  # and print what caused the error
            print('----------')
            return ""  # no part if error
    return ""


def decode_email(msg_str, pos, key_map):
    # process all the parts of one email and build a dict record
    filenames = None
    p = BytesParser()
    message = p.parsebytes(msg_str)  # get header
    parts = parse_parts(message, key_map)  # add header parts specified in key_map
    parts['Size'] = len(msg_str)
    plain_body = ''
    html_body = ''
    for part in message.walk():
        plain_body += decode_part(part, 'text/plain')
        if len(plain_body) > 0:
            html_body = ""
        else:
            html_body += decode_part(part, 'text/html')
        fn = part.get_filename()
        if fn:
            if filenames is None:
                filenames = []
            filenames.append(fn)
    if filenames:
        parts['Attachments'] = filenames
    if len(plain_body) > 0:
        parts['text/plain'] = plain_body
    elif len(html_body) > 0:
        parts['text/html'] = html_body
    return parts


def store_json(file, recs):
    with open(file + '.json', 'w') as f:
        f.write(json.dumps(recs, sort_keys=True, indent=4))


def store_csv(file, recs):
    with open(file + '.csv', 'w') as f:
        dict_writer = csv.DictWriter(f, column_names)
        dict_writer.writeheader()
        dict_writer.writerows(recs)


if __name__ == '__main__':
    t0 = time.time()
    ms = imaplib.IMAP4_SSL(imapAddress)  # open imap session ms
    ms.login(username, password)
    if ms.state == "AUTH":
        print("logged in OK")
    else:
        print("login Failed")
        exit(1)

    ms.select('"[Gmail]/All Mail"')  # select the All Mail folder; this is specific to gmail!
    # NOTE: the double quotes are part of the string passed to select

    result, data = ms.uid('search', None, 'ALL')  # returns the list of uids for all emails
    uids = data[0].split()  # parse into array
    n = len(uids)           # get number of all emails
    if endAt:
        n = endAt  # override n if endAt is set

    recs = []
    for i in range(start, n, chunk):  # fetch `chunk` emails each time
        srange = f'{i + 1}:{min(i + chunk, n)}'
        resp, data = ms.fetch(srange, '(RFC822)')
        for id, msg in enumerate((m[1] for m in data if isinstance(m, tuple))):
            pos = i + id + 1
            parts = decode_email(msg, pos, key_map)
            parts['uid'] = str(int(uids[pos - 1]))
            parts['n'] = pos
            recs.append(parts)
            try:
                json.dumps(parts)
            except:  # catch some rare errors here
                print(f"** pos {pos} {parts['Rfc822msgid']}: json dump fail")
                pprint(parts)
                print('---------')
        t1 = time.time()
        elapsed_mins = (t1 - t0) / 60  # minutes since start
        print(f'@ {pos}/{n} {pos / n * 100:.1f}% elapsed: {elapsed_mins:.2f} mins')

    ms.logout()
    if output_type == 'json':
        store_json(ouput_filename, recs)
    else:
        store_csv(ouput_filename, recs)
    print('*** DONE ***')
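
# --- Optional sanity check (a minimal sketch, not part of the original script) ---
# Assumes the script above has already been run with output_type = 'json', so that
# ouput_filename + '.json' exists. The keys used below ('n', 'From', 'Subject') are
# ones the script writes, but any of them may be missing on individual records,
# hence the .get() lookups. Uncomment to inspect the first few saved records:
#
#     import json
#     with open(ouput_filename + '.json') as f:
#         emails = json.load(f)
#     print(f'{len(emails)} records loaded')
#     for rec in emails[:5]:
#         print(rec.get('n'), rec.get('From', '?'), rec.get('Subject', '(no subject)'))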