{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gmail Experiment: Data Preprocessing\n",
    "Author: Leon Yin and Surya Mattu<br>\n",
    "Links: [GitHub](https://github.com/the-markup/investigation-wheres-my-email/blob/master/notebooks/1-data-preprocessing.ipynb) | [nbviewer](https://nbviewer.jupyter.org/github/the-markup/investigation-wheres-my-email/blob/master/notebooks/1-data-preprocessing.ipynb) | [Story](https://themarkup.org/google-the-giant/2020/02/26/wheres-my-email) <br>\n",
    "\n",
    "This notebook describes the data preprocessing sets for our Gmail promotions experiment. Documentation for this dataset can be found in Github [here](https://github.com/the-markup/investigation-wheres-my-email#data), read more about the context of why and how we collected this dataset in our [paper](https://themarkup.org/google-the-giant/2020/02/26/show-your-work-wheres-my-email)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import glob\n",
    "import gzip\n",
    "\n",
    "import mailbox\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "\n",
    "sys.path.append('..')\n",
    "from utils.preprocessing import (\n",
    "    find_email,\n",
    "    find_name,\n",
    "    find_tld_from_email,\n",
    "    is_primary,\n",
    "    get_email_id,\n",
    "    get_text,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "gzip_directory = '../data/input/google_takeout_mboxes_gzipped/'\n",
    "fn_newsletter_categories = '../data/input/newsletter_categories.csv'\n",
    "\n",
    "# outputs\n",
    "mbox_directory = '../data/input/google_takeout_mboxes/'\n",
    "fn_output = '../data/intermediates/email_metadata_merged_with_newsletters.csv.gz'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Decompressing mbox files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6/6 [00:00<00:00, 14952.96it/s]\n"
     ]
    }
   ],
   "source": [
    "# make a directory for the uncompressed mbox files. \n",
    "os.makedirs(mbox_directory, exist_ok=True)\n",
    "\n",
    "# Let's identify the gzipped mbox files\n",
    "mboxes_gzipped = glob.glob(os.path.join(gzip_directory, '*.mbox.gz'))\n",
    "\n",
    "# un-gzip each, and write them to a new destination\n",
    "for fn in tqdm(mboxes_gzipped):\n",
    "    fn_out = fn.replace(gzip_directory, \n",
    "                        mbox_directory).replace('.gz', '')\n",
    "    # un-gzip if does not exist\n",
    "    if os.path.exists(fn_out):\n",
    "        continue \n",
    "    # write to a non-gzipped file.\n",
    "    with gzip.open(fn, 'rt') as f_:\n",
    "        with open(fn_out, 'w') as f:\n",
    "            for line in f_:\n",
    "                f.write(line)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extracting fields and preprocessing email metadata\n",
    "If you want more information about the available headers, they are described well over [here](https://web.archive.org/save/https://www.makeuseof.com/tag/how-to-trace-your-emails-back-to-the-source/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Here let's identify the input files we're working with\n",
    "mboxes = glob.glob(os.path.join(mbox_directory, '*.mbox'))\n",
    "len(mboxes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# what we'll be keeping from each mbox record\n",
    "keep_from_mbox = [\n",
    "    'Date', \n",
    "    'X-Gmail-Labels', \n",
    "    'X-GM-THRID', \n",
    "    'To', \n",
    "    'From', \n",
    "    'Delivered-To', \n",
    "    'Received', \n",
    "    \"Subject\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2929it [00:24, 120.42it/s]\n",
      "1934it [00:16, 119.86it/s]\n",
      "2473it [00:20, 119.36it/s]\n",
      "448it [00:03, 127.72it/s]\n",
      "3702it [00:31, 117.17it/s]\n",
      "3387it [00:27, 121.04it/s]\n"
     ]
    }
   ],
   "source": [
    "message_list = []\n",
    "for fn in mboxes:\n",
    "    mb = mailbox.mbox(fn)\n",
    "    for _message in tqdm(mb.itervalues()):\n",
    "        # turn the _messages into a dictionary\n",
    "        email_metadata = dict(_message.items())\n",
    "        \n",
    "        # filter out K-V pairs we're not interested in\n",
    "        email_metadata = {\n",
    "            k : v for k, v in email_metadata.items() \n",
    "            if k in keep_from_mbox\n",
    "        }\n",
    "        \n",
    "        # parse text fields, and append to email metadata\n",
    "        text_meta = get_text(_message)\n",
    "        email_metadata = {**email_metadata, **text_meta}\n",
    "        message_list.append(email_metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14873"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(message_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5417"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the dataframe from the message list and convert cols to datetime\n",
    "messages = pd.DataFrame(message_list)\n",
    "messages.Date = pd.to_datetime(messages.Date, utc=True)\n",
    "messages.loc[:, 'markup_id'] = messages.apply(get_email_id, axis=1)\n",
    "messages.drop_duplicates('markup_id', inplace=True)\n",
    "\n",
    "# this is the start of our experiment\n",
    "messages = messages[messages['Date'] > '2019-10-16']\n",
    "messages.reset_index(drop=True, inplace=True)\n",
    "len(messages)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert the labels to binary columns and merge back in\n",
    "vect = MultiLabelBinarizer()\n",
    "X = vect.fit_transform(messages[\"X-Gmail-Labels\"].str.split(','))\n",
    "gmail_category_matrix = pd.DataFrame(X, columns=vect.classes_)\n",
    "messages = messages.join(gmail_category_matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We determed which emails are in the primary inbox by looking at the `X-Gmail-Labels` in `gmail_category_matrix`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[0;31mSignature:\u001b[0m \u001b[0mis_primary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;31mSource:\u001b[0m   \n",
       "\u001b[0;32mdef\u001b[0m \u001b[0mis_primary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0;34m\"\"\"\u001b[0m\n",
       "\u001b[0;34m    Determines if a row is in the Primary inbox. \u001b[0m\n",
       "\u001b[0;34m    These are emails that are in the Inbox, \u001b[0m\n",
       "\u001b[0;34m    but aren't in Spam, Promotions, or Trash. \u001b[0m\n",
       "\u001b[0;34m    \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m        \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m           \u001b[0;32mnot\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Spam'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m           \u001b[0;32mnot\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Category Promotions'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m           \u001b[0;32mnot\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Trash'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m           \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Inbox'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m        \u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m            \u001b[0;32mreturn\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m        \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m        \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0;32mreturn\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;31mFile:\u001b[0m      ~/code/production-grade/promotions/utils/preprocessing.py\n",
       "\u001b[0;31mType:\u001b[0m      function\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "??is_primary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "messages.loc[:, \"Primary\"] = messages.apply(is_primary, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean up email headers\n",
    "messages = messages[~messages['From'].isnull()]\n",
    "messages.loc[:, \"From_Email\"] = messages['From'].apply(find_email)\n",
    "messages.loc[:, \"From_Domain\"] = messages[\"From_Email\"].apply(find_tld_from_email)\n",
    "messages.loc[:, \"From_Name\"] = messages[\"From\"].apply(find_name)\n",
    "messages.loc[:, \"To_Email\"] = messages['To'].apply(find_email).str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# late presidential candidates we added on 2/4. No email aliases, so we add them.\n",
    "messages.loc[messages['From_Domain'].str.contains('bloomberg'),\n",
    "            'To_Email'] = 'ajcorrigan88+mbloomberg@gmail.com'\n",
    "\n",
    "messages.loc[messages['From_Domain'].str.contains('steyer'),\n",
    "            'To_Email'] = 'ajcorrigan88+tsteyer@gmail.com'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merging Metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "231"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# newsletters we signed up for and which alias we used.\n",
    "newsletter_categories = pd.read_csv(fn_newsletter_categories)\n",
    "len(newsletter_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Advocacy organization or think tank    73\n",
       "House battleground campaign            71\n",
       "House battleground official            70\n",
       "Presidential candidate                 17\n",
       "Name: Category, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# breakdown of entities we signed up for\n",
    "newsletter_categories.Category.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "X-GM-THRID                                           1654372657559145454\n",
       "X-Gmail-Labels                           Spam,Category Promotions,Unread\n",
       "Delivered-To                               ajcorrigan88+jwalsh@gmail.com\n",
       "Received               from nationbuilder.com (unknown [52.21.208.178...\n",
       "Date                                           2019-12-30 19:05:11+00:00\n",
       "From                                        Joe Walsh <joe@joewalsh.org>\n",
       "Subject                                                What's he hiding?\n",
       "To                                         ajcorrigan88+jwalsh@gmail.com\n",
       "text                   Hi Friend,\\nIt was hard to miss Donald Trump’s...\n",
       "script                                                                []\n",
       "style                                                               None\n",
       "markup_id                               d07816a54b6b0aa65faf4bcf0c142f2e\n",
       "Archived                                                               0\n",
       "Category Personal                                                      0\n",
       "Category Promotions                                                    1\n",
       "Category Updates                                                       0\n",
       "Important                                                              0\n",
       "Inbox                                                                  0\n",
       "Opened                                                                 0\n",
       "Spam                                                                   1\n",
       "Unread                                                                 1\n",
       "Primary                                                                0\n",
       "From_Email                                              joe@joewalsh.org\n",
       "From_Domain                                                 joewalsh.org\n",
       "From_Name                                                      Joe Walsh\n",
       "To_Email                                   ajcorrigan88+jwalsh@gmail.com\n",
       "Name                                                           Joe Walsh\n",
       "Website                                         https://www.joewalsh.org\n",
       "Email                                      ajcorrigan88+jwalsh@gmail.com\n",
       "Category                                          Presidential candidate\n",
       "Entity_ID                               3ee34b65e61fb39b1c6bf106224eb388\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = messages.merge(newsletter_categories,  how='left',\n",
    "                    left_on='To_Email', right_on='Email')\n",
    "df.iloc[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create boolean index to filter each of these categories\n",
    "official = newsletter_categories['Category'] == 'House battleground official'\n",
    "campaign = newsletter_categories['Category'] == 'House battleground campaign'\n",
    "\n",
    "# to update the Website column\n",
    "name2official_site = dict(zip(\n",
    "    newsletter_categories[official].Name, \n",
    "    newsletter_categories[official].Website\n",
    "))\n",
    "\n",
    "name2campaign_site = dict(zip(\n",
    "    newsletter_categories[campaign].Name,\n",
    "    newsletter_categories[campaign].Website\n",
    "))\n",
    "\n",
    "\n",
    "# to update the Entity_ID columm\n",
    "name2official_entity_id= dict(zip(\n",
    "    newsletter_categories[official].Name, \n",
    "    newsletter_categories[official].Entity_ID\n",
    "))\n",
    "\n",
    "name2campaign_entity_id= dict(zip(\n",
    "    newsletter_categories[campaign].Name,\n",
    "    newsletter_categories[campaign].Entity_ID\n",
    "))\n",
    "\n",
    "del official, campaign"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's what one of these dictionaries looks like:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'http://annwagner.com/'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "name2campaign_site['Ann Wagner']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'fb98f8fa9305ff53ef495e5777d9837f'"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "name2campaign_entity_id['Ann Wagner']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# what subset of the data are we re-assigning?\n",
    "df.loc[(df['From_Domain'].str.contains('.gov')) &\n",
    "       (df['Category'].str.contains('House battleground')), \n",
    "       # what column is being re-assigned?\n",
    "       'Category'] = 'House battleground official'\n",
    "\n",
    "df.loc[(~df['From_Domain'].str.contains('.gov')) &\n",
    "       (df['Category'].str.contains('House battleground')), \n",
    "       'Category'] = 'House battleground campaign'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create boolean index to filter each of these categories\n",
    "campaign = df.Category == 'House battleground campaign'\n",
    "official = df.Category == 'House battleground official'\n",
    "\n",
    "# update Website column\n",
    "df.loc[campaign, \n",
    "       'Website'] = df.loc[campaign].Name.replace(name2campaign_site)\n",
    "df.loc[official, \n",
    "       'Website'] = df.loc[official].Name.replace(name2official_site)\n",
    "\n",
    "# update Entity_ID\n",
    "df.loc[campaign, \n",
    "       'Entity_ID'] = df.loc[campaign].Name.replace(name2campaign_entity_id)\n",
    "df.loc[official, \n",
    "       'Entity_ID'] = df.loc[official].Name.replace(name2official_entity_id)\n",
    "\n",
    "del campaign, official"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.drop_duplicates(subset='markup_id')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's do a quick spot check for one of the House members:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "info@annwagner.com          32\n",
       "mo02awima@mail.house.gov    17\n",
       "listserv@ls1.house.gov       1\n",
       "Name: From_Email, dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check against the original data\n",
    "email_alias = 'ajcorrigan88+awagneroffice@gmail.com'\n",
    "messages[messages.To_Email == email_alias].From_Email.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "House battleground campaign    32\n",
       "House battleground official    18\n",
       "Name: Category, dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Which sender category did we reassign?\n",
    "df[df.Name == 'Ann Wagner'].Category.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "fb98f8fa9305ff53ef495e5777d9837f    32\n",
       "0c5cea595602af6e7b7705e77d393d59    18\n",
       "Name: Entity_ID, dtype: int64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the entity ID also\n",
    "df[df.Name == 'Ann Wagner'].Entity_ID.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Website</th>\n",
       "      <th>Email</th>\n",
       "      <th>Category</th>\n",
       "      <th>Entity_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>Ann Wagner</td>\n",
       "      <td>http://annwagner.com/</td>\n",
       "      <td>ajcorrigan88+awagneroffice@gmail.com</td>\n",
       "      <td>House battleground campaign</td>\n",
       "      <td>fb98f8fa9305ff53ef495e5777d9837f</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Name                Website                                 Email  \\\n",
       "30  Ann Wagner  http://annwagner.com/  ajcorrigan88+awagneroffice@gmail.com   \n",
       "\n",
       "                       Category                         Entity_ID  \n",
       "30  House battleground campaign  fb98f8fa9305ff53ef495e5777d9837f  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eID = 'fb98f8fa9305ff53ef495e5777d9837f'\n",
    "newsletter_categories[newsletter_categories.Entity_ID == eID]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filtering out ambiguous emails and Google"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "44"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# how many emails from Gmail?\n",
    "len(df[df.From_Domain == 'google.com'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "235"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# filter out emails from senders who didn't send us an email with an alias\n",
    "len(df[(df.Name.isnull()) & (df.From_Domain != 'google.com')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "frc.org                   72\n",
       "google.com                44\n",
       "govdelivery.com           35\n",
       "frcaction.org             31\n",
       "congressnewsletter.net    23\n",
       "crp.org                   22\n",
       "brianfitzpatrick.com      13\n",
       "fotf.com                  12\n",
       "virginiazoo.org           10\n",
       "teapartypatriots.org       4\n",
       "aarp.org                   4\n",
       "house.gov                  3\n",
       "patriotsforperry.com       1\n",
       "girlscouts.org             1\n",
       "nrahq.org                  1\n",
       "vn.va                      1\n",
       "saymine.com                1\n",
       "youtube.com                1\n",
       "Name: From_Domain, dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# emails from senders we're removing\n",
    "df[df.Name.isnull()].From_Domain.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5134"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df[~df.Name.isnull()]\n",
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "X-GM-THRID                                           1654372657559145454\n",
       "X-Gmail-Labels                           Spam,Category Promotions,Unread\n",
       "Date                                           2019-12-30 19:05:11+00:00\n",
       "Subject                                                What's he hiding?\n",
       "text                   Hi Friend,\\nIt was hard to miss Donald Trump’s...\n",
       "markup_id                               d07816a54b6b0aa65faf4bcf0c142f2e\n",
       "Category Personal                                                      0\n",
       "Category Promotions                                                    1\n",
       "Category Updates                                                       0\n",
       "Inbox                                                                  0\n",
       "Spam                                                                   1\n",
       "Primary                                                                0\n",
       "From_Email                                              joe@joewalsh.org\n",
       "From_Domain                                                 joewalsh.org\n",
       "From_Name                                                      Joe Walsh\n",
       "To_Email                                   ajcorrigan88+jwalsh@gmail.com\n",
       "Name                                                           Joe Walsh\n",
       "Website                                         https://www.joewalsh.org\n",
       "Category                                          Presidential candidate\n",
       "Entity_ID                               3ee34b65e61fb39b1c6bf106224eb388\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove these columns\n",
    "drop_cols = [\n",
    "    'Email',\n",
    "    'script',\n",
    "    'style',\n",
    "    'Received',\n",
    "    'Delivered-To',\n",
    "    'To', 'From', \n",
    "    'Unread',\n",
    "    'Opened',\n",
    "    'Archived',\n",
    "    'Important',    \n",
    "]\n",
    "\n",
    "df = df[[c for c in df.columns if c not in drop_cols]]\n",
    "df.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5134"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's save the dataset as a CSV.\n",
    "df.to_csv(fn_output, index=False, compression='gzip')\n",
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Summary statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def summary(messages):\n",
    "    n_all = len(messages)\n",
    "    n_ppro = len(messages[(messages['Category Promotions']) ==1])\n",
    "    n_pro = len(messages[(messages['Category Promotions'] == 1) & \n",
    "                         (messages['Spam'] != 1) ] )\n",
    "    n_u = len(messages[messages['Category Updates'] == 1])\n",
    "    n_s = len(messages[messages['Spam'] == 1])\n",
    "    n_pri = len(messages[messages['Primary'] == 1])\n",
    "    first_email = messages.Date.min()\n",
    "    last_email = messages.Date.max()\n",
    "    senders = messages['From_Email'].nunique()\n",
    "    alias = messages['To_Email'].nunique()\n",
    "    \n",
    "    print(f\"There are a total of {n_all} emails\\n\"\n",
    "          f\"Sent between {first_email.strftime('%Y-%m-%d')} and {last_email.strftime('%Y-%m-%d %H:%M')}\\n\"\n",
    "          f\"From {senders} email addresses to {alias} alias\\n\\n\"\n",
    "          \"Categories:\\n\"\n",
    "          f\"{n_pri} emails or {n_pri / n_all * 100:.2f}% are in the primary inbox\\n\"\n",
    "          f\"{n_pro} emails or {n_pro / n_all * 100:.2f}% are in the promotions tab\\n\"\n",
    "          f\"{n_ppro} emails or {n_ppro / n_all * 100:.2f}% are promotions (includes spam)\\n\"\n",
    "          f\"{n_u} emails or {n_u / n_all * 100:.2f}% are in the updates tab (includes spam)\\n\"\n",
    "          f\"{n_s} emails or {n_s / n_all * 100:.2f}% are in spam\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are a total of 5134 emails\n",
      "Sent between 2019-10-16 and 2020-02-12 19:17\n",
      "From 229 email addresses to 136 alias\n",
      "\n",
      "Categories:\n",
      "546 emails or 10.63% are in the primary inbox\n",
      "2551 emails or 49.69% are in the promotions tab\n",
      "3961 emails or 77.15% are promotions (includes spam)\n",
      "1157 emails or 22.54% are in the updates tab (includes spam)\n",
      "2037 emails or 39.68% are in spam\n"
     ]
    }
   ],
   "source": [
    "summary(df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}