{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tweet summary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare the tweet data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_001.json.gz\n",
      "DEBUG:root:Loaded 50000\n",
      "DEBUG:root:Loaded 100000\n",
      "DEBUG:root:Loaded 150000\n",
      "DEBUG:root:Loaded 200000\n",
      "DEBUG:root:Loaded 250000\n",
      "INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_002.json.gz\n",
      "INFO:root:Loading from tweets/7bff8603fb4a49d5953197361d548346_001.json.gz\n",
      "DEBUG:root:Loaded 300000\n",
      "DEBUG:root:Loaded 350000\n",
      "DEBUG:root:Loaded 400000\n",
      "DEBUG:root:Loaded 450000\n",
      "INFO:root:Loading from tweets/b3f330f5b6cc4572b6d7dabc3752b2b9_001.json.gz\n",
      "DEBUG:root:Loaded 500000\n",
      "DEBUG:root:Loaded 550000\n",
      "DEBUG:root:Loaded 600000\n",
      "DEBUG:root:Loaded 650000\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "tweet_id            650350\n",
       "user_id             650350\n",
       "screen_name         650350\n",
       "tweet_created_at    650350\n",
       "user_created_at     650350\n",
       "tweets_to_date      650350\n",
       "tweet_type          650350\n",
       "dtype: int64"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import logging\n",
    "from dateutil.parser import parse as date_parse\n",
    "from utils import load_tweet_df, tweet_type\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "logger = logging.getLogger()\n",
    "logger.setLevel(logging.DEBUG)\n",
    "\n",
    "# Set float format so doesn't display scientific notation\n",
    "pd.options.display.float_format = '{:20,.2f}'.format\n",
    "\n",
    "def tweet_transform(tweet):\n",
    "    \"\"\"Flatten one tweet dict into the row of fields kept for the summary.\n",
    "\n",
    "    Reads `id_str` and `created_at` from the tweet, and `id_str`, `screen_name`,\n",
    "    `created_at` and `statuses_count` from its `user` object. Both creation\n",
    "    timestamps are parsed with `date_parse`; the tweet is classified by `tweet_type`.\n",
    "\n",
    "    Returns a dict keyed by tweet_id, tweet_created_at, user_id, screen_name,\n",
    "    user_created_at, tweets_to_date and tweet_type.\n",
    "    \"\"\"\n",
    "    row = {}\n",
    "    row['tweet_id'] = tweet['id_str']\n",
    "    row['tweet_created_at'] = date_parse(tweet['created_at'])\n",
    "    user = tweet['user']\n",
    "    row['user_id'] = user['id_str']\n",
    "    row['screen_name'] = user['screen_name']\n",
    "    row['user_created_at'] = date_parse(user['created_at'])\n",
    "    # statuses_count is the count reported on the tweet's user object.\n",
    "    row['tweets_to_date'] = user['statuses_count']\n",
    "    row['tweet_type'] = tweet_type(tweet)\n",
    "    return row\n",
    "\n",
    "# Columns passed to load_tweet_df; the names match the keys returned by tweet_transform.\n",
    "# NOTE(review): presumably this also fixes the column order of the frame - confirm in utils.\n",
    "tweet_columns = ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at',\n",
    "                 'user_created_at', 'tweets_to_date', 'tweet_type']\n",
    "tweet_df = load_tweet_df(tweet_transform, tweet_columns)\n",
    "# Non-null count per column.\n",
    "tweet_df.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### View the top of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tweet_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>screen_name</th>\n",
       "      <th>tweet_created_at</th>\n",
       "      <th>user_created_at</th>\n",
       "      <th>tweets_to_date</th>\n",
       "      <th>tweet_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>859463382042378240</td>\n",
       "      <td>2343897943</td>\n",
       "      <td>AmberCStrong</td>\n",
       "      <td>2017-05-02 17:43:32+00:00</td>\n",
       "      <td>2014-02-14 17:33:36+00:00</td>\n",
       "      <td>1701</td>\n",
       "      <td>original</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>859803200152588288</td>\n",
       "      <td>307982591</td>\n",
       "      <td>JaxAlemany</td>\n",
       "      <td>2017-05-03 16:13:51+00:00</td>\n",
       "      <td>2011-05-30 16:43:13+00:00</td>\n",
       "      <td>6328</td>\n",
       "      <td>original</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>859788527705493504</td>\n",
       "      <td>307982591</td>\n",
       "      <td>JaxAlemany</td>\n",
       "      <td>2017-05-03 15:15:33+00:00</td>\n",
       "      <td>2011-05-30 16:43:13+00:00</td>\n",
       "      <td>6328</td>\n",
       "      <td>quote</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>859788479076732930</td>\n",
       "      <td>307982591</td>\n",
       "      <td>JaxAlemany</td>\n",
       "      <td>2017-05-03 15:15:22+00:00</td>\n",
       "      <td>2011-05-30 16:43:13+00:00</td>\n",
       "      <td>6328</td>\n",
       "      <td>original</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>859781841955500032</td>\n",
       "      <td>307982591</td>\n",
       "      <td>JaxAlemany</td>\n",
       "      <td>2017-05-03 14:48:59+00:00</td>\n",
       "      <td>2011-05-30 16:43:13+00:00</td>\n",
       "      <td>6328</td>\n",
       "      <td>retweet</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             tweet_id     user_id   screen_name          tweet_created_at  \\\n",
       "0  859463382042378240  2343897943  AmberCStrong 2017-05-02 17:43:32+00:00   \n",
       "1  859803200152588288   307982591    JaxAlemany 2017-05-03 16:13:51+00:00   \n",
       "2  859788527705493504   307982591    JaxAlemany 2017-05-03 15:15:33+00:00   \n",
       "3  859788479076732930   307982591    JaxAlemany 2017-05-03 15:15:22+00:00   \n",
       "4  859781841955500032   307982591    JaxAlemany 2017-05-03 14:48:59+00:00   \n",
       "\n",
       "            user_created_at  tweets_to_date tweet_type  \n",
       "0 2014-02-14 17:33:36+00:00            1701   original  \n",
       "1 2011-05-30 16:43:13+00:00            6328   original  \n",
       "2 2011-05-30 16:43:13+00:00            6328      quote  \n",
       "3 2011-05-30 16:43:13+00:00            6328   original  \n",
       "4 2011-05-30 16:43:13+00:00            6328    retweet  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the loaded tweets (head() defaults to five rows).\n",
    "tweet_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare the user data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tweets in dataset for each user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>tweet_type</th>\n",
       "      <th>original</th>\n",
       "      <th>quote</th>\n",
       "      <th>reply</th>\n",
       "      <th>retweet</th>\n",
       "      <th>tweets_in_dataset</th>\n",
       "      <th>tweets_in_dataset_bin</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1001991865</th>\n",
       "      <td>12.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>3.00</td>\n",
       "      <td>35.00</td>\n",
       "      <td>51.00</td>\n",
       "      <td>Bottom 90%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1002229862</th>\n",
       "      <td>35.00</td>\n",
       "      <td>5.00</td>\n",
       "      <td>2.00</td>\n",
       "      <td>99.00</td>\n",
       "      <td>141.00</td>\n",
       "      <td>Bottom 90%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100802089</th>\n",
       "      <td>4.00</td>\n",
       "      <td>3.00</td>\n",
       "      <td>5.00</td>\n",
       "      <td>12.00</td>\n",
       "      <td>24.00</td>\n",
       "      <td>Bottom 90%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100860790</th>\n",
       "      <td>117.00</td>\n",
       "      <td>19.00</td>\n",
       "      <td>9.00</td>\n",
       "      <td>215.00</td>\n",
       "      <td>360.00</td>\n",
       "      <td>Bottom 90%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1009749229</th>\n",
       "      <td>79.00</td>\n",
       "      <td>85.00</td>\n",
       "      <td>34.00</td>\n",
       "      <td>156.00</td>\n",
       "      <td>354.00</td>\n",
       "      <td>Bottom 90%</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "tweet_type             original                quote                reply  \\\n",
       "user_id                                                                     \n",
       "1001991865                12.00                 1.00                 3.00   \n",
       "1002229862                35.00                 5.00                 2.00   \n",
       "100802089                  4.00                 3.00                 5.00   \n",
       "100860790                117.00                19.00                 9.00   \n",
       "1009749229                79.00                85.00                34.00   \n",
       "\n",
       "tweet_type              retweet    tweets_in_dataset tweets_in_dataset_bin  \n",
       "user_id                                                                     \n",
       "1001991865                35.00                51.00            Bottom 90%  \n",
       "1002229862                99.00               141.00            Bottom 90%  \n",
       "100802089                 12.00                24.00            Bottom 90%  \n",
       "100860790                215.00               360.00            Bottom 90%  \n",
       "1009749229               156.00               354.00            Bottom 90%  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count tweets per (user_id, tweet_type), then pivot the tweet types into columns.\n",
    "tweet_type_counts = tweet_df[['user_id', 'tweet_type']].groupby(['user_id', 'tweet_type']).size()\n",
    "# A user with no tweets of a given type gets NaN from unstack(); treat that as zero.\n",
    "user_tweet_count_df = tweet_type_counts.unstack().fillna(0)\n",
    "# Total tweets per user across the four tweet types.\n",
    "user_tweet_count_df['tweets_in_dataset'] = (\n",
    "    user_tweet_count_df['original']\n",
    "    + user_tweet_count_df['quote']\n",
    "    + user_tweet_count_df['reply']\n",
    "    + user_tweet_count_df['retweet']\n",
    ")\n",
    "# Quantile bins on the total: up to the 90th percentile, the next 9%, and the top 1%.\n",
    "user_tweet_count_df['tweets_in_dataset_bin'] = pd.qcut(\n",
    "    user_tweet_count_df['tweets_in_dataset'],\n",
    "    [0, .9, .99, 1.],\n",
    "    labels=['Bottom 90%', 'Middle 9%', 'Top 1%']\n",
    ")\n",
    "user_tweet_count_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load and join user info\n",
    "This is information that was either coded in the spreadsheet or looked up for each user via the API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "screen_name        2484\n",
       "name               2484\n",
       "organization       2455\n",
       "position           2481\n",
       "gender             2483\n",
       "followers_count    2484\n",
       "following_count    2484\n",
       "tweet_count        2484\n",
       "user_created_at    2484\n",
       "verified           2484\n",
       "protected          2484\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Column names are supplied explicitly through `names`.\n",
    "user_info_columns = ['screen_name', 'user_id', 'name', 'organization', 'position',\n",
    "                     'gender', 'followers_count', 'following_count', 'tweet_count',\n",
    "                     'user_created_at', 'verified', 'protected']\n",
    "# user_id is read as a string, the same type as the id_str values taken from the tweets.\n",
    "user_info_df = pd.read_csv(\n",
    "    'source_data/user_info_lookup.csv',\n",
    "    names=user_info_columns,\n",
    "    dtype={'user_id': str}\n",
    ")\n",
    "user_info_df = user_info_df.set_index(['user_id'])\n",
    "# Non-null count per column.\n",
    "user_info_df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>screen_name</th>\n",
       "      <th>name</th>\n",
       "      <th>organization</th>\n",
       "      <th>position</th>\n",
       "      <th>gender</th>\n",
       "      <th>followers_count</th>\n",
       "      <th>following_count</th>\n",
       "      <th>tweet_count</th>\n",
       "      <th>user_created_at</th>\n",
       "      <th>verified</th>\n",
       "      <th>protected</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>20711445</th>\n",
       "      <td>ninglin</td>\n",
       "      <td>Glinski, Nina</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Freelance Reporter</td>\n",
       "      <td>F</td>\n",
       "      <td>968</td>\n",
       "      <td>507</td>\n",
       "      <td>909</td>\n",
       "      <td>Thu Feb 12 20:00:53 +0000 2009</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>258917371</th>\n",
       "      <td>davidjenders</td>\n",
       "      <td>Enders, David</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Journalist</td>\n",
       "      <td>M</td>\n",
       "      <td>1451</td>\n",
       "      <td>480</td>\n",
       "      <td>6299</td>\n",
       "      <td>Mon Feb 28 19:52:03 +0000 2011</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>297046834</th>\n",
       "      <td>mattbarakat</td>\n",
       "      <td>Barakat, Matthew</td>\n",
       "      <td>Associated Press</td>\n",
       "      <td>Northern Virginia Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>754</td>\n",
       "      <td>349</td>\n",
       "      <td>620</td>\n",
       "      <td>Wed May 11 20:55:24 +0000 2011</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>455585786</th>\n",
       "      <td>kimberlyeatkins</td>\n",
       "      <td>Atkins, Kimberly</td>\n",
       "      <td>Boston Herald</td>\n",
       "      <td>Chief Washington Reporter/Columnist</td>\n",
       "      <td>F</td>\n",
       "      <td>2399</td>\n",
       "      <td>2661</td>\n",
       "      <td>5846</td>\n",
       "      <td>Thu Jan 05 08:26:46 +0000 2012</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42584840</th>\n",
       "      <td>toulavlahou</td>\n",
       "      <td>Vlahou, Toula</td>\n",
       "      <td>CQ Roll Call</td>\n",
       "      <td>Editor &amp; Podcast Producer</td>\n",
       "      <td>F</td>\n",
       "      <td>2713</td>\n",
       "      <td>198</td>\n",
       "      <td>6325</td>\n",
       "      <td>Tue May 26 07:41:38 +0000 2009</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               screen_name              name      organization  \\\n",
       "user_id                                                          \n",
       "20711445           ninglin     Glinski, Nina               NaN   \n",
       "258917371     davidjenders     Enders, David               NaN   \n",
       "297046834      mattbarakat  Barakat, Matthew  Associated Press   \n",
       "455585786  kimberlyeatkins  Atkins, Kimberly     Boston Herald   \n",
       "42584840       toulavlahou     Vlahou, Toula      CQ Roll Call   \n",
       "\n",
       "                                      position gender  followers_count  \\\n",
       "user_id                                                                  \n",
       "20711445                    Freelance Reporter      F              968   \n",
       "258917371                           Journalist      M             1451   \n",
       "297046834      Northern Virginia Correspondent      M              754   \n",
       "455585786  Chief Washington Reporter/Columnist      F             2399   \n",
       "42584840             Editor & Podcast Producer      F             2713   \n",
       "\n",
       "           following_count  tweet_count                 user_created_at  \\\n",
       "user_id                                                                   \n",
       "20711445               507          909  Thu Feb 12 20:00:53 +0000 2009   \n",
       "258917371              480         6299  Mon Feb 28 19:52:03 +0000 2011   \n",
       "297046834              349          620  Wed May 11 20:55:24 +0000 2011   \n",
       "455585786             2661         5846  Thu Jan 05 08:26:46 +0000 2012   \n",
       "42584840               198         6325  Tue May 26 07:41:38 +0000 2009   \n",
       "\n",
       "          verified protected  \n",
       "user_id                       \n",
       "20711445     False     False  \n",
       "258917371     True     False  \n",
       "297046834     True     False  \n",
       "455585786     True     False  \n",
       "42584840     False     False  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the user lookup table (indexed by user_id).\n",
    "user_info_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "screen_name              2484\n",
       "name                     2484\n",
       "organization             2484\n",
       "position                 2481\n",
       "gender                   2483\n",
       "followers_count          2484\n",
       "following_count          2484\n",
       "tweet_count              2484\n",
       "user_created_at          2484\n",
       "verified                 2484\n",
       "protected                2484\n",
       "original                 2484\n",
       "quote                    2484\n",
       "reply                    2484\n",
       "retweet                  2484\n",
       "tweets_in_dataset        2484\n",
       "tweets_in_dataset_bin    2272\n",
       "dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Left join keeps every user from the lookup file, including those with no tweets in the dataset.\n",
    "user_summary_df = user_info_df.join(user_tweet_count_df, how='left')\n",
    "# Fill NaNs with one DataFrame-level fillna that returns a new frame. Calling\n",
    "# fillna(..., inplace=True) on a selected column is chained assignment and does not\n",
    "# update the frame under pandas Copy-on-Write.\n",
    "# tweets_in_dataset_bin is not filled, so users without tweets keep NaN in that column.\n",
    "user_summary_df = user_summary_df.fillna({\n",
    "    'organization': '',\n",
    "    'original': 0,\n",
    "    'quote': 0,\n",
    "    'reply': 0,\n",
    "    'retweet': 0,\n",
    "    'tweets_in_dataset': 0\n",
    "})\n",
    "# Non-null count per column.\n",
    "user_summary_df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>screen_name</th>\n",
       "      <th>name</th>\n",
       "      <th>organization</th>\n",
       "      <th>position</th>\n",
       "      <th>gender</th>\n",
       "      <th>followers_count</th>\n",
       "      <th>following_count</th>\n",
       "      <th>tweet_count</th>\n",
       "      <th>user_created_at</th>\n",
       "      <th>verified</th>\n",
       "      <th>protected</th>\n",
       "      <th>original</th>\n",
       "      <th>quote</th>\n",
       "      <th>reply</th>\n",
       "      <th>retweet</th>\n",
       "      <th>tweets_in_dataset</th>\n",
       "      <th>tweets_in_dataset_bin</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>20711445</th>\n",
       "      <td>ninglin</td>\n",
       "      <td>Glinski, Nina</td>\n",
       "      <td></td>\n",
       "      <td>Freelance Reporter</td>\n",
       "      <td>F</td>\n",
       "      <td>968</td>\n",
       "      <td>507</td>\n",
       "      <td>909</td>\n",
       "      <td>Thu Feb 12 20:00:53 +0000 2009</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>258917371</th>\n",
       "      <td>davidjenders</td>\n",
       "      <td>Enders, David</td>\n",
       "      <td></td>\n",
       "      <td>Journalist</td>\n",
       "      <td>M</td>\n",
       "      <td>1451</td>\n",
       "      <td>480</td>\n",
       "      <td>6299</td>\n",
       "      <td>Mon Feb 28 19:52:03 +0000 2011</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>297046834</th>\n",
       "      <td>mattbarakat</td>\n",
       "      <td>Barakat, Matthew</td>\n",
       "      <td>Associated Press</td>\n",
       "      <td>Northern Virginia Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>754</td>\n",
       "      <td>349</td>\n",
       "      <td>620</td>\n",
       "      <td>Wed May 11 20:55:24 +0000 2011</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>12.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>2.00</td>\n",
       "      <td>14.00</td>\n",
       "      <td>Bottom 90%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>455585786</th>\n",
       "      <td>kimberlyeatkins</td>\n",
       "      <td>Atkins, Kimberly</td>\n",
       "      <td>Boston Herald</td>\n",
       "      <td>Chief Washington Reporter/Columnist</td>\n",
       "      <td>F</td>\n",
       "      <td>2399</td>\n",
       "      <td>2661</td>\n",
       "      <td>5846</td>\n",
       "      <td>Thu Jan 05 08:26:46 +0000 2012</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>228.00</td>\n",
       "      <td>144.00</td>\n",
       "      <td>39.00</td>\n",
       "      <td>196.00</td>\n",
       "      <td>607.00</td>\n",
       "      <td>Bottom 90%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42584840</th>\n",
       "      <td>toulavlahou</td>\n",
       "      <td>Vlahou, Toula</td>\n",
       "      <td>CQ Roll Call</td>\n",
       "      <td>Editor &amp; Podcast Producer</td>\n",
       "      <td>F</td>\n",
       "      <td>2713</td>\n",
       "      <td>198</td>\n",
       "      <td>6325</td>\n",
       "      <td>Tue May 26 07:41:38 +0000 2009</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>32.00</td>\n",
       "      <td>25.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>25.00</td>\n",
       "      <td>82.00</td>\n",
       "      <td>Bottom 90%</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               screen_name              name      organization  \\\n",
       "user_id                                                          \n",
       "20711445           ninglin     Glinski, Nina                     \n",
       "258917371     davidjenders     Enders, David                     \n",
       "297046834      mattbarakat  Barakat, Matthew  Associated Press   \n",
       "455585786  kimberlyeatkins  Atkins, Kimberly     Boston Herald   \n",
       "42584840       toulavlahou     Vlahou, Toula      CQ Roll Call   \n",
       "\n",
       "                                      position gender  followers_count  \\\n",
       "user_id                                                                  \n",
       "20711445                    Freelance Reporter      F              968   \n",
       "258917371                           Journalist      M             1451   \n",
       "297046834      Northern Virginia Correspondent      M              754   \n",
       "455585786  Chief Washington Reporter/Columnist      F             2399   \n",
       "42584840             Editor & Podcast Producer      F             2713   \n",
       "\n",
       "           following_count  tweet_count                 user_created_at  \\\n",
       "user_id                                                                   \n",
       "20711445               507          909  Thu Feb 12 20:00:53 +0000 2009   \n",
       "258917371              480         6299  Mon Feb 28 19:52:03 +0000 2011   \n",
       "297046834              349          620  Wed May 11 20:55:24 +0000 2011   \n",
       "455585786             2661         5846  Thu Jan 05 08:26:46 +0000 2012   \n",
       "42584840               198         6325  Tue May 26 07:41:38 +0000 2009   \n",
       "\n",
       "          verified protected             original                quote  \\\n",
       "user_id                                                                  \n",
       "20711445     False     False                 0.00                 0.00   \n",
       "258917371     True     False                 0.00                 0.00   \n",
       "297046834     True     False                12.00                 0.00   \n",
       "455585786     True     False               228.00               144.00   \n",
       "42584840     False     False                32.00                25.00   \n",
       "\n",
       "                         reply              retweet    tweets_in_dataset  \\\n",
       "user_id                                                                    \n",
       "20711445                  0.00                 0.00                 0.00   \n",
       "258917371                 0.00                 0.00                 0.00   \n",
       "297046834                 0.00                 2.00                14.00   \n",
       "455585786                39.00               196.00               607.00   \n",
       "42584840                  0.00                25.00                82.00   \n",
       "\n",
       "          tweets_in_dataset_bin  \n",
       "user_id                          \n",
       "20711445                    NaN  \n",
       "258917371                   NaN  \n",
       "297046834            Bottom 90%  \n",
       "455585786            Bottom 90%  \n",
       "42584840             Bottom 90%  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the joined per-user summary (user info plus tweet counts).\n",
    "user_summary_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write to file as output/user_summary.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The user_id index is written as the first column of the CSV (to_csv writes the index by default).\n",
    "user_summary_df.to_csv('output/user_summary.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare the organization data\n",
    "This is for users that are members of each organization."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "followers_count    sum        347\n",
       "                   size       347\n",
       "                   average    347\n",
       "following_count    sum        347\n",
       "                   size       347\n",
       "                   average    347\n",
       "tweet_count        sum        347\n",
       "                   size       347\n",
       "                   average    347\n",
       "tweets_in_dataset  sum        347\n",
       "                   size       347\n",
       "                   average    347\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Numeric columns summarised for each organization.\n",
    "org_metric_columns = ['followers_count', 'following_count', 'tweet_count', 'tweets_in_dataset']\n",
    "org_groups = user_summary_df[['organization'] + org_metric_columns].groupby('organization')\n",
    "# For every metric: total, number of member rows, and average per member.\n",
    "# Users whose organization is the empty string are grouped together under ''.\n",
    "org_summary_df = org_groups.agg([np.sum, np.size, np.average])\n",
    "org_summary_df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">followers_count</th>\n",
       "      <th colspan=\"3\" halign=\"left\">following_count</th>\n",
       "      <th colspan=\"3\" halign=\"left\">tweet_count</th>\n",
       "      <th colspan=\"3\" halign=\"left\">tweets_in_dataset</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>size</th>\n",
       "      <th>average</th>\n",
       "      <th>sum</th>\n",
       "      <th>size</th>\n",
       "      <th>average</th>\n",
       "      <th>sum</th>\n",
       "      <th>size</th>\n",
       "      <th>average</th>\n",
       "      <th>sum</th>\n",
       "      <th>size</th>\n",
       "      <th>average</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>organization</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>57347</td>\n",
       "      <td>29</td>\n",
       "      <td>1,977.48</td>\n",
       "      <td>30788</td>\n",
       "      <td>29</td>\n",
       "      <td>1,061.66</td>\n",
       "      <td>151441</td>\n",
       "      <td>29</td>\n",
       "      <td>5,222.10</td>\n",
       "      <td>2,767.00</td>\n",
       "      <td>29.00</td>\n",
       "      <td>95.41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ABC 7</th>\n",
       "      <td>889</td>\n",
       "      <td>1</td>\n",
       "      <td>889.00</td>\n",
       "      <td>1092</td>\n",
       "      <td>1</td>\n",
       "      <td>1,092.00</td>\n",
       "      <td>1946</td>\n",
       "      <td>1</td>\n",
       "      <td>1,946.00</td>\n",
       "      <td>464.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>464.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ABC News</th>\n",
       "      <td>602790</td>\n",
       "      <td>52</td>\n",
       "      <td>11,592.12</td>\n",
       "      <td>72154</td>\n",
       "      <td>52</td>\n",
       "      <td>1,387.58</td>\n",
       "      <td>372200</td>\n",
       "      <td>52</td>\n",
       "      <td>7,157.69</td>\n",
       "      <td>8,629.00</td>\n",
       "      <td>52.00</td>\n",
       "      <td>165.94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AP–Broadcast</th>\n",
       "      <td>5305</td>\n",
       "      <td>15</td>\n",
       "      <td>353.67</td>\n",
       "      <td>7974</td>\n",
       "      <td>15</td>\n",
       "      <td>531.60</td>\n",
       "      <td>16794</td>\n",
       "      <td>15</td>\n",
       "      <td>1,119.60</td>\n",
       "      <td>527.00</td>\n",
       "      <td>15.00</td>\n",
       "      <td>35.13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Afro American Newspapers</th>\n",
       "      <td>189</td>\n",
       "      <td>1</td>\n",
       "      <td>189.00</td>\n",
       "      <td>202</td>\n",
       "      <td>1</td>\n",
       "      <td>202.00</td>\n",
       "      <td>596</td>\n",
       "      <td>1</td>\n",
       "      <td>596.00</td>\n",
       "      <td>14.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>14.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         followers_count                            \\\n",
       "                                     sum size              average   \n",
       "organization                                                         \n",
       "                                   57347   29             1,977.48   \n",
       "ABC 7                                889    1               889.00   \n",
       "ABC News                          602790   52            11,592.12   \n",
       "AP–Broadcast                        5305   15               353.67   \n",
       "Afro American Newspapers             189    1               189.00   \n",
       "\n",
       "                         following_count                            \\\n",
       "                                     sum size              average   \n",
       "organization                                                         \n",
       "                                   30788   29             1,061.66   \n",
       "ABC 7                               1092    1             1,092.00   \n",
       "ABC News                           72154   52             1,387.58   \n",
       "AP–Broadcast                        7974   15               531.60   \n",
       "Afro American Newspapers             202    1               202.00   \n",
       "\n",
       "                         tweet_count                            \\\n",
       "                                 sum size              average   \n",
       "organization                                                     \n",
       "                              151441   29             5,222.10   \n",
       "ABC 7                           1946    1             1,946.00   \n",
       "ABC News                      372200   52             7,157.69   \n",
       "AP–Broadcast                   16794   15             1,119.60   \n",
       "Afro American Newspapers         596    1               596.00   \n",
       "\n",
       "                            tweets_in_dataset                       \\\n",
       "                                          sum                 size   \n",
       "organization                                                         \n",
       "                                     2,767.00                29.00   \n",
       "ABC 7                                  464.00                 1.00   \n",
       "ABC News                             8,629.00                52.00   \n",
       "AP–Broadcast                           527.00                15.00   \n",
       "Afro American Newspapers                14.00                 1.00   \n",
       "\n",
       "                                               \n",
       "                                      average  \n",
       "organization                                   \n",
       "                                        95.41  \n",
       "ABC 7                                  464.00  \n",
       "ABC News                               165.94  \n",
       "AP–Broadcast                            35.13  \n",
       "Afro American Newspapers                14.00  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "org_summary_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write to file as output/organization_summary.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "org_summary_df.to_csv('output/organization_summary.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### List of organizations <--- This probably requires some cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['',\n",
       " 'ABC 7',\n",
       " 'ABC News',\n",
       " 'AP–Broadcast',\n",
       " 'Afro American Newspapers',\n",
       " 'Agence France Presse (AFP–TV)',\n",
       " 'Agence France-Presse',\n",
       " 'Agri-Pulse',\n",
       " 'Air Force Magazine',\n",
       " 'Alaska Dispatch News',\n",
       " 'Alaska Public Radio Network',\n",
       " 'Albuquerque Journal',\n",
       " 'Aljazeera America',\n",
       " 'Aljazeera English',\n",
       " 'Allentown Morning Call',\n",
       " 'American Banker',\n",
       " 'American Gaming Association',\n",
       " 'American Prospect',\n",
       " 'Argus Media',\n",
       " 'Army Times',\n",
       " 'Associated Press',\n",
       " 'Atlanta Journal-Consitution',\n",
       " 'Austin American-Statesman',\n",
       " 'Axios',\n",
       " 'BBC',\n",
       " 'Baltimore Sun',\n",
       " 'Bankrate',\n",
       " 'Bloomberg BNA',\n",
       " 'Bloomberg Government',\n",
       " 'Bloomberg News',\n",
       " 'Bloomberg TV',\n",
       " 'Bond Buyer',\n",
       " 'Boston Globe',\n",
       " 'Boston Herald',\n",
       " 'Breitbart News',\n",
       " 'Broadcasting & Cable',\n",
       " 'Buffalo News',\n",
       " 'BuzzFeed',\n",
       " 'Buzzfeed',\n",
       " 'CBN News',\n",
       " 'CBS News',\n",
       " 'CDC Gaming Reports',\n",
       " 'CEO Update',\n",
       " 'CNBC',\n",
       " 'CNN',\n",
       " 'CNN International',\n",
       " 'CNSNews.com',\n",
       " 'CQ Researcher',\n",
       " 'CQ Roll Call',\n",
       " 'CRTV',\n",
       " 'CTV–Community TV of PG County',\n",
       " 'Canadian Press',\n",
       " 'Carroll County Times',\n",
       " 'Center for Public Integrity',\n",
       " 'Charleston Post and Courier',\n",
       " 'Chicago Sun-Times',\n",
       " 'Chicago Tribune',\n",
       " 'Christian Science Monitor',\n",
       " 'Chronicle of Higher Education',\n",
       " 'Chronicle of Philanthropy',\n",
       " 'Circa',\n",
       " 'CityLab',\n",
       " 'Cleveland Plain Dealer',\n",
       " 'Colorado Public Radio',\n",
       " 'Columbus Dispatch',\n",
       " 'Communications Daily',\n",
       " 'Consumer Reports',\n",
       " 'Cook Political Report',\n",
       " 'Corporate Crime Reporter',\n",
       " 'Cosmopolitan',\n",
       " 'Court House News',\n",
       " 'Cox Broadcasting',\n",
       " 'Crain Communications',\n",
       " 'Cronkite News Service',\n",
       " 'Crux: Catholic News Agency',\n",
       " 'C–SPAN',\n",
       " 'DC Spotlight Newspaper',\n",
       " 'DCist',\n",
       " 'Daily Beast',\n",
       " 'Daily Caller',\n",
       " 'Daily Deal',\n",
       " 'Daily Mail',\n",
       " 'Daily Mail (UK)',\n",
       " 'Dallas Morning News',\n",
       " 'Defense Daily',\n",
       " 'Defense News',\n",
       " 'Defense One',\n",
       " 'Denver Post',\n",
       " 'Detroit News',\n",
       " 'Diverse: Issues in Higher Education',\n",
       " 'E! Networks',\n",
       " 'E&E News',\n",
       " 'EWTN',\n",
       " 'Eater',\n",
       " 'Economist',\n",
       " 'Education Week',\n",
       " 'Energy Daily',\n",
       " 'Energy Intelligence',\n",
       " 'Environment & Energy Publishing, LLC',\n",
       " \"FERN's Ag Insider\",\n",
       " 'FTC Watch',\n",
       " 'Fairchild Publications',\n",
       " 'Falls Church News Press',\n",
       " 'Famous DC',\n",
       " 'Feature Story News',\n",
       " 'FedNet',\n",
       " 'Federal Computer Week',\n",
       " 'Federal News Radio 1500 AM',\n",
       " 'Financial Times',\n",
       " 'Financial Times ',\n",
       " 'Fiscal Times',\n",
       " 'FiveThirtyEight',\n",
       " 'Foreign Policy',\n",
       " 'Fortune Magazine',\n",
       " 'Fox Business Network',\n",
       " 'Fox News',\n",
       " 'Fox News Radio',\n",
       " 'France24',\n",
       " 'Freelance',\n",
       " 'Freelance ',\n",
       " 'Frontline Medical Communications',\n",
       " 'Fusion',\n",
       " 'Gannett Government Media Corp',\n",
       " 'Gannett Washington Bureau',\n",
       " 'Glamour Magazine',\n",
       " 'Global Competition Review',\n",
       " 'Globe and Mail',\n",
       " 'Governing',\n",
       " 'Government Executive',\n",
       " 'Gray Television',\n",
       " 'Guardian US',\n",
       " 'Haddad Media',\n",
       " 'Hearst Newspapers',\n",
       " 'Hearst Television Inc.',\n",
       " 'Hispanic Outlook',\n",
       " 'Honolulu Civil Beat',\n",
       " 'Houston Chronicle',\n",
       " 'Huffington Post',\n",
       " 'IDG Communications',\n",
       " 'IDG News Service',\n",
       " 'Independent Journal Review',\n",
       " 'Independent Television News (ITN)',\n",
       " 'Industry Dive',\n",
       " 'Informavore Media, LLC',\n",
       " 'Inside Elections',\n",
       " 'InsideClimate News',\n",
       " 'InsidePolitics',\n",
       " 'Internews Network',\n",
       " 'Investor’s Business Daily',\n",
       " 'Irish Times',\n",
       " 'Jewish Journal',\n",
       " 'Jewish Telegraphic Agency',\n",
       " 'Journal Media Group',\n",
       " 'KATU News',\n",
       " 'KCETLink',\n",
       " 'KFI',\n",
       " 'KNTV',\n",
       " 'KTWO TV',\n",
       " 'Kaiser Health News',\n",
       " 'Kansas City Star',\n",
       " 'LRP Publications',\n",
       " 'Laslo Congressional Bureau',\n",
       " 'Lilly Broadcasting',\n",
       " 'LocalNews Now',\n",
       " 'Los Angeles Times',\n",
       " 'MLEX US',\n",
       " 'MRCTV',\n",
       " 'MSNBC',\n",
       " 'MTV News',\n",
       " 'Manifest',\n",
       " 'MapLight',\n",
       " 'Market News International',\n",
       " 'MarketWatch',\n",
       " 'Marketplace Radio',\n",
       " 'McClatchy',\n",
       " 'McClatchy Newspapers',\n",
       " 'MedPage Today',\n",
       " 'MedTech Insight',\n",
       " 'Media General',\n",
       " 'Merger Market of Financial Times',\n",
       " 'Metro Weekly',\n",
       " 'Mic',\n",
       " 'Military.com',\n",
       " 'MinnPost',\n",
       " 'Minneapolis Star Tribune',\n",
       " 'Montgomery County Sentinel',\n",
       " 'Morning Consult',\n",
       " 'Morning Edition',\n",
       " 'Mother Jones',\n",
       " 'NBC',\n",
       " 'NBC News',\n",
       " 'NBC Newschannel',\n",
       " 'NJ Advance Media',\n",
       " 'Nation',\n",
       " 'National Catholic Reporter',\n",
       " 'National Geographic Magazine',\n",
       " 'National Journal',\n",
       " 'National Law Journal',\n",
       " 'National Mortgage News',\n",
       " 'National Public Radio',\n",
       " 'National Review',\n",
       " 'Nature',\n",
       " 'NerdWallet',\n",
       " 'New Republic',\n",
       " 'New York ',\n",
       " 'New York Daily News',\n",
       " 'New York Post',\n",
       " 'New York Times',\n",
       " 'New York Times Magazine',\n",
       " 'New Yorker',\n",
       " 'NewsMax',\n",
       " 'Newsday',\n",
       " 'Newsweek',\n",
       " 'Nexstar Media Group',\n",
       " 'Omaha World-Herald',\n",
       " 'Ozy',\n",
       " 'PBS',\n",
       " 'PBS NewsHour',\n",
       " 'People Magazine',\n",
       " 'Pew Charitable Trusts',\n",
       " 'Philadelphia Inquirer',\n",
       " 'Pittsburgh Post-Gazette',\n",
       " 'Politico',\n",
       " 'Power Markets Today',\n",
       " 'Praetorian Digital',\n",
       " 'ProPublica',\n",
       " 'RTTV America',\n",
       " 'Radio Free Asia',\n",
       " 'Radio One',\n",
       " 'Real Clear Politics',\n",
       " 'Real News Network',\n",
       " 'RealClearPolitics',\n",
       " 'Record (Bergen County, NJ)',\n",
       " 'Religion & Ethics Newsweekly',\n",
       " 'Religion News Service',\n",
       " 'Religious News Service',\n",
       " 'Reuters Radio & TV',\n",
       " 'Rural TV News',\n",
       " 'S&P Global Market Intelligence',\n",
       " 'S&P Global Platts',\n",
       " 'SAGE Business Researcher',\n",
       " 'SB Nation',\n",
       " 'SRN News (Salem)',\n",
       " 'Salt Lake Tribune',\n",
       " 'San Francisco Chronicle',\n",
       " 'Scientific American',\n",
       " 'Scoop News',\n",
       " 'Scripps Howard News Service',\n",
       " 'Scripps News',\n",
       " 'Scudder Publishing',\n",
       " 'Senate Democrats',\n",
       " 'Sightline Media Group',\n",
       " 'Sinclair Broadcast Group',\n",
       " 'Sirius XM',\n",
       " 'Sirius XM Satellite Radio',\n",
       " 'Sky News',\n",
       " 'Slate',\n",
       " 'Smithsonian Magazine',\n",
       " 'Snapchat',\n",
       " 'Space News',\n",
       " 'St. Louis Post-Dispatch',\n",
       " 'St. Louis Public Radio',\n",
       " 'Standard - Examiner',\n",
       " 'Stars and Stripes',\n",
       " 'Stat News',\n",
       " 'Stateline.org',\n",
       " 'Stephens Media Group',\n",
       " 'SurveyMonkey',\n",
       " 'Syracuse Post-Standard',\n",
       " 'TEGNA',\n",
       " 'Talk Radio News Service',\n",
       " 'Talking Points Memo',\n",
       " 'Tampa Bay Times',\n",
       " 'Telemundo Network',\n",
       " 'Texas Tribune',\n",
       " 'The 74 Media',\n",
       " 'The Atlantic',\n",
       " 'The Cipher Brief',\n",
       " 'The Hill',\n",
       " 'The Hotline',\n",
       " 'The New York Times On The Web',\n",
       " 'The Root',\n",
       " 'The Voyage Report',\n",
       " 'The atlantic',\n",
       " 'TheStreet',\n",
       " 'ThinkProgress',\n",
       " 'This Is America with Dennis Wholey',\n",
       " 'Thom Hartmann Program',\n",
       " 'Thomsen Reuters',\n",
       " 'Thomson Reuters',\n",
       " 'Time Magazine',\n",
       " 'Time Warner Cable',\n",
       " 'Times of London',\n",
       " 'To The Contrary (Persephone Productions)',\n",
       " 'Toronto Star',\n",
       " 'TownHall',\n",
       " 'Townhall',\n",
       " 'Transport Topics',\n",
       " 'Trinity Broadcast Network',\n",
       " 'U.S. News & World Report',\n",
       " 'UCG',\n",
       " 'USA Today',\n",
       " 'Univision',\n",
       " 'Vanity Fair',\n",
       " 'Variety',\n",
       " 'Vice News',\n",
       " 'Voice of America',\n",
       " 'Voterama in Congress',\n",
       " 'Vox ',\n",
       " 'Vox Media',\n",
       " 'WBAL-TV',\n",
       " 'WBALL TV 11',\n",
       " 'WFDC–TV Univision',\n",
       " 'WJLA–TV / Newschannel 8',\n",
       " 'WMAL Radio',\n",
       " 'WMDT',\n",
       " 'WNEW / CBS DC',\n",
       " 'WNYC',\n",
       " 'WPFW–FM',\n",
       " 'WRC–TV / NBC–4',\n",
       " 'WTOP',\n",
       " 'WTOP Radio',\n",
       " 'WTTG-TV',\n",
       " 'WTTG–Fox Television',\n",
       " 'WUSA–TV',\n",
       " 'Wall Street Journal',\n",
       " 'Wall Street Journal / Dow Jones',\n",
       " 'Washington Blade',\n",
       " 'Washington Bureau News Service',\n",
       " 'Washington Business Journal',\n",
       " 'Washington City Paper',\n",
       " 'Washington Examiner',\n",
       " 'Washington Free Beacon',\n",
       " 'Washington Post',\n",
       " 'Washington Radio & Press Service',\n",
       " 'Washington Times',\n",
       " 'Washingtonian',\n",
       " 'Washingtonpost.com',\n",
       " 'Weekly Standard',\n",
       " 'West Wing Writers',\n",
       " 'Westwood One',\n",
       " 'White House Dossier',\n",
       " 'Wired',\n",
       " 'Wisconsin NewsHour',\n",
       " 'World Magazine',\n",
       " 'Yahoo News',\n",
       " 'ZDNet']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "org_summary_df.index.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tweet summary\n",
    "For tweets in dataset."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Types of tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "retweet     273412\n",
       "original    199949\n",
       "reply        93184\n",
       "quote        83805\n",
       "Name: tweet_type, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tweet_df['tweet_type'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## User tweet summary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Types of tweets in dataset for each user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>original</th>\n",
       "      <th>quote</th>\n",
       "      <th>reply</th>\n",
       "      <th>retweet</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2,484.00</td>\n",
       "      <td>2,484.00</td>\n",
       "      <td>2,484.00</td>\n",
       "      <td>2,484.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>79.83</td>\n",
       "      <td>33.54</td>\n",
       "      <td>37.22</td>\n",
       "      <td>109.56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>135.84</td>\n",
       "      <td>90.07</td>\n",
       "      <td>186.34</td>\n",
       "      <td>341.02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>5.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>3.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>29.00</td>\n",
       "      <td>5.00</td>\n",
       "      <td>3.00</td>\n",
       "      <td>24.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>99.00</td>\n",
       "      <td>28.00</td>\n",
       "      <td>18.00</td>\n",
       "      <td>94.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1,579.00</td>\n",
       "      <td>1,440.00</td>\n",
       "      <td>7,328.00</td>\n",
       "      <td>8,855.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  original                quote                reply  \\\n",
       "count             2,484.00             2,484.00             2,484.00   \n",
       "mean                 79.83                33.54                37.22   \n",
       "std                 135.84                90.07               186.34   \n",
       "min                   0.00                 0.00                 0.00   \n",
       "25%                   5.00                 0.00                 0.00   \n",
       "50%                  29.00                 5.00                 3.00   \n",
       "75%                  99.00                28.00                18.00   \n",
       "max               1,579.00             1,440.00             7,328.00   \n",
       "\n",
       "                   retweet  \n",
       "count             2,484.00  \n",
       "mean                109.56  \n",
       "std                 341.02  \n",
       "min                   0.00  \n",
       "25%                   3.00  \n",
       "50%                  24.00  \n",
       "75%                  94.25  \n",
       "max               8,855.00  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_summary_df[['original', 'quote', 'reply', 'retweet']].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1/9/90 rule\n",
    "For top 1%, 9%, 90% of tweeters, the number of tweets and types of tweets they account for."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>screen_name</th>\n",
       "      <th>name</th>\n",
       "      <th>organization</th>\n",
       "      <th>position</th>\n",
       "      <th>gender</th>\n",
       "      <th>followers_count</th>\n",
       "      <th>following_count</th>\n",
       "      <th>tweet_count</th>\n",
       "      <th>user_created_at</th>\n",
       "      <th>verified</th>\n",
       "      <th>protected</th>\n",
       "      <th>original</th>\n",
       "      <th>quote</th>\n",
       "      <th>reply</th>\n",
       "      <th>retweet</th>\n",
       "      <th>tweets_in_dataset</th>\n",
       "      <th>tweets_in_dataset_bin</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>456994513</th>\n",
       "      <td>maria_e_recio</td>\n",
       "      <td>Recio, Maria</td>\n",
       "      <td>Austin American-Statesman</td>\n",
       "      <td>Political Reporter</td>\n",
       "      <td>F</td>\n",
       "      <td>1039</td>\n",
       "      <td>530</td>\n",
       "      <td>38464</td>\n",
       "      <td>Fri Jan 06 22:22:40 +0000 2012</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>261.00</td>\n",
       "      <td>291.00</td>\n",
       "      <td>108.00</td>\n",
       "      <td>3,204.00</td>\n",
       "      <td>3,864.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22891564</th>\n",
       "      <td>chrisgeidner</td>\n",
       "      <td>Geidner, Chris</td>\n",
       "      <td>BuzzFeed</td>\n",
       "      <td>Legal Editor &amp; Supreme Court Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>78631</td>\n",
       "      <td>4767</td>\n",
       "      <td>201131</td>\n",
       "      <td>Thu Mar 05 06:48:00 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>592.00</td>\n",
       "      <td>475.00</td>\n",
       "      <td>2,850.00</td>\n",
       "      <td>750.00</td>\n",
       "      <td>4,667.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21810329</th>\n",
       "      <td>sdonnan</td>\n",
       "      <td>Donnan, Shawn</td>\n",
       "      <td>Financial Times</td>\n",
       "      <td>Wolrd Trade Editor</td>\n",
       "      <td>M</td>\n",
       "      <td>11693</td>\n",
       "      <td>5428</td>\n",
       "      <td>75733</td>\n",
       "      <td>Tue Feb 24 23:10:17 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>203.00</td>\n",
       "      <td>374.00</td>\n",
       "      <td>152.00</td>\n",
       "      <td>2,792.00</td>\n",
       "      <td>3,521.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19545932</th>\n",
       "      <td>kampeas</td>\n",
       "      <td>Kampeas, Ron</td>\n",
       "      <td>Jewish Telegraphic Agency</td>\n",
       "      <td>Washington Bureau Chief</td>\n",
       "      <td>M</td>\n",
       "      <td>6901</td>\n",
       "      <td>1952</td>\n",
       "      <td>50954</td>\n",
       "      <td>Mon Jan 26 17:37:58 +0000 2009</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>506.00</td>\n",
       "      <td>349.00</td>\n",
       "      <td>202.00</td>\n",
       "      <td>2,027.00</td>\n",
       "      <td>3,084.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47408060</th>\n",
       "      <td>jonathanlanday</td>\n",
       "      <td>Landay, Jonathan</td>\n",
       "      <td>McClatchy Newspapers</td>\n",
       "      <td>National Security Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>11126</td>\n",
       "      <td>1093</td>\n",
       "      <td>78318</td>\n",
       "      <td>Mon Jun 15 18:42:47 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>418.00</td>\n",
       "      <td>41.00</td>\n",
       "      <td>70.00</td>\n",
       "      <td>2,352.00</td>\n",
       "      <td>2,881.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3817401</th>\n",
       "      <td>ericgeller</td>\n",
       "      <td>Geller, Eric</td>\n",
       "      <td>Politico</td>\n",
       "      <td>Cybersecurity Reporter</td>\n",
       "      <td>M</td>\n",
       "      <td>52569</td>\n",
       "      <td>732</td>\n",
       "      <td>201279</td>\n",
       "      <td>Sun Apr 08 20:27:11 +0000 2007</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>820.00</td>\n",
       "      <td>1,435.00</td>\n",
       "      <td>7,328.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>9,583.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>593813785</th>\n",
       "      <td>donnayoungdc</td>\n",
       "      <td>Young, Donna</td>\n",
       "      <td>S&amp;P Global Market Intelligence</td>\n",
       "      <td>Senior Reporter</td>\n",
       "      <td>F</td>\n",
       "      <td>5654</td>\n",
       "      <td>1621</td>\n",
       "      <td>46571</td>\n",
       "      <td>Tue May 29 15:45:45 +0000 2012</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>1,095.00</td>\n",
       "      <td>885.00</td>\n",
       "      <td>9.00</td>\n",
       "      <td>1,169.00</td>\n",
       "      <td>3,158.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104299137</th>\n",
       "      <td>davidmdrucker</td>\n",
       "      <td>Drucker, David</td>\n",
       "      <td>Washington Examiner</td>\n",
       "      <td>Senior Political Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>32966</td>\n",
       "      <td>2475</td>\n",
       "      <td>101229</td>\n",
       "      <td>Tue Jan 12 22:56:50 +0000 2010</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>611.00</td>\n",
       "      <td>1,122.00</td>\n",
       "      <td>517.00</td>\n",
       "      <td>934.00</td>\n",
       "      <td>3,184.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61734492</th>\n",
       "      <td>fahrenthold</td>\n",
       "      <td>Fahrenthold, David</td>\n",
       "      <td>Washington Post</td>\n",
       "      <td>Political Reporter</td>\n",
       "      <td>M</td>\n",
       "      <td>419647</td>\n",
       "      <td>3341</td>\n",
       "      <td>25457</td>\n",
       "      <td>Fri Jul 31 09:29:37 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>115.00</td>\n",
       "      <td>142.00</td>\n",
       "      <td>63.00</td>\n",
       "      <td>2,333.00</td>\n",
       "      <td>2,653.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13524182</th>\n",
       "      <td>daveweigel</td>\n",
       "      <td>Weigel, David</td>\n",
       "      <td>Washington Post</td>\n",
       "      <td>Political Reporter</td>\n",
       "      <td>M</td>\n",
       "      <td>318915</td>\n",
       "      <td>10169</td>\n",
       "      <td>166821</td>\n",
       "      <td>Fri Feb 15 17:58:23 +0000 2008</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>712.00</td>\n",
       "      <td>784.00</td>\n",
       "      <td>242.00</td>\n",
       "      <td>2,155.00</td>\n",
       "      <td>3,893.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25702314</th>\n",
       "      <td>ericmgarcia</td>\n",
       "      <td>Garcia, Eric M.</td>\n",
       "      <td>CQ Roll Call</td>\n",
       "      <td>Reporter</td>\n",
       "      <td>M</td>\n",
       "      <td>2960</td>\n",
       "      <td>3748</td>\n",
       "      <td>42198</td>\n",
       "      <td>Sat Mar 21 17:44:40 +0000 2009</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>441.00</td>\n",
       "      <td>1,188.00</td>\n",
       "      <td>575.00</td>\n",
       "      <td>405.00</td>\n",
       "      <td>2,609.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18825339</th>\n",
       "      <td>cahnemily</td>\n",
       "      <td>Cahn, Emily</td>\n",
       "      <td>Mic</td>\n",
       "      <td>Senior Politics Writer</td>\n",
       "      <td>F</td>\n",
       "      <td>16181</td>\n",
       "      <td>2118</td>\n",
       "      <td>95033</td>\n",
       "      <td>Sat Jan 10 03:19:50 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>1,205.00</td>\n",
       "      <td>1,440.00</td>\n",
       "      <td>279.00</td>\n",
       "      <td>3,459.00</td>\n",
       "      <td>6,383.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21612122</th>\n",
       "      <td>hotlinejosh</td>\n",
       "      <td>Kraushaar, Josh P.</td>\n",
       "      <td>National Journal</td>\n",
       "      <td>Politics Editor</td>\n",
       "      <td>M</td>\n",
       "      <td>49151</td>\n",
       "      <td>1456</td>\n",
       "      <td>152116</td>\n",
       "      <td>Sun Feb 22 23:45:46 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>395.00</td>\n",
       "      <td>643.00</td>\n",
       "      <td>338.00</td>\n",
       "      <td>4,302.00</td>\n",
       "      <td>5,678.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21696279</th>\n",
       "      <td>brianbeutler</td>\n",
       "      <td>Beutler, Brian Alfred</td>\n",
       "      <td>New Republic</td>\n",
       "      <td>Senior Editor</td>\n",
       "      <td>M</td>\n",
       "      <td>71586</td>\n",
       "      <td>722</td>\n",
       "      <td>96050</td>\n",
       "      <td>Mon Feb 23 21:31:16 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>475.00</td>\n",
       "      <td>546.00</td>\n",
       "      <td>714.00</td>\n",
       "      <td>2,122.00</td>\n",
       "      <td>3,857.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16459325</th>\n",
       "      <td>ryanbeckwith</td>\n",
       "      <td>Beckwith, Ryan Teague</td>\n",
       "      <td>Time Magazine</td>\n",
       "      <td>Politics Editor</td>\n",
       "      <td>M</td>\n",
       "      <td>20241</td>\n",
       "      <td>6826</td>\n",
       "      <td>88797</td>\n",
       "      <td>Thu Sep 25 22:43:36 +0000 2008</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>843.00</td>\n",
       "      <td>529.00</td>\n",
       "      <td>753.00</td>\n",
       "      <td>1,778.00</td>\n",
       "      <td>3,903.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42352386</th>\n",
       "      <td>rschles</td>\n",
       "      <td>Schlesinger, Robert</td>\n",
       "      <td>U.S. News &amp; World Report</td>\n",
       "      <td>Managing Editor, Opinion</td>\n",
       "      <td>M</td>\n",
       "      <td>4426</td>\n",
       "      <td>1910</td>\n",
       "      <td>34044</td>\n",
       "      <td>Mon May 25 04:52:44 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>122.00</td>\n",
       "      <td>590.00</td>\n",
       "      <td>56.00</td>\n",
       "      <td>2,206.00</td>\n",
       "      <td>2,974.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>304988603</th>\n",
       "      <td>neilwmccabe</td>\n",
       "      <td>McCabe, Neil</td>\n",
       "      <td>Breitbart News</td>\n",
       "      <td>Political Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>18991</td>\n",
       "      <td>7699</td>\n",
       "      <td>57983</td>\n",
       "      <td>Wed May 25 13:09:32 +0000 2011</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>682.00</td>\n",
       "      <td>616.00</td>\n",
       "      <td>227.00</td>\n",
       "      <td>4,444.00</td>\n",
       "      <td>5,969.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191964162</th>\n",
       "      <td>samlitzinger</td>\n",
       "      <td>Litzinger, Sam</td>\n",
       "      <td>CBS News</td>\n",
       "      <td>Correspondent, CBS Radio</td>\n",
       "      <td>M</td>\n",
       "      <td>2302</td>\n",
       "      <td>2164</td>\n",
       "      <td>90023</td>\n",
       "      <td>Fri Sep 17 20:37:31 +0000 2010</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>759.00</td>\n",
       "      <td>206.00</td>\n",
       "      <td>430.00</td>\n",
       "      <td>5,331.00</td>\n",
       "      <td>6,726.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>259395895</th>\n",
       "      <td>johnjharwood</td>\n",
       "      <td>Harwood, John</td>\n",
       "      <td>CNBC</td>\n",
       "      <td>Chief Washington Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>139370</td>\n",
       "      <td>1157</td>\n",
       "      <td>73724</td>\n",
       "      <td>Tue Mar 01 20:49:40 +0000 2011</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>825.00</td>\n",
       "      <td>487.00</td>\n",
       "      <td>83.00</td>\n",
       "      <td>3,307.00</td>\n",
       "      <td>4,702.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14529929</th>\n",
       "      <td>jaketapper</td>\n",
       "      <td>Tapper, Jake</td>\n",
       "      <td>CNN</td>\n",
       "      <td>Anchor &amp; Chief Washington Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>1238317</td>\n",
       "      <td>5664</td>\n",
       "      <td>144300</td>\n",
       "      <td>Fri Apr 25 17:23:28 +0000 2008</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>1,162.00</td>\n",
       "      <td>266.00</td>\n",
       "      <td>645.00</td>\n",
       "      <td>1,295.00</td>\n",
       "      <td>3,368.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15486163</th>\n",
       "      <td>simonmarksfsn</td>\n",
       "      <td>Marks, Simon</td>\n",
       "      <td>Feature Story News</td>\n",
       "      <td>President &amp; Chief Correspondent</td>\n",
       "      <td>M</td>\n",
       "      <td>7622</td>\n",
       "      <td>3632</td>\n",
       "      <td>39421</td>\n",
       "      <td>Fri Jul 18 20:45:38 +0000 2008</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>1,191.00</td>\n",
       "      <td>613.00</td>\n",
       "      <td>189.00</td>\n",
       "      <td>1,017.00</td>\n",
       "      <td>3,010.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19576571</th>\n",
       "      <td>jaredrizzi</td>\n",
       "      <td>Rizzi, Jared</td>\n",
       "      <td>Sirius XM Satellite Radio</td>\n",
       "      <td>White House Correspondent, SXMPOTUS</td>\n",
       "      <td>M</td>\n",
       "      <td>12277</td>\n",
       "      <td>5924</td>\n",
       "      <td>38049</td>\n",
       "      <td>Tue Jan 27 04:09:53 +0000 2009</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>645.00</td>\n",
       "      <td>858.00</td>\n",
       "      <td>1,393.00</td>\n",
       "      <td>2,050.00</td>\n",
       "      <td>4,946.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2453025128</th>\n",
       "      <td>gloriaminott</td>\n",
       "      <td>Minott, Gloria</td>\n",
       "      <td>WPFW–FM</td>\n",
       "      <td>Journalist and Radio Host</td>\n",
       "      <td>F</td>\n",
       "      <td>468</td>\n",
       "      <td>232</td>\n",
       "      <td>45438</td>\n",
       "      <td>Sat Apr 19 12:03:52 +0000 2014</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>8,855.00</td>\n",
       "      <td>8,856.00</td>\n",
       "      <td>Top 1%</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               screen_name                   name  \\\n",
       "user_id                                             \n",
       "456994513    maria_e_recio           Recio, Maria   \n",
       "22891564      chrisgeidner         Geidner, Chris   \n",
       "21810329           sdonnan          Donnan, Shawn   \n",
       "19545932           kampeas           Kampeas, Ron   \n",
       "47408060    jonathanlanday       Landay, Jonathan   \n",
       "3817401         ericgeller           Geller, Eric   \n",
       "593813785     donnayoungdc           Young, Donna   \n",
       "104299137    davidmdrucker         Drucker, David   \n",
       "61734492       fahrenthold     Fahrenthold, David   \n",
       "13524182        daveweigel          Weigel, David   \n",
       "25702314       ericmgarcia        Garcia, Eric M.   \n",
       "18825339         cahnemily            Cahn, Emily   \n",
       "21612122       hotlinejosh     Kraushaar, Josh P.   \n",
       "21696279      brianbeutler  Beutler, Brian Alfred   \n",
       "16459325      ryanbeckwith  Beckwith, Ryan Teague   \n",
       "42352386           rschles    Schlesinger, Robert   \n",
       "304988603      neilwmccabe           McCabe, Neil   \n",
       "191964162     samlitzinger         Litzinger, Sam   \n",
       "259395895     johnjharwood          Harwood, John   \n",
       "14529929        jaketapper           Tapper, Jake   \n",
       "15486163     simonmarksfsn           Marks, Simon   \n",
       "19576571        jaredrizzi           Rizzi, Jared   \n",
       "2453025128    gloriaminott         Minott, Gloria   \n",
       "\n",
       "                              organization  \\\n",
       "user_id                                      \n",
       "456994513        Austin American-Statesman   \n",
       "22891564                          BuzzFeed   \n",
       "21810329                   Financial Times   \n",
       "19545932         Jewish Telegraphic Agency   \n",
       "47408060              McClatchy Newspapers   \n",
       "3817401                           Politico   \n",
       "593813785   S&P Global Market Intelligence   \n",
       "104299137              Washington Examiner   \n",
       "61734492                   Washington Post   \n",
       "13524182                   Washington Post   \n",
       "25702314                      CQ Roll Call   \n",
       "18825339                               Mic   \n",
       "21612122                  National Journal   \n",
       "21696279                      New Republic   \n",
       "16459325                     Time Magazine   \n",
       "42352386          U.S. News & World Report   \n",
       "304988603                   Breitbart News   \n",
       "191964162                         CBS News   \n",
       "259395895                             CNBC   \n",
       "14529929                               CNN   \n",
       "15486163                Feature Story News   \n",
       "19576571         Sirius XM Satellite Radio   \n",
       "2453025128                         WPFW–FM   \n",
       "\n",
       "                                              position gender  \\\n",
       "user_id                                                         \n",
       "456994513                           Political Reporter      F   \n",
       "22891564    Legal Editor & Supreme Court Correspondent      M   \n",
       "21810329                            Wolrd Trade Editor      M   \n",
       "19545932                       Washington Bureau Chief      M   \n",
       "47408060               National Security Correspondent      M   \n",
       "3817401                         Cybersecurity Reporter      M   \n",
       "593813785                              Senior Reporter      F   \n",
       "104299137               Senior Political Correspondent      M   \n",
       "61734492                           Political Reporter       M   \n",
       "13524182                            Political Reporter      M   \n",
       "25702314                                      Reporter      M   \n",
       "18825339                        Senior Politics Writer      F   \n",
       "21612122                               Politics Editor      M   \n",
       "21696279                                 Senior Editor      M   \n",
       "16459325                               Politics Editor      M   \n",
       "42352386                      Managing Editor, Opinion      M   \n",
       "304988603                      Political Correspondent      M   \n",
       "191964162                     Correspondent, CBS Radio      M   \n",
       "259395895               Chief Washington Correspondent      M   \n",
       "14529929       Anchor & Chief Washington Correspondent      M   \n",
       "15486163               President & Chief Correspondent      M   \n",
       "19576571           White House Correspondent, SXMPOTUS      M   \n",
       "2453025128                  Journalist and Radio Host       F   \n",
       "\n",
       "            followers_count  following_count  tweet_count  \\\n",
       "user_id                                                     \n",
       "456994513              1039              530        38464   \n",
       "22891564              78631             4767       201131   \n",
       "21810329              11693             5428        75733   \n",
       "19545932               6901             1952        50954   \n",
       "47408060              11126             1093        78318   \n",
       "3817401               52569              732       201279   \n",
       "593813785              5654             1621        46571   \n",
       "104299137             32966             2475       101229   \n",
       "61734492             419647             3341        25457   \n",
       "13524182             318915            10169       166821   \n",
       "25702314               2960             3748        42198   \n",
       "18825339              16181             2118        95033   \n",
       "21612122              49151             1456       152116   \n",
       "21696279              71586              722        96050   \n",
       "16459325              20241             6826        88797   \n",
       "42352386               4426             1910        34044   \n",
       "304988603             18991             7699        57983   \n",
       "191964162              2302             2164        90023   \n",
       "259395895            139370             1157        73724   \n",
       "14529929            1238317             5664       144300   \n",
       "15486163               7622             3632        39421   \n",
       "19576571              12277             5924        38049   \n",
       "2453025128              468              232        45438   \n",
       "\n",
       "                           user_created_at verified protected  \\\n",
       "user_id                                                         \n",
       "456994513   Fri Jan 06 22:22:40 +0000 2012    False     False   \n",
       "22891564    Thu Mar 05 06:48:00 +0000 2009     True     False   \n",
       "21810329    Tue Feb 24 23:10:17 +0000 2009     True     False   \n",
       "19545932    Mon Jan 26 17:37:58 +0000 2009    False     False   \n",
       "47408060    Mon Jun 15 18:42:47 +0000 2009     True     False   \n",
       "3817401     Sun Apr 08 20:27:11 +0000 2007     True     False   \n",
       "593813785   Tue May 29 15:45:45 +0000 2012    False     False   \n",
       "104299137   Tue Jan 12 22:56:50 +0000 2010     True     False   \n",
       "61734492    Fri Jul 31 09:29:37 +0000 2009     True     False   \n",
       "13524182    Fri Feb 15 17:58:23 +0000 2008     True     False   \n",
       "25702314    Sat Mar 21 17:44:40 +0000 2009    False     False   \n",
       "18825339    Sat Jan 10 03:19:50 +0000 2009     True     False   \n",
       "21612122    Sun Feb 22 23:45:46 +0000 2009     True     False   \n",
       "21696279    Mon Feb 23 21:31:16 +0000 2009     True     False   \n",
       "16459325    Thu Sep 25 22:43:36 +0000 2008     True     False   \n",
       "42352386    Mon May 25 04:52:44 +0000 2009     True     False   \n",
       "304988603   Wed May 25 13:09:32 +0000 2011    False     False   \n",
       "191964162   Fri Sep 17 20:37:31 +0000 2010    False     False   \n",
       "259395895   Tue Mar 01 20:49:40 +0000 2011     True     False   \n",
       "14529929    Fri Apr 25 17:23:28 +0000 2008     True     False   \n",
       "15486163    Fri Jul 18 20:45:38 +0000 2008    False     False   \n",
       "19576571    Tue Jan 27 04:09:53 +0000 2009     True     False   \n",
       "2453025128  Sat Apr 19 12:03:52 +0000 2014    False     False   \n",
       "\n",
       "                       original                quote                reply  \\\n",
       "user_id                                                                     \n",
       "456994513                261.00               291.00               108.00   \n",
       "22891564                 592.00               475.00             2,850.00   \n",
       "21810329                 203.00               374.00               152.00   \n",
       "19545932                 506.00               349.00               202.00   \n",
       "47408060                 418.00                41.00                70.00   \n",
       "3817401                  820.00             1,435.00             7,328.00   \n",
       "593813785              1,095.00               885.00                 9.00   \n",
       "104299137                611.00             1,122.00               517.00   \n",
       "61734492                 115.00               142.00                63.00   \n",
       "13524182                 712.00               784.00               242.00   \n",
       "25702314                 441.00             1,188.00               575.00   \n",
       "18825339               1,205.00             1,440.00               279.00   \n",
       "21612122                 395.00               643.00               338.00   \n",
       "21696279                 475.00               546.00               714.00   \n",
       "16459325                 843.00               529.00               753.00   \n",
       "42352386                 122.00               590.00                56.00   \n",
       "304988603                682.00               616.00               227.00   \n",
       "191964162                759.00               206.00               430.00   \n",
       "259395895                825.00               487.00                83.00   \n",
       "14529929               1,162.00               266.00               645.00   \n",
       "15486163               1,191.00               613.00               189.00   \n",
       "19576571                 645.00               858.00             1,393.00   \n",
       "2453025128                 0.00                 0.00                 1.00   \n",
       "\n",
       "                        retweet    tweets_in_dataset tweets_in_dataset_bin  \n",
       "user_id                                                                     \n",
       "456994513              3,204.00             3,864.00                Top 1%  \n",
       "22891564                 750.00             4,667.00                Top 1%  \n",
       "21810329               2,792.00             3,521.00                Top 1%  \n",
       "19545932               2,027.00             3,084.00                Top 1%  \n",
       "47408060               2,352.00             2,881.00                Top 1%  \n",
       "3817401                    0.00             9,583.00                Top 1%  \n",
       "593813785              1,169.00             3,158.00                Top 1%  \n",
       "104299137                934.00             3,184.00                Top 1%  \n",
       "61734492               2,333.00             2,653.00                Top 1%  \n",
       "13524182               2,155.00             3,893.00                Top 1%  \n",
       "25702314                 405.00             2,609.00                Top 1%  \n",
       "18825339               3,459.00             6,383.00                Top 1%  \n",
       "21612122               4,302.00             5,678.00                Top 1%  \n",
       "21696279               2,122.00             3,857.00                Top 1%  \n",
       "16459325               1,778.00             3,903.00                Top 1%  \n",
       "42352386               2,206.00             2,974.00                Top 1%  \n",
       "304988603              4,444.00             5,969.00                Top 1%  \n",
       "191964162              5,331.00             6,726.00                Top 1%  \n",
       "259395895              3,307.00             4,702.00                Top 1%  \n",
       "14529929               1,295.00             3,368.00                Top 1%  \n",
       "15486163               1,017.00             3,010.00                Top 1%  \n",
       "19576571               2,050.00             4,946.00                Top 1%  \n",
       "2453025128             8,855.00             8,856.00                Top 1%  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show every user whose tweets_in_dataset_bin label is 'Top 1%'.\n",
    "is_top_one_percent = user_summary_df['tweets_in_dataset_bin'] == 'Top 1%'\n",
    "user_summary_df[is_top_one_percent]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>original</th>\n",
       "      <th>quote</th>\n",
       "      <th>reply</th>\n",
       "      <th>retweet</th>\n",
       "      <th>tweets_in_dataset</th>\n",
       "      <th>percent_of_original</th>\n",
       "      <th>percent_of_quote</th>\n",
       "      <th>percent_of_reply</th>\n",
       "      <th>percent_of_retweets</th>\n",
       "      <th>percent_of_tweets_in_dataset</th>\n",
       "      <th>users_in_bin</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tweets_in_dataset_bin</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Bottom 90%</th>\n",
       "      <td>118,274.00</td>\n",
       "      <td>36,419.00</td>\n",
       "      <td>31,546.00</td>\n",
       "      <td>116,400.00</td>\n",
       "      <td>302,639.00</td>\n",
       "      <td>0.60</td>\n",
       "      <td>0.44</td>\n",
       "      <td>0.34</td>\n",
       "      <td>0.43</td>\n",
       "      <td>0.47</td>\n",
       "      <td>2043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Middle 9%</th>\n",
       "      <td>65,947.00</td>\n",
       "      <td>33,018.00</td>\n",
       "      <td>43,692.00</td>\n",
       "      <td>97,456.00</td>\n",
       "      <td>240,113.00</td>\n",
       "      <td>0.33</td>\n",
       "      <td>0.40</td>\n",
       "      <td>0.47</td>\n",
       "      <td>0.36</td>\n",
       "      <td>0.37</td>\n",
       "      <td>206</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Top 1%</th>\n",
       "      <td>14,078.00</td>\n",
       "      <td>13,880.00</td>\n",
       "      <td>17,224.00</td>\n",
       "      <td>58,287.00</td>\n",
       "      <td>103,469.00</td>\n",
       "      <td>0.07</td>\n",
       "      <td>0.17</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.21</td>\n",
       "      <td>0.16</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                  original                quote  \\\n",
       "tweets_in_dataset_bin                                             \n",
       "Bottom 90%                      118,274.00            36,419.00   \n",
       "Middle 9%                        65,947.00            33,018.00   \n",
       "Top 1%                           14,078.00            13,880.00   \n",
       "\n",
       "                                     reply              retweet  \\\n",
       "tweets_in_dataset_bin                                             \n",
       "Bottom 90%                       31,546.00           116,400.00   \n",
       "Middle 9%                        43,692.00            97,456.00   \n",
       "Top 1%                           17,224.00            58,287.00   \n",
       "\n",
       "                         tweets_in_dataset  percent_of_original  \\\n",
       "tweets_in_dataset_bin                                             \n",
       "Bottom 90%                      302,639.00                 0.60   \n",
       "Middle 9%                       240,113.00                 0.33   \n",
       "Top 1%                          103,469.00                 0.07   \n",
       "\n",
       "                          percent_of_quote     percent_of_reply  \\\n",
       "tweets_in_dataset_bin                                             \n",
       "Bottom 90%                            0.44                 0.34   \n",
       "Middle 9%                             0.40                 0.47   \n",
       "Top 1%                                0.17                 0.19   \n",
       "\n",
       "                       percent_of_retweets  percent_of_tweets_in_dataset  \\\n",
       "tweets_in_dataset_bin                                                      \n",
       "Bottom 90%                            0.43                          0.47   \n",
       "Middle 9%                             0.36                          0.37   \n",
       "Top 1%                                0.21                          0.16   \n",
       "\n",
       "                       users_in_bin  \n",
       "tweets_in_dataset_bin                \n",
       "Bottom 90%                     2043  \n",
       "Middle 9%                       206  \n",
       "Top 1%                           23  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-bin totals of each tweet type, plus each bin's share of the overall total.\n",
    "# NOTE: the 'percent_of_*' columns hold fractions in [0, 1] (e.g. 0.60), not\n",
    "# percentages; the names are kept so existing references keep working.\n",
    "share_columns = [\n",
    "    ('original', 'percent_of_original'),\n",
    "    ('quote', 'percent_of_quote'),\n",
    "    ('reply', 'percent_of_reply'),\n",
    "    ('retweet', 'percent_of_retweets'),\n",
    "    ('tweets_in_dataset', 'percent_of_tweets_in_dataset'),\n",
    "]\n",
    "tweet_count_columns = ['original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']\n",
    "\n",
    "# Sum the per-user counts within each tweets_in_dataset_bin.\n",
    "tweets_in_dataset_bin_summary_df = user_summary_df[tweet_count_columns + ['tweets_in_dataset_bin']].groupby('tweets_in_dataset_bin').sum()\n",
    "\n",
    "# Divide each bin's total by the column total to get that bin's share.\n",
    "for count_column, share_column in share_columns:\n",
    "    bin_totals = tweets_in_dataset_bin_summary_df[count_column]\n",
    "    tweets_in_dataset_bin_summary_df[share_column] = bin_totals / bin_totals.sum()\n",
    "\n",
    "# Users per bin: count of non-null tweets_in_dataset values in each bin. Selecting the\n",
    "# column first yields a Series, so a plain column (not a one-column DataFrame) is assigned.\n",
    "tweets_in_dataset_bin_summary_df['users_in_bin'] = user_summary_df.groupby('tweets_in_dataset_bin')['tweets_in_dataset'].count()\n",
    "tweets_in_dataset_bin_summary_df\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## User summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>followers_count</th>\n",
       "      <th>following_count</th>\n",
       "      <th>tweet_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2,484.00</td>\n",
       "      <td>2,484.00</td>\n",
       "      <td>2,484.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>14,644.39</td>\n",
       "      <td>1,344.52</td>\n",
       "      <td>8,760.62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>84,477.36</td>\n",
       "      <td>2,805.21</td>\n",
       "      <td>15,836.17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>659.00</td>\n",
       "      <td>428.00</td>\n",
       "      <td>1,001.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2,114.00</td>\n",
       "      <td>933.00</td>\n",
       "      <td>3,578.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6,611.00</td>\n",
       "      <td>1,621.50</td>\n",
       "      <td>9,572.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>2,133,806.00</td>\n",
       "      <td>94,689.00</td>\n",
       "      <td>201,279.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           followers_count      following_count          tweet_count\n",
       "count             2,484.00             2,484.00             2,484.00\n",
       "mean             14,644.39             1,344.52             8,760.62\n",
       "std              84,477.36             2,805.21            15,836.17\n",
       "min                   0.00                 0.00                 0.00\n",
       "25%                 659.00               428.00             1,001.25\n",
       "50%               2,114.00               933.00             3,578.00\n",
       "75%               6,611.00             1,621.50             9,572.00\n",
       "max           2,133,806.00            94,689.00           201,279.00"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics for the three per-user count columns.\n",
    "account_count_columns = ['followers_count', 'following_count', 'tweet_count']\n",
    "user_summary_df[account_count_columns].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Gender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "M    1398\n",
       "F    1085\n",
       "Name: gender, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tally users by the value in the gender column\n",
    "user_summary_df.loc[:, 'gender'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Organization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top by average followers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">followers_count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>size</th>\n",
       "      <th>average</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>organization</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>MSNBC</th>\n",
       "      <td>1732992</td>\n",
       "      <td>7</td>\n",
       "      <td>247,570.29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Toronto Star</th>\n",
       "      <td>165056</td>\n",
       "      <td>1</td>\n",
       "      <td>165,056.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>New York</th>\n",
       "      <td>125754</td>\n",
       "      <td>1</td>\n",
       "      <td>125,754.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>New Yorker</th>\n",
       "      <td>125180</td>\n",
       "      <td>1</td>\n",
       "      <td>125,180.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MTV News</th>\n",
       "      <td>101473</td>\n",
       "      <td>1</td>\n",
       "      <td>101,473.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             followers_count                          \n",
       "                         sum size              average\n",
       "organization                                          \n",
       "MSNBC                1732992    7           247,570.29\n",
       "Toronto Star          165056    1           165,056.00\n",
       "New York              125754    1           125,754.00\n",
       "New Yorker            125180    1           125,180.00\n",
       "MTV News              101473    1           101,473.00"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five organizations with the highest average followers_count\n",
    "(\n",
    "    org_summary_df[['followers_count']]\n",
    "    .sort_values(by=[('followers_count', 'average')], ascending=False)\n",
    "    .head(5)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top by average following"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">following_count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>size</th>\n",
       "      <th>average</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>organization</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>White House Dossier</th>\n",
       "      <td>7441</td>\n",
       "      <td>1</td>\n",
       "      <td>7,441.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Snapchat</th>\n",
       "      <td>6019</td>\n",
       "      <td>1</td>\n",
       "      <td>6,019.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bankrate</th>\n",
       "      <td>5853</td>\n",
       "      <td>1</td>\n",
       "      <td>5,853.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>New York Daily News</th>\n",
       "      <td>4288</td>\n",
       "      <td>1</td>\n",
       "      <td>4,288.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Texas Tribune</th>\n",
       "      <td>3935</td>\n",
       "      <td>1</td>\n",
       "      <td>3,935.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    following_count                          \n",
       "                                sum size              average\n",
       "organization                                                 \n",
       "White House Dossier            7441    1             7,441.00\n",
       "Snapchat                       6019    1             6,019.00\n",
       "Bankrate                       5853    1             5,853.00\n",
       "New York Daily News            4288    1             4,288.00\n",
       "Texas Tribune                  3935    1             3,935.00"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five organizations with the highest average following_count\n",
    "(\n",
    "    org_summary_df[['following_count']]\n",
    "    .sort_values(by=[('following_count', 'average')], ascending=False)\n",
    "    .head(5)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top by average tweet count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">tweet_count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>size</th>\n",
       "      <th>average</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>organization</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>New Republic</th>\n",
       "      <td>96050</td>\n",
       "      <td>1</td>\n",
       "      <td>96,050.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mic</th>\n",
       "      <td>95033</td>\n",
       "      <td>1</td>\n",
       "      <td>95,033.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Yahoo News</th>\n",
       "      <td>93714</td>\n",
       "      <td>1</td>\n",
       "      <td>93,714.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MTV News</th>\n",
       "      <td>80962</td>\n",
       "      <td>1</td>\n",
       "      <td>80,962.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ProPublica</th>\n",
       "      <td>78207</td>\n",
       "      <td>1</td>\n",
       "      <td>78,207.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             tweet_count                          \n",
       "                     sum size              average\n",
       "organization                                      \n",
       "New Republic       96050    1            96,050.00\n",
       "Mic                95033    1            95,033.00\n",
       "Yahoo News         93714    1            93,714.00\n",
       "MTV News           80962    1            80,962.00\n",
       "ProPublica         78207    1            78,207.00"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five organizations with the highest average tweet_count\n",
    "(\n",
    "    org_summary_df[['tweet_count']]\n",
    "    .sort_values(by=[('tweet_count', 'average')], ascending=False)\n",
    "    .head(5)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Top by number of tweets in dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">tweets_in_dataset</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>size</th>\n",
       "      <th>average</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>organization</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Politico</th>\n",
       "      <td>43,669.00</td>\n",
       "      <td>103.00</td>\n",
       "      <td>423.97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CNN</th>\n",
       "      <td>33,868.00</td>\n",
       "      <td>149.00</td>\n",
       "      <td>227.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Washington Post</th>\n",
       "      <td>22,621.00</td>\n",
       "      <td>60.00</td>\n",
       "      <td>377.02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bloomberg News</th>\n",
       "      <td>17,558.00</td>\n",
       "      <td>75.00</td>\n",
       "      <td>234.11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CBS News</th>\n",
       "      <td>17,036.00</td>\n",
       "      <td>61.00</td>\n",
       "      <td>279.28</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   tweets_in_dataset                                          \n",
       "                                 sum                 size              average\n",
       "organization                                                                  \n",
       "Politico                   43,669.00               103.00               423.97\n",
       "CNN                        33,868.00               149.00               227.30\n",
       "Washington Post            22,621.00                60.00               377.02\n",
       "Bloomberg News             17,558.00                75.00               234.11\n",
       "CBS News                   17,036.00                61.00               279.28"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five organizations with the largest tweets_in_dataset sum\n",
    "(\n",
    "    org_summary_df[['tweets_in_dataset']]\n",
    "    .sort_values(by=[('tweets_in_dataset', 'sum')], ascending=False)\n",
    "    .head(5)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## First tweet for each user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tweet_id            2293\n",
       "screen_name         2293\n",
       "tweet_created_at    2293\n",
       "user_created_at     2293\n",
       "tweets_to_date      2293\n",
       "tweet_type          2293\n",
       "dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# For every user_id, keep only the tweet_df row with the smallest\n",
    "# tweet_created_at, then index the result by user_id.\n",
    "first_tweet_df = (\n",
    "    tweet_df\n",
    "    .loc[tweet_df.groupby('user_id')['tweet_created_at'].idxmin()]\n",
    "    .set_index(['user_id'])\n",
    ")\n",
    "first_tweet_df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tweet_id</th>\n",
       "      <th>screen_name</th>\n",
       "      <th>tweet_created_at</th>\n",
       "      <th>user_created_at</th>\n",
       "      <th>tweets_to_date</th>\n",
       "      <th>tweet_type</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>16338087</th>\n",
       "      <td>876092563563958272</td>\n",
       "      <td>AbbyDanzig</td>\n",
       "      <td>2017-06-17 15:01:58+00:00</td>\n",
       "      <td>2008-09-17 22:10:27+00:00</td>\n",
       "      <td>1542</td>\n",
       "      <td>retweet</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3901972468</th>\n",
       "      <td>875730040750604288</td>\n",
       "      <td>jchamseddine10</td>\n",
       "      <td>2017-06-16 15:01:26+00:00</td>\n",
       "      <td>2015-10-08 18:44:17+00:00</td>\n",
       "      <td>605</td>\n",
       "      <td>original</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198935531</th>\n",
       "      <td>875477217895231488</td>\n",
       "      <td>CarrieStevenson</td>\n",
       "      <td>2017-06-15 22:16:48+00:00</td>\n",
       "      <td>2010-10-05 16:30:31+00:00</td>\n",
       "      <td>438</td>\n",
       "      <td>original</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267210696</th>\n",
       "      <td>875005803283050496</td>\n",
       "      <td>PeteBehrEENews</td>\n",
       "      <td>2017-06-14 15:03:34+00:00</td>\n",
       "      <td>2011-03-16 14:28:09+00:00</td>\n",
       "      <td>24</td>\n",
       "      <td>original</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>425112739</th>\n",
       "      <td>874967586085244930</td>\n",
       "      <td>jzieglerWTOP</td>\n",
       "      <td>2017-06-14 12:31:43+00:00</td>\n",
       "      <td>2011-11-30 15:37:28+00:00</td>\n",
       "      <td>815</td>\n",
       "      <td>retweet</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      tweet_id      screen_name          tweet_created_at  \\\n",
       "user_id                                                                     \n",
       "16338087    876092563563958272       AbbyDanzig 2017-06-17 15:01:58+00:00   \n",
       "3901972468  875730040750604288   jchamseddine10 2017-06-16 15:01:26+00:00   \n",
       "198935531   875477217895231488  CarrieStevenson 2017-06-15 22:16:48+00:00   \n",
       "267210696   875005803283050496   PeteBehrEENews 2017-06-14 15:03:34+00:00   \n",
       "425112739   874967586085244930     jzieglerWTOP 2017-06-14 12:31:43+00:00   \n",
       "\n",
       "                     user_created_at  tweets_to_date tweet_type  \n",
       "user_id                                                          \n",
       "16338087   2008-09-17 22:10:27+00:00            1542    retweet  \n",
       "3901972468 2015-10-08 18:44:17+00:00             605   original  \n",
       "198935531  2010-10-05 16:30:31+00:00             438   original  \n",
       "267210696  2011-03-16 14:28:09+00:00              24   original  \n",
       "425112739  2011-11-30 15:37:28+00:00             815    retweet  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the five first-tweet rows with the latest tweet_created_at\n",
    "(\n",
    "    first_tweet_df\n",
    "    .sort_values(by='tweet_created_at', ascending=False)\n",
    "    .head(5)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Most recent first tweet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timestamp('2017-06-17 15:01:58+0000', tz='UTC')"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Latest tweet_created_at among the per-user first tweets\n",
    "first_tweet_df.loc[:, 'tweet_created_at'].max()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}