# Tweet summary

## Prepare the tweet data

### Load the tweets

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt


# Configure the root logger at DEBUG level so that both INFO and DEBUG
# messages logged on the root logger are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Set the float display format so that pandas does not use scientific notation:
# fixed-point, two decimals, thousands separators, padded to a width of 20.
pd.options.display.float_format = '{:20,.2f}'.format

def tweet_transform(tweet):
    """Flatten one raw tweet dict into the flat record used by this notebook.

    Reads ``id_str`` and ``created_at`` from the tweet and ``id_str``,
    ``screen_name``, ``created_at`` and ``statuses_count`` from its nested
    ``user`` dict. Both ``created_at`` values are parsed with ``date_parse``;
    ``tweet_type`` is whatever ``utils.tweet_type`` returns for the tweet.

    Returns a dict with the keys ``tweet_id``, ``tweet_created_at``,
    ``user_id``, ``screen_name``, ``user_created_at``, ``tweets_to_date``
    and ``tweet_type``.
    """
    record = {}
    record['tweet_id'] = tweet['id_str']
    record['tweet_created_at'] = date_parse(tweet['created_at'])

    author = tweet['user']
    record['user_id'] = author['id_str']
    record['screen_name'] = author['screen_name']
    record['user_created_at'] = date_parse(author['created_at'])
    # The user's total status count as reported in this tweet's user object.
    record['tweets_to_date'] = author['statuses_count']

    record['tweet_type'] = tweet_type(tweet)
    return record

# Column names passed to the loader; they are the same keys that
# tweet_transform() puts in each record.
tweet_columns = [
    'tweet_id',
    'user_id',
    'screen_name',
    'tweet_created_at',
    'user_created_at',
    'tweets_to_date',
    'tweet_type',
]
tweet_df = load_tweet_df(tweet_transform, tweet_columns)
# Non-null count per column.
tweet_df.count()

INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_002.json.gz
INFO:root:Loading from tweets/7bff8603fb4a49d5953197361d548346_001.json.gz
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
INFO:root:Loading from tweets/b3f330f5b6cc4572b6d7dabc3752b2b9_001.json.gz
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000


tweet_id            650350
user_id             650350
screen_name         650350
tweet_created_at    650350
user_created_at     650350
tweets_to_date      650350
tweet_type          650350
dtype: int64

### View the top of the data.

In [2]:
# Preview the first rows (DataFrame.head() returns 5 rows by default).
tweet_df.head()

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
0,859463382042378240,2343897943,AmberCStrong,2017-05-02 17:43:32+00:00,2014-02-14 17:33:36+00:00,1701,original
1,859803200152588288,307982591,JaxAlemany,2017-05-03 16:13:51+00:00,2011-05-30 16:43:13+00:00,6328,original
2,859788527705493504,307982591,JaxAlemany,2017-05-03 15:15:33+00:00,2011-05-30 16:43:13+00:00,6328,quote
3,859788479076732930,307982591,JaxAlemany,2017-05-03 15:15:22+00:00,2011-05-30 16:43:13+00:00,6328,original
4,859781841955500032,307982591,JaxAlemany,2017-05-03 14:48:59+00:00,2011-05-30 16:43:13+00:00,6328,retweet


## Prepare the user data

### Tweets in dataset for each user

In [3]:
# The tweet types that are summed into each user's total below.
TWEET_TYPES = ['original', 'quote', 'reply', 'retweet']

# Count tweets per (user, tweet type), then pivot the types into columns:
# one row per user_id, one column per tweet type.
tweet_type_counts = tweet_df.groupby(['user_id', 'tweet_type']).size().unstack()

# unstack() only creates columns for types that actually occur in the data, so
# add any of the four expected columns that are missing; otherwise the total
# below would fail on a dataset lacking one type. User/type combinations with
# no tweets are NaN after the pivot and become 0.
user_tweet_count_df = (
    tweet_type_counts
    .reindex(columns=tweet_type_counts.columns.union(TWEET_TYPES))
    .fillna(0)
)

# Total tweets in the dataset per user.
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df[TWEET_TYPES].sum(axis=1)

# Bin users by the quantile of their tweet total: [0, 90%], (90%, 99%], (99%, 100%].
user_tweet_count_df['tweets_in_dataset_bin'] = pd.qcut(
    user_tweet_count_df.tweets_in_dataset,
    [0, .9, .99, 1.],
    labels=['Bottom 90%', 'Middle 9%', 'Top 1%'],
)
user_tweet_count_df.head()

tweet_type,original,quote,reply,retweet,tweets_in_dataset,tweets_in_dataset_bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1001991865,12.0,1.0,3.0,35.0,51.0,Bottom 90%
1002229862,35.0,5.0,2.0,99.0,141.0,Bottom 90%
100802089,4.0,3.0,5.0,12.0,24.0,Bottom 90%
100860790,117.0,19.0,9.0,215.0,360.0,Bottom 90%
1009749229,79.0,85.0,34.0,156.0,354.0,Bottom 90%


### Load and join user info
This information was either coded by hand in the spreadsheet or looked up for each user via the API.

In [4]:
# Column names are supplied explicitly via names= rather than read from the file.
user_info_columns = [
    'screen_name', 'user_id', 'name', 'organization', 'position',
    'gender', 'followers_count', 'following_count', 'tweet_count',
    'user_created_at', 'verified', 'protected',
]
user_info_df = pd.read_csv(
    'source_data/user_info_lookup.csv',
    names=user_info_columns,
    # Read user_id as a string rather than letting pandas infer a numeric type.
    dtype={'user_id': str},
)
user_info_df = user_info_df.set_index('user_id')
# Non-null count per column.
user_info_df.count()

screen_name        2484
name               2484
organization       2455
position           2481
gender             2483
followers_count    2484
following_count    2484
tweet_count        2484
user_created_at    2484
verified           2484
protected          2484
dtype: int64

In [5]:
# Preview the user lookup data; rows are indexed by user_id.
user_info_df.head()

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20711445,ninglin,"Glinski, Nina",,Freelance Reporter,F,968,507,909,Thu Feb 12 20:00:53 +0000 2009,False,False
258917371,davidjenders,"Enders, David",,Journalist,M,1451,480,6299,Mon Feb 28 19:52:03 +0000 2011,True,False
297046834,mattbarakat,"Barakat, Matthew",Associated Press,Northern Virginia Correspondent,M,754,349,620,Wed May 11 20:55:24 +0000 2011,True,False
455585786,kimberlyeatkins,"Atkins, Kimberly",Boston Herald,Chief Washington Reporter/Columnist,F,2399,2661,5846,Thu Jan 05 08:26:46 +0000 2012,True,False
42584840,toulavlahou,"Vlahou, Toula",CQ Roll Call,Editor & Podcast Producer,F,2713,198,6325,Tue May 26 07:41:38 +0000 2009,False,False


In [6]:
# Join
user_summary_df = user_info_df.join(user_tweet_count_df, how='left')
# Fill Nans
user_summary_df['organization'].fillna('', inplace=True)
user_summary_df['original'].fillna(0, inplace=True)
user_summary_df['quote'].fillna(0, inplace=True)
user_summary_df['reply'].fillna(0, inplace=True)
user_summary_df['retweet'].fillna(0, inplace=True)
user_summary_df['tweets_in_dataset'].fillna(0, inplace=True)
user_summary_df.count()

screen_name              2484
name                     2484
organization             2484
position                 2481
gender                   2483
followers_count          2484
following_count          2484
tweet_count              2484
user_created_at          2484
verified                 2484
protected                2484
original                 2484
quote                    2484
reply                    2484
retweet                  2484
tweets_in_dataset        2484
tweets_in_dataset_bin    2272
dtype: int64

In [7]:
# Preview the joined data; users with no tweets in the dataset have 0 counts
# and a missing tweets_in_dataset_bin.
user_summary_df.head()

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset,tweets_in_dataset_bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
20711445,ninglin,"Glinski, Nina",,Freelance Reporter,F,968,507,909,Thu Feb 12 20:00:53 +0000 2009,False,False,0.0,0.0,0.0,0.0,0.0,
258917371,davidjenders,"Enders, David",,Journalist,M,1451,480,6299,Mon Feb 28 19:52:03 +0000 2011,True,False,0.0,0.0,0.0,0.0,0.0,
297046834,mattbarakat,"Barakat, Matthew",Associated Press,Northern Virginia Correspondent,M,754,349,620,Wed May 11 20:55:24 +0000 2011,True,False,12.0,0.0,0.0,2.0,14.0,Bottom 90%
455585786,kimberlyeatkins,"Atkins, Kimberly",Boston Herald,Chief Washington Reporter/Columnist,F,2399,2661,5846,Thu Jan 05 08:26:46 +0000 2012,True,False,228.0,144.0,39.0,196.0,607.0,Bottom 90%
42584840,toulavlahou,"Vlahou, Toula",CQ Roll Call,Editor & Podcast Producer,F,2713,198,6325,Tue May 26 07:41:38 +0000 2009,False,False,32.0,25.0,0.0,25.0,82.0,Bottom 90%


### Write to file as output/user_summary.csv

In [8]:
# Write the per-user summary to CSV (the user_id index is included).
# NOTE(review): assumes the output/ directory already exists — confirm.
user_summary_df.to_csv('output/user_summary.csv')

## Prepare the organization data
For each organization, this aggregates (sum, count, and average) the statistics of the users who belong to it.

In [9]:
# Per-user statistics to aggregate, plus the grouping key.
org_stat_columns = [
    'organization',
    'followers_count',
    'following_count',
    'tweet_count',
    'tweets_in_dataset',
]
users_by_org = user_summary_df[org_stat_columns].groupby('organization')

# Aggregate each statistic three ways. The resulting second-level column labels
# ('sum', 'size', 'average') are used by the sort cells further down.
# NOTE(review): newer pandas releases may warn about numpy callables in agg();
# verify the labels stay the same before switching to string aggregator names.
org_summary_df = users_by_org.agg([np.sum, np.size, np.average])
org_summary_df.count()

followers_count    sum        347
                   size       347
                   average    347
following_count    sum        347
                   size       347
                   average    347
tweet_count        sum        347
                   size       347
                   average    347
tweets_in_dataset  sum        347
                   size       347
                   average    347
dtype: int64

In [10]:
# Preview the organization aggregates; columns are (statistic, aggregate) pairs.
org_summary_df.head()

Unnamed: 0_level_0,followers_count,followers_count,followers_count,following_count,following_count,following_count,tweet_count,tweet_count,tweet_count,tweets_in_dataset,tweets_in_dataset,tweets_in_dataset
Unnamed: 0_level_1,sum,size,average,sum,size,average,sum,size,average,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
,57347,29,1977.48,30788,29,1061.66,151441,29,5222.1,2767.0,29.0,95.41
ABC 7,889,1,889.0,1092,1,1092.0,1946,1,1946.0,464.0,1.0,464.0
ABC News,602790,52,11592.12,72154,52,1387.58,372200,52,7157.69,8629.0,52.0,165.94
AP–Broadcast,5305,15,353.67,7974,15,531.6,16794,15,1119.6,527.0,15.0,35.13
Afro American Newspapers,189,1,189.0,202,1,202.0,596,1,596.0,14.0,1.0,14.0


### Write to file as output/organization_summary.csv

In [12]:
# Write the per-organization aggregates to CSV.
# NOTE(review): assumes the output/ directory already exists — confirm.
org_summary_df.to_csv('output/organization_summary.csv')

### List of organizations
**TODO:** The organization names need cleanup; for example, `BuzzFeed` and `Buzzfeed` appear as separate organizations.

In [13]:
# List every organization name (the group keys of org_summary_df). The empty
# string is the group of users whose organization was missing.
org_summary_df.index.tolist()

['',
 'ABC 7',
 'ABC News',
 'AP–Broadcast',
 'Afro American Newspapers',
 'Agence France Presse (AFP–TV)',
 'Agence France-Presse',
 'Agri-Pulse',
 'Air Force Magazine',
 'Alaska Dispatch News',
 'Alaska Public Radio Network',
 'Albuquerque Journal',
 'Aljazeera America',
 'Aljazeera English',
 'Allentown Morning Call',
 'American Banker',
 'American Gaming Association',
 'American Prospect',
 'Argus Media',
 'Army Times',
 'Associated Press',
 'Atlanta Journal-Consitution',
 'Austin American-Statesman',
 'Axios',
 'BBC',
 'Baltimore Sun',
 'Bankrate',
 'Bloomberg BNA',
 'Bloomberg Government',
 'Bloomberg News',
 'Bloomberg TV',
 'Bond Buyer',
 'Boston Globe',
 'Boston Herald',
 'Breitbart News',
 'Broadcasting & Cable',
 'Buffalo News',
 'BuzzFeed',
 'Buzzfeed',
 'CBN News',
 'CBS News',
 'CDC Gaming Reports',
 'CEO Update',
 'CNBC',
 'CNN',
 'CNN International',
 'CNSNews.com',
 'CQ Researcher',
 'CQ Roll Call',
 'CRTV',
 'CTV–Community TV of PG County',
 'Canadian Press',
 'Carrol

## Tweet summary
For all tweets in the dataset.

### Types of tweets

In [14]:
# Number of tweets of each type across the whole dataset, most frequent first.
tweet_df['tweet_type'].value_counts()

retweet     273412
original    199949
reply        93184
quote        83805
Name: tweet_type, dtype: int64

## User tweet summary

### Types of tweets in dataset for each user

In [15]:
# Distribution of per-user counts for each tweet type. Users with no tweets in
# the dataset are included with counts of 0.
user_summary_df[['original', 'quote', 'reply', 'retweet']].describe()

Unnamed: 0,original,quote,reply,retweet
count,2484.0,2484.0,2484.0,2484.0
mean,79.83,33.54,37.22,109.56
std,135.84,90.07,186.34,341.02
min,0.0,0.0,0.0,0.0
25%,5.0,0.0,0.0,3.0
50%,29.0,5.0,3.0,24.0
75%,99.0,28.0,18.0,94.25
max,1579.0,1440.0,7328.0,8855.0


### 1/9/90 rule
For the top 1%, middle 9%, and bottom 90% of users (ranked by their number of tweets in the dataset), the number and types of tweets that each group accounts for.

In [16]:
# Users whose tweets_in_dataset_bin label is 'Top 1%'.
user_summary_df[user_summary_df.tweets_in_dataset_bin == 'Top 1%']

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset,tweets_in_dataset_bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
456994513,maria_e_recio,"Recio, Maria",Austin American-Statesman,Political Reporter,F,1039,530,38464,Fri Jan 06 22:22:40 +0000 2012,False,False,261.0,291.0,108.0,3204.0,3864.0,Top 1%
22891564,chrisgeidner,"Geidner, Chris",BuzzFeed,Legal Editor & Supreme Court Correspondent,M,78631,4767,201131,Thu Mar 05 06:48:00 +0000 2009,True,False,592.0,475.0,2850.0,750.0,4667.0,Top 1%
21810329,sdonnan,"Donnan, Shawn",Financial Times,Wolrd Trade Editor,M,11693,5428,75733,Tue Feb 24 23:10:17 +0000 2009,True,False,203.0,374.0,152.0,2792.0,3521.0,Top 1%
19545932,kampeas,"Kampeas, Ron",Jewish Telegraphic Agency,Washington Bureau Chief,M,6901,1952,50954,Mon Jan 26 17:37:58 +0000 2009,False,False,506.0,349.0,202.0,2027.0,3084.0,Top 1%
47408060,jonathanlanday,"Landay, Jonathan",McClatchy Newspapers,National Security Correspondent,M,11126,1093,78318,Mon Jun 15 18:42:47 +0000 2009,True,False,418.0,41.0,70.0,2352.0,2881.0,Top 1%
3817401,ericgeller,"Geller, Eric",Politico,Cybersecurity Reporter,M,52569,732,201279,Sun Apr 08 20:27:11 +0000 2007,True,False,820.0,1435.0,7328.0,0.0,9583.0,Top 1%
593813785,donnayoungdc,"Young, Donna",S&P Global Market Intelligence,Senior Reporter,F,5654,1621,46571,Tue May 29 15:45:45 +0000 2012,False,False,1095.0,885.0,9.0,1169.0,3158.0,Top 1%
104299137,davidmdrucker,"Drucker, David",Washington Examiner,Senior Political Correspondent,M,32966,2475,101229,Tue Jan 12 22:56:50 +0000 2010,True,False,611.0,1122.0,517.0,934.0,3184.0,Top 1%
61734492,fahrenthold,"Fahrenthold, David",Washington Post,Political Reporter,M,419647,3341,25457,Fri Jul 31 09:29:37 +0000 2009,True,False,115.0,142.0,63.0,2333.0,2653.0,Top 1%
13524182,daveweigel,"Weigel, David",Washington Post,Political Reporter,M,318915,10169,166821,Fri Feb 15 17:58:23 +0000 2008,True,False,712.0,784.0,242.0,2155.0,3893.0,Top 1%


In [17]:
# Sum the per-user tweet counts within each bin.
bin_count_columns = ['original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']
users_by_bin = user_summary_df[bin_count_columns + ['tweets_in_dataset_bin']].groupby('tweets_in_dataset_bin')
tweets_in_dataset_bin_summary_df = users_by_bin.sum()

# For each count column, add the bin's share of that column's total across all
# bins. Despite the "percent_" names, the values are fractions between 0 and 1.
share_columns = [
    ('original', 'percent_of_original'),
    ('quote', 'percent_of_quote'),
    ('reply', 'percent_of_reply'),
    ('retweet', 'percent_of_retweets'),
    ('tweets_in_dataset', 'percent_of_tweets_in_dataset'),
]
for count_column, share_column in share_columns:
    bin_totals = tweets_in_dataset_bin_summary_df[count_column]
    tweets_in_dataset_bin_summary_df[share_column] = bin_totals / bin_totals.sum()

# Number of users in each bin (non-null tweets_in_dataset values per bin).
tweets_in_dataset_bin_summary_df['users_in_bin'] = users_by_bin['tweets_in_dataset'].count()
tweets_in_dataset_bin_summary_df


Unnamed: 0_level_0,original,quote,reply,retweet,tweets_in_dataset,percent_of_original,percent_of_quote,percent_of_reply,percent_of_retweets,percent_of_tweets_in_dataset,users_in_bin
tweets_in_dataset_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Bottom 90%,118274.0,36419.0,31546.0,116400.0,302639.0,0.6,0.44,0.34,0.43,0.47,2043
Middle 9%,65947.0,33018.0,43692.0,97456.0,240113.0,0.33,0.4,0.47,0.36,0.37,206
Top 1%,14078.0,13880.0,17224.0,58287.0,103469.0,0.07,0.17,0.19,0.21,0.16,23


## User summary

In [18]:
# Distribution of the account-level counts taken from the user lookup data.
user_summary_df[['followers_count', 'following_count', 'tweet_count']].describe()

Unnamed: 0,followers_count,following_count,tweet_count
count,2484.0,2484.0,2484.0
mean,14644.39,1344.52,8760.62
std,84477.36,2805.21,15836.17
min,0.0,0.0,0.0
25%,659.0,428.0,1001.25
50%,2114.0,933.0,3578.0
75%,6611.0,1621.5,9572.0
max,2133806.0,94689.0,201279.0


### Gender

In [19]:
# Users per gender code; value_counts() leaves out missing values by default.
user_summary_df['gender'].value_counts()

M    1398
F    1085
Name: gender, dtype: int64

## Organization

### Top by average followers

In [20]:
# Organizations ranked by their members' average follower count, highest first.
(
    org_summary_df[['followers_count']]
    .sort_values(by=[('followers_count', 'average')], ascending=False)
    .head()
)

Unnamed: 0_level_0,followers_count,followers_count,followers_count
Unnamed: 0_level_1,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
MSNBC,1732992,7,247570.29
Toronto Star,165056,1,165056.0
New York,125754,1,125754.0
New Yorker,125180,1,125180.0
MTV News,101473,1,101473.0


### Top by average following

In [21]:
# Organizations ranked by their members' average following count, highest first.
(
    org_summary_df[['following_count']]
    .sort_values(by=[('following_count', 'average')], ascending=False)
    .head()
)

Unnamed: 0_level_0,following_count,following_count,following_count
Unnamed: 0_level_1,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
White House Dossier,7441,1,7441.0
Snapchat,6019,1,6019.0
Bankrate,5853,1,5853.0
New York Daily News,4288,1,4288.0
Texas Tribune,3935,1,3935.0


### Top by average tweet count

In [22]:
# Organizations ranked by their members' average tweet count, highest first.
(
    org_summary_df[['tweet_count']]
    .sort_values(by=[('tweet_count', 'average')], ascending=False)
    .head()
)

Unnamed: 0_level_0,tweet_count,tweet_count,tweet_count
Unnamed: 0_level_1,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
New Republic,96050,1,96050.0
Mic,95033,1,95033.0
Yahoo News,93714,1,93714.0
MTV News,80962,1,80962.0
ProPublica,78207,1,78207.0


### Top by number of tweets in dataset

In [23]:
# Organizations ranked by their members' total number of tweets in the dataset,
# highest first. Unlike the cells above, this sorts on the sum, not the average.
(
    org_summary_df[['tweets_in_dataset']]
    .sort_values(by=[('tweets_in_dataset', 'sum')], ascending=False)
    .head()
)

Unnamed: 0_level_0,tweets_in_dataset,tweets_in_dataset,tweets_in_dataset
Unnamed: 0_level_1,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Politico,43669.0,103.0,423.97
CNN,33868.0,149.0,227.3
Washington Post,22621.0,60.0,377.02
Bloomberg News,17558.0,75.0,234.11
CBS News,17036.0,61.0,279.28


## First tweet for each user

In [24]:
# Get the first tweet for each user
first_tweet_df = tweet_df.loc[tweet_df.groupby('user_id')['tweet_created_at'].idxmin()].set_index(['user_id'])
first_tweet_df.count()

tweet_id            2293
screen_name         2293
tweet_created_at    2293
user_created_at     2293
tweets_to_date      2293
tweet_type          2293
dtype: int64

In [25]:
# Per-user first tweets sorted newest first: the users whose earliest tweet in
# the dataset is the most recent.
first_tweet_df.sort_values('tweet_created_at', ascending=False).head()

Unnamed: 0_level_0,tweet_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
16338087,876092563563958272,AbbyDanzig,2017-06-17 15:01:58+00:00,2008-09-17 22:10:27+00:00,1542,retweet
3901972468,875730040750604288,jchamseddine10,2017-06-16 15:01:26+00:00,2015-10-08 18:44:17+00:00,605,original
198935531,875477217895231488,CarrieStevenson,2017-06-15 22:16:48+00:00,2010-10-05 16:30:31+00:00,438,original
267210696,875005803283050496,PeteBehrEENews,2017-06-14 15:03:34+00:00,2011-03-16 14:28:09+00:00,24,original
425112739,874967586085244930,jzieglerWTOP,2017-06-14 12:31:43+00:00,2011-11-30 15:37:28+00:00,815,retweet


### Most recent first tweet

In [26]:
# Latest creation time among the per-user first tweets.
first_tweet_df['tweet_created_at'].max()

Timestamp('2017-06-17 15:01:58+0000', tz='UTC')