{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Gender dynamics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tweet data prep" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the tweets" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz\n", "DEBUG:root:Loaded 50000\n", "DEBUG:root:Loaded 100000\n", "DEBUG:root:Loaded 150000\n", "DEBUG:root:Loaded 200000\n", "DEBUG:root:Loaded 250000\n", "INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz\n", "DEBUG:root:Loaded 300000\n", "DEBUG:root:Loaded 350000\n", "DEBUG:root:Loaded 400000\n", "DEBUG:root:Loaded 450000\n", "DEBUG:root:Loaded 500000\n", "INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz\n", "DEBUG:root:Loaded 550000\n", "DEBUG:root:Loaded 600000\n", "DEBUG:root:Loaded 650000\n", "DEBUG:root:Loaded 700000\n", "DEBUG:root:Loaded 750000\n", "DEBUG:root:Loaded 800000\n" ] }, { "data": { "text/plain": [ "tweet_id 817136\n", "user_id 817136\n", "screen_name 817136\n", "tweet_created_at 817136\n", "tweet_type 817136\n", "dtype: int64" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%matplotlib inline\n", "import pandas as pd\n", "import numpy as np\n", "import logging\n", "from dateutil.parser import parse as date_parse\n", "from utils import load_tweet_df, tweet_type\n", "import matplotlib.pyplot as plt\n", "\n", "\n", "logger = logging.getLogger()\n", "logger.setLevel(logging.DEBUG)\n", "\n", "# Set float format so doesn't display scientific notation\n", "pd.options.display.float_format = '{:20,.2f}'.format\n", "\n", "def tweet_transform(tweet):\n", " return {\n", " 'tweet_id': tweet['id_str'], \n", " 'tweet_created_at': date_parse(tweet['created_at']),\n", " 'user_id': tweet['user']['id_str'],\n", " 
'screen_name': tweet['user']['screen_name'],\n", " 'tweet_type': tweet_type(tweet)\n", " }\n", "\n", "tweet_df = load_tweet_df(tweet_transform, ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'tweet_type'], dedupe_columns=['tweet_id'])\n", "tweet_df.count()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_iduser_idscreen_nametweet_created_attweet_type
0872631046088601600327862439jonathanvswan2017-06-08 01:47:08+00:00retweet
1872610483647516673327862439jonathanvswan2017-06-08 00:25:26+00:00retweet
2872609618626826240327862439jonathanvswan2017-06-08 00:22:00+00:00retweet
3872605974699311104327862439jonathanvswan2017-06-08 00:07:31+00:00retweet
4872603191518646276327862439jonathanvswan2017-06-07 23:56:27+00:00retweet
\n", "
" ], "text/plain": [ " tweet_id user_id screen_name tweet_created_at \\\n", "0 872631046088601600 327862439 jonathanvswan 2017-06-08 01:47:08+00:00 \n", "1 872610483647516673 327862439 jonathanvswan 2017-06-08 00:25:26+00:00 \n", "2 872609618626826240 327862439 jonathanvswan 2017-06-08 00:22:00+00:00 \n", "3 872605974699311104 327862439 jonathanvswan 2017-06-08 00:07:31+00:00 \n", "4 872603191518646276 327862439 jonathanvswan 2017-06-07 23:56:27+00:00 \n", "\n", " tweet_type \n", "0 retweet \n", "1 retweet \n", "2 retweet \n", "3 retweet \n", "4 retweet " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweet_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tweet analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### What are the first and last tweets in the dataset?" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2017-06-01 04:00:01+0000', tz='UTC')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweet_df.tweet_created_at.min()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2017-08-01 03:59:58+0000', tz='UTC')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweet_df.tweet_created_at.max()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### How many retweets, original tweets, replies, and quotes are in dataset?" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentage
retweet34526642.3%
original23392628.6%
reply12625415.5%
quote11169013.7%
\n", "
" ], "text/plain": [ " count percentage\n", "retweet 345266 42.3%\n", "original 233926 28.6%\n", "reply 126254 15.5%\n", "quote 111690 13.7%" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({'count':tweet_df.tweet_type.value_counts(), \n", " 'percentage':tweet_df.tweet_type.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tweeter data prep" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This comes from the following sources:\n", "1. User lookup: These are lists of users exported from SFM. These are the final set of beltway journalists. Accounts that were suspended or deleted have been removed from this list. Also, this list will include users that did not tweet (i.e., have no tweets in dataset).\n", "2. Tweets in the dataset: Used to generate tweet counts per tweeter. However, since some beltway journalists may not have tweeted, this may be a subset of the user lookup. Also, it may include the tweets of some users that were later excluded because their accounts were suspended or deleted or determined to not be beltway journalists.\n", "3. User info lookup: Information on users that was manually coded in the beltway journalist spreadsheet or looked up from Twitter's API. This includes some accounts that were excluded from data collection for various reasons such as working for a foreign news organization or no longer working as a beltway journalist. Thus, these are a superset of the user lookup.\n", "\n", "Thus, the tweeter data should include tweet and user info data only from users in the user lookup." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load user lookup" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "screen_name 2487\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_lookup_filepaths = ('lookups/senate_press_lookup.csv',\n", " 'lookups/periodical_press_lookup.csv',\n", " 'lookups/radio_and_television_lookup.csv')\n", "user_lookup_df = pd.concat((pd.read_csv(user_lookup_filepath, usecols=['Uid', 'Token'], dtype={'Uid': str}) for user_lookup_filepath in user_lookup_filepaths))\n", "user_lookup_df.set_index('Uid', inplace=True)\n", "user_lookup_df.rename(columns={'Token': 'screen_name'}, inplace=True)\n", "user_lookup_df.index.names = ['user_id']\n", "# Some users may be in multiple lists, so need to drop duplicates\n", "user_lookup_df = user_lookup_df[~user_lookup_df.index.duplicated()]\n", "\n", "user_lookup_df.count()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_name
user_id
23455653abettel
33919343AshleyRParker
18580432b_fung
399225358b_muzz
18834692becca_milfeld
\n", "
" ], "text/plain": [ " screen_name\n", "user_id \n", "23455653 abettel\n", "33919343 AshleyRParker\n", "18580432 b_fung\n", "399225358 b_muzz\n", "18834692 becca_milfeld" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_lookup_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tweets in dataset per tweeter" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tweet_type\n", "original 2292\n", "quote 2292\n", "reply 2292\n", "retweet 2292\n", "tweets_in_dataset 2292\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_tweet_count_df = tweet_df[['user_id', 'tweet_type']].groupby(['user_id', 'tweet_type']).size().unstack()\n", "user_tweet_count_df.fillna(0, inplace=True)\n", "user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df.original + user_tweet_count_df.quote + user_tweet_count_df.reply + user_tweet_count_df.retweet\n", "user_tweet_count_df.count()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_typeoriginalquotereplyretweettweets_in_dataset
user_id
100199186513.003.001.0031.0048.00
100222986248.0020.003.00118.00189.00
1002700541.000.000.000.001.00
1008020894.007.0012.0017.0040.00
100860790102.0026.004.00166.00298.00
\n", "
" ], "text/plain": [ "tweet_type original quote reply \\\n", "user_id \n", "1001991865 13.00 3.00 1.00 \n", "1002229862 48.00 20.00 3.00 \n", "100270054 1.00 0.00 0.00 \n", "100802089 4.00 7.00 12.00 \n", "100860790 102.00 26.00 4.00 \n", "\n", "tweet_type retweet tweets_in_dataset \n", "user_id \n", "1001991865 31.00 48.00 \n", "1002229862 118.00 189.00 \n", "100270054 0.00 1.00 \n", "100802089 17.00 40.00 \n", "100860790 166.00 298.00 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_tweet_count_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load user info" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name 2506\n", "organization 2477\n", "position 2503\n", "gender 2505\n", "followers_count 2506\n", "following_count 2506\n", "tweet_count 2506\n", "user_created_at 2506\n", "verified 2506\n", "protected 2506\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_info_df = pd.read_csv('source_data/user_info_lookup.csv', names=['user_id', 'name', 'organization', 'position',\n", " 'gender', 'followers_count', 'following_count', 'tweet_count',\n", " 'user_created_at', 'verified', 'protected'],\n", " dtype={'user_id': str}).set_index(['user_id'])\n", "user_info_df.count()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameorganizationpositiongenderfollowers_countfollowing_counttweet_countuser_created_atverifiedprotected
user_id
20711445Glinski, NinaNaNFreelance ReporterF963507909Thu Feb 12 20:00:53 +0000 2009FalseFalse
258917371Enders, DavidNaNJournalistM14444846296Mon Feb 28 19:52:03 +0000 2011TrueFalse
297046834Barakat, MatthewAssociated PressNorthern Virginia CorrespondentM759352631Wed May 11 20:55:24 +0000 2011TrueFalse
455585786Atkins, KimberlyBoston HeraldChief Washington Reporter/ColumnistF294426916277Thu Jan 05 08:26:46 +0000 2012TrueFalse
42584840Vlahou, ToulaCQ Roll CallEditor & Podcast ProducerF27032016366Tue May 26 07:41:38 +0000 2009FalseFalse
\n", "
" ], "text/plain": [ " name organization \\\n", "user_id \n", "20711445 Glinski, Nina NaN \n", "258917371 Enders, David NaN \n", "297046834 Barakat, Matthew Associated Press \n", "455585786 Atkins, Kimberly Boston Herald \n", "42584840 Vlahou, Toula CQ Roll Call \n", "\n", " position gender followers_count \\\n", "user_id \n", "20711445 Freelance Reporter F 963 \n", "258917371 Journalist M 1444 \n", "297046834 Northern Virginia Correspondent M 759 \n", "455585786 Chief Washington Reporter/Columnist F 2944 \n", "42584840 Editor & Podcast Producer F 2703 \n", "\n", " following_count tweet_count user_created_at \\\n", "user_id \n", "20711445 507 909 Thu Feb 12 20:00:53 +0000 2009 \n", "258917371 484 6296 Mon Feb 28 19:52:03 +0000 2011 \n", "297046834 352 631 Wed May 11 20:55:24 +0000 2011 \n", "455585786 2691 6277 Thu Jan 05 08:26:46 +0000 2012 \n", "42584840 201 6366 Tue May 26 07:41:38 +0000 2009 \n", "\n", " verified protected \n", "user_id \n", "20711445 False False \n", "258917371 True False \n", "297046834 True False \n", "455585786 True False \n", "42584840 False False " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_info_df.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "screen_name 2487\n", "name 2487\n", "organization 2487\n", "position 2484\n", "gender 2486\n", "followers_count 2487\n", "following_count 2487\n", "tweet_count 2487\n", "user_created_at 2487\n", "verified 2487\n", "protected 2487\n", "original 2487\n", "quote 2487\n", "reply 2487\n", "retweet 2487\n", "tweets_in_dataset 2487\n", "dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_summary_df = user_lookup_df.join((user_info_df, user_tweet_count_df), how='left')\n", "# Fill Nans\n", "user_summary_df['organization'].fillna('', inplace=True)\n", "user_summary_df['original'].fillna(0, inplace=True)\n", 
"user_summary_df['quote'].fillna(0, inplace=True)\n", "user_summary_df['reply'].fillna(0, inplace=True)\n", "user_summary_df['retweet'].fillna(0, inplace=True)\n", "user_summary_df['tweets_in_dataset'].fillna(0, inplace=True)\n", "user_summary_df.count()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationpositiongenderfollowers_countfollowing_counttweet_countuser_created_atverifiedprotectedoriginalquotereplyretweettweets_in_dataset
user_id
23455653abettelBettelheim, AdrielPoliticoHealth Care EditorF2664105515990Mon Mar 09 16:32:20 +0000 2009TrueFalse289.0012.006.0052.00359.00
33919343AshleyRParkerParker, AshleyWashington PostWhite House ReporterF122382234212433Tue Apr 21 14:28:57 +0000 2009TrueFalse172.0067.0011.00120.00370.00
18580432b_fungFung, BrianWashington PostTech ReporterM16558206244799Sat Jan 03 15:15:57 +0000 2009TrueFalse257.0085.00205.0082.00629.00
399225358b_muzzMurray, BrendanBloomberg NewsManaging Editor, U.S. EconomyM624382360Thu Oct 27 05:34:05 +0000 2011TrueFalse3.000.000.005.008.00
18834692becca_milfeldMilfeld, BeccaAgence France-PresseEnglish Desk Editor and JournalistF4839931484Sat Jan 10 13:58:43 +0000 2009FalseFalse3.0014.000.007.0024.00
\n", "
" ], "text/plain": [ " screen_name name organization \\\n", "user_id \n", "23455653 abettel Bettelheim, Adriel Politico \n", "33919343 AshleyRParker Parker, Ashley Washington Post \n", "18580432 b_fung Fung, Brian Washington Post \n", "399225358 b_muzz Murray, Brendan Bloomberg News \n", "18834692 becca_milfeld Milfeld, Becca Agence France-Presse \n", "\n", " position gender followers_count \\\n", "user_id \n", "23455653 Health Care Editor F 2664 \n", "33919343 White House Reporter F 122382 \n", "18580432 Tech Reporter M 16558 \n", "399225358 Managing Editor, U.S. Economy M 624 \n", "18834692 English Desk Editor and Journalist F 483 \n", "\n", " following_count tweet_count user_created_at \\\n", "user_id \n", "23455653 1055 15990 Mon Mar 09 16:32:20 +0000 2009 \n", "33919343 2342 12433 Tue Apr 21 14:28:57 +0000 2009 \n", "18580432 2062 44799 Sat Jan 03 15:15:57 +0000 2009 \n", "399225358 382 360 Thu Oct 27 05:34:05 +0000 2011 \n", "18834692 993 1484 Sat Jan 10 13:58:43 +0000 2009 \n", "\n", " verified protected original quote \\\n", "user_id \n", "23455653 True False 289.00 12.00 \n", "33919343 True False 172.00 67.00 \n", "18580432 True False 257.00 85.00 \n", "399225358 True False 3.00 0.00 \n", "18834692 False False 3.00 14.00 \n", "\n", " reply retweet tweets_in_dataset \n", "user_id \n", "23455653 6.00 52.00 359.00 \n", "33919343 11.00 120.00 370.00 \n", "18580432 205.00 82.00 629.00 \n", "399225358 0.00 5.00 8.00 \n", "18834692 0.00 7.00 24.00 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_summary_df.head()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Remove users with no tweets in dataset" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "screen_name 195\n", "name 195\n", "organization 195\n", "position 195\n", "gender 194\n", "followers_count 195\n", "following_count 195\n", "tweet_count 195\n", 
"user_created_at 195\n", "verified 195\n", "protected 195\n", "original 195\n", "quote 195\n", "reply 195\n", "retweet 195\n", "tweets_in_dataset 195\n", "dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_summary_df[user_summary_df.tweets_in_dataset == 0].count()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "screen_name 2292\n", "name 2292\n", "organization 2292\n", "position 2289\n", "gender 2292\n", "followers_count 2292\n", "following_count 2292\n", "tweet_count 2292\n", "user_created_at 2292\n", "verified 2292\n", "protected 2292\n", "original 2292\n", "quote 2292\n", "reply 2292\n", "retweet 2292\n", "tweets_in_dataset 2292\n", "dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_summary_df = user_summary_df[user_summary_df.tweets_in_dataset != 0]\n", "user_summary_df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tweeter analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### How many of the journalists are male / female?" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentage
M129956.7%
F99343.3%
\n", "
" ], "text/plain": [ " count percentage\n", "M 1299 56.7%\n", "F 993 43.3%" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalist_gender_summary_df = pd.DataFrame({'count':user_summary_df.gender.value_counts(), 'percentage':user_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})\n", "journalist_gender_summary_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summary\n", "\n", "* 25%, 50%, 75% are the percentiles. (Min is equivalent to 0%. Max is equivalent to 100%. 50% is the median.)\n", "* std is standard deviation, normalized by N-1." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### All" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
followers_countfollowing_counttweet_countoriginalquotereplyretweettweets_in_dataset
count2,292.002,292.002,292.002,292.002,292.002,292.002,292.002,292.00
mean16,467.621,444.839,619.69102.0648.7355.08150.64356.52
std91,886.903,003.0016,618.09169.43135.90249.18585.08833.76
min6.000.001.000.000.000.000.001.00
25%831.75505.751,449.5010.001.001.008.0032.00
50%2,419.50998.504,211.5041.009.005.0039.00122.00
75%7,348.751,713.5010,817.25124.2543.0030.00129.00375.00
max2,176,578.0096,194.00208,763.002,693.003,069.009,033.0021,524.0021,547.00
\n", "
" ], "text/plain": [ " followers_count following_count tweet_count \\\n", "count 2,292.00 2,292.00 2,292.00 \n", "mean 16,467.62 1,444.83 9,619.69 \n", "std 91,886.90 3,003.00 16,618.09 \n", "min 6.00 0.00 1.00 \n", "25% 831.75 505.75 1,449.50 \n", "50% 2,419.50 998.50 4,211.50 \n", "75% 7,348.75 1,713.50 10,817.25 \n", "max 2,176,578.00 96,194.00 208,763.00 \n", "\n", " original quote reply \\\n", "count 2,292.00 2,292.00 2,292.00 \n", "mean 102.06 48.73 55.08 \n", "std 169.43 135.90 249.18 \n", "min 0.00 0.00 0.00 \n", "25% 10.00 1.00 1.00 \n", "50% 41.00 9.00 5.00 \n", "75% 124.25 43.00 30.00 \n", "max 2,693.00 3,069.00 9,033.00 \n", "\n", " retweet tweets_in_dataset \n", "count 2,292.00 2,292.00 \n", "mean 150.64 356.52 \n", "std 585.08 833.76 \n", "min 0.00 1.00 \n", "25% 8.00 32.00 \n", "50% 39.00 122.00 \n", "75% 129.00 375.00 \n", "max 21,524.00 21,547.00 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_summary_df[['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Female" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
followers_countfollowing_counttweet_countoriginalquotereplyretweettweets_in_dataset
count993.00993.00993.00993.00993.00993.00993.00993.00
mean11,609.531,314.077,498.7483.8439.2732.06135.55290.72
std65,563.721,250.5611,312.72124.86135.0594.73724.92833.07
min6.001.001.000.000.000.000.001.00
25%825.00567.001,393.008.001.001.009.0032.00
50%2,327.001,034.004,055.0039.009.004.0037.00111.00
75%6,340.001,659.008,983.00111.0033.0021.00115.00314.00
max1,388,543.0018,197.00118,713.001,440.003,069.001,458.0021,524.0021,547.00
\n", "
" ], "text/plain": [ " followers_count following_count tweet_count \\\n", "count 993.00 993.00 993.00 \n", "mean 11,609.53 1,314.07 7,498.74 \n", "std 65,563.72 1,250.56 11,312.72 \n", "min 6.00 1.00 1.00 \n", "25% 825.00 567.00 1,393.00 \n", "50% 2,327.00 1,034.00 4,055.00 \n", "75% 6,340.00 1,659.00 8,983.00 \n", "max 1,388,543.00 18,197.00 118,713.00 \n", "\n", " original quote reply \\\n", "count 993.00 993.00 993.00 \n", "mean 83.84 39.27 32.06 \n", "std 124.86 135.05 94.73 \n", "min 0.00 0.00 0.00 \n", "25% 8.00 1.00 1.00 \n", "50% 39.00 9.00 4.00 \n", "75% 111.00 33.00 21.00 \n", "max 1,440.00 3,069.00 1,458.00 \n", "\n", " retweet tweets_in_dataset \n", "count 993.00 993.00 \n", "mean 135.55 290.72 \n", "std 724.92 833.07 \n", "min 0.00 1.00 \n", "25% 9.00 32.00 \n", "50% 37.00 111.00 \n", "75% 115.00 314.00 \n", "max 21,524.00 21,547.00 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_summary_df[user_summary_df.gender == 'F'][['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Male" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
followers_countfollowing_counttweet_countoriginalquotereplyretweettweets_in_dataset
count1,299.001,299.001,299.001,299.001,299.001,299.001,299.001,299.00
mean20,181.311,544.7811,241.02115.9955.9672.69162.17406.81
std107,635.373,833.8919,584.46195.72136.16319.41449.75831.10
min10.000.005.000.000.000.000.001.00
25%857.50472.001,477.0012.000.001.006.0033.00
50%2,498.00953.004,401.0044.009.006.0040.00131.00
75%8,341.501,763.0012,584.50140.0050.5038.50142.00428.00
max2,176,578.0096,194.00208,763.002,693.001,955.009,033.007,528.0011,432.00
\n", "
" ], "text/plain": [ " followers_count following_count tweet_count \\\n", "count 1,299.00 1,299.00 1,299.00 \n", "mean 20,181.31 1,544.78 11,241.02 \n", "std 107,635.37 3,833.89 19,584.46 \n", "min 10.00 0.00 5.00 \n", "25% 857.50 472.00 1,477.00 \n", "50% 2,498.00 953.00 4,401.00 \n", "75% 8,341.50 1,763.00 12,584.50 \n", "max 2,176,578.00 96,194.00 208,763.00 \n", "\n", " original quote reply \\\n", "count 1,299.00 1,299.00 1,299.00 \n", "mean 115.99 55.96 72.69 \n", "std 195.72 136.16 319.41 \n", "min 0.00 0.00 0.00 \n", "25% 12.00 0.00 1.00 \n", "50% 44.00 9.00 6.00 \n", "75% 140.00 50.50 38.50 \n", "max 2,693.00 1,955.00 9,033.00 \n", "\n", " retweet tweets_in_dataset \n", "count 1,299.00 1,299.00 \n", "mean 162.17 406.81 \n", "std 449.75 831.10 \n", "min 0.00 1.00 \n", "25% 6.00 33.00 \n", "50% 40.00 131.00 \n", "75% 142.00 428.00 \n", "max 7,528.00 11,432.00 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_summary_df[user_summary_df.gender == 'M'][['followers_count', 'following_count', 'tweet_count', 'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Verified" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of all journalists, how many are verified?" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentage
True124054.1%
False105245.9%
\n", "
" ], "text/plain": [ " count percentage\n", "True 1240 54.1%\n", "False 1052 45.9%" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({'count':user_summary_df.verified.value_counts(), 'percentage':user_summary_df.verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of female journalists, how many are verified?" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentage
True51251.6%
False48148.4%
\n", "
" ], "text/plain": [ " count percentage\n", "True 512 51.6%\n", "False 481 48.4%" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({'count':user_summary_df[user_summary_df.gender == 'F'].verified.value_counts(), 'percentage':user_summary_df[user_summary_df.gender == 'F'].verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of male journalists, how many are verified?" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentage
True72856.0%
False57144.0%
\n", "
" ], "text/plain": [ " count percentage\n", "True 728 56.0%\n", "False 571 44.0%" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({'count':user_summary_df[user_summary_df.gender == 'M'].verified.value_counts(), 'percentage':user_summary_df[user_summary_df.gender == 'M'].verified.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mention data prep" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load mentions from tweets\n", "Including original tweets only" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz\n", "DEBUG:root:Loaded 50000\n", "DEBUG:root:Loaded 100000\n", "DEBUG:root:Loaded 150000\n", "DEBUG:root:Loaded 200000\n", "DEBUG:root:Loaded 250000\n", "INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz\n", "DEBUG:root:Loaded 300000\n", "DEBUG:root:Loaded 350000\n", "DEBUG:root:Loaded 400000\n", "DEBUG:root:Loaded 450000\n", "DEBUG:root:Loaded 500000\n", "INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz\n", "DEBUG:root:Loaded 550000\n", "DEBUG:root:Loaded 600000\n", "DEBUG:root:Loaded 650000\n", "DEBUG:root:Loaded 700000\n", "DEBUG:root:Loaded 750000\n", "DEBUG:root:Loaded 800000\n" ] }, { "data": { "text/plain": [ "tweet_id 118210\n", "user_id 118210\n", "screen_name 118210\n", "mention_user_id 118210\n", "mention_screen_name 118210\n", "tweet_created_at 118210\n", "dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%matplotlib inline\n", "import pandas as pd\n", "import numpy as np\n", "import logging\n", "from dateutil.parser import parse as date_parse\n", "from utils import load_tweet_df, tweet_type\n", "import matplotlib.pyplot as plt\n", 
"\n", "\n", "logger = logging.getLogger()\n", "logger.setLevel(logging.DEBUG)\n", "\n", "# Set float format so doesn't display scientific notation\n", "pd.options.display.float_format = '{:20,.2f}'.format\n", "\n", "# Simply the tweet on load\n", "def mention_transform(tweet):\n", " mentions = []\n", " if tweet_type(tweet) == 'original':\n", " for mention in tweet.get('entities', {}).get('user_mentions', []):\n", " mentions.append({\n", " 'tweet_id': tweet['id_str'],\n", " 'user_id': tweet['user']['id_str'],\n", " 'screen_name': tweet['user']['screen_name'],\n", " 'mention_user_id': mention['id_str'],\n", " 'mention_screen_name': mention['screen_name'],\n", " 'tweet_created_at': date_parse(tweet['created_at'])\n", " })\n", " return mentions\n", "\n", "base_mention_df = load_tweet_df(mention_transform, ['tweet_id', 'user_id', 'screen_name', 'mention_user_id',\n", " 'mention_screen_name', 'tweet_created_at'], \n", " dedupe_columns=['tweet_id', 'mention_user_id'])\n", "base_mention_df.count()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_iduser_idscreen_namemention_user_idmention_screen_nametweet_created_at
0872522339962978307327862439jonathanvswan800707492346925056axios2017-06-07 18:35:11+00:00
1872484939530461184327862439jonathanvswan17494010SenSchumer2017-06-07 16:06:34+00:00
2872475140575170562327862439jonathanvswan2836421MSNBC2017-06-07 15:27:37+00:00
3872475140575170562327862439jonathanvswan800707492346925056axios2017-06-07 15:27:37+00:00
4872459457946673154327862439jonathanvswan800707492346925056axios2017-06-07 14:25:18+00:00
\n", "
" ], "text/plain": [ " tweet_id user_id screen_name mention_user_id \\\n", "0 872522339962978307 327862439 jonathanvswan 800707492346925056 \n", "1 872484939530461184 327862439 jonathanvswan 17494010 \n", "2 872475140575170562 327862439 jonathanvswan 2836421 \n", "3 872475140575170562 327862439 jonathanvswan 800707492346925056 \n", "4 872459457946673154 327862439 jonathanvswan 800707492346925056 \n", "\n", " mention_screen_name tweet_created_at \n", "0 axios 2017-06-07 18:35:11+00:00 \n", "1 SenSchumer 2017-06-07 16:06:34+00:00 \n", "2 MSNBC 2017-06-07 15:27:37+00:00 \n", "3 axios 2017-06-07 15:27:37+00:00 \n", "4 axios 2017-06-07 14:25:18+00:00 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_mention_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add gender of mentioner" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tweet_id 118210\n", "user_id 118210\n", "screen_name 118210\n", "mention_user_id 118210\n", "mention_screen_name 118210\n", "tweet_created_at 118210\n", "gender 118210\n", "dtype: int64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mention_df = base_mention_df.join(user_summary_df['gender'], on='user_id')\n", "mention_df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### How many tweets have mentions?" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "84942" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mention_df['tweet_id'].unique().size" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### How many users are mentioned? 
(All users, not just journalists)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "17730" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mention_df['mention_user_id'].unique().size" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Limit to mentions of journalists" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tweet_id 14298\n", "user_id 14298\n", "screen_name 14298\n", "mention_user_id 14298\n", "mention_screen_name 14298\n", "tweet_created_at 14298\n", "gender 14298\n", "mention_gender 14298\n", "dtype: int64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_mention_df = mention_df.join(user_summary_df['gender'], how='inner', on='mention_user_id', rsuffix='_mention')\n", "journalists_mention_df.rename(columns = {'gender_mention': 'mention_gender'}, inplace=True)\n", "journalists_mention_df.count()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_iduser_idscreen_namemention_user_idmention_screen_nametweet_created_atgendermention_gender
16870408075878027268327862439jonathanvswan16031927greta2017-06-01 22:33:51+00:00MF
28387258144986154189319847765sahilkapur16031927greta2017-06-07 22:30:04+00:00MF
220287257805591037132821252618JakeSherman16031927greta2017-06-07 22:16:34+00:00MF
1597788084106924362956870511174Hadas_Gold16031927greta2017-06-30 17:30:50+00:00FF
1725888018395201888666190077282politicoalex16031927greta2017-06-28 21:59:41+00:00MF
\n", "
" ], "text/plain": [ " tweet_id user_id screen_name mention_user_id \\\n", "16 870408075878027268 327862439 jonathanvswan 16031927 \n", "283 872581449861541893 19847765 sahilkapur 16031927 \n", "2202 872578055910371328 21252618 JakeSherman 16031927 \n", "15977 880841069243629568 70511174 Hadas_Gold 16031927 \n", "17258 880183952018886661 90077282 politicoalex 16031927 \n", "\n", " mention_screen_name tweet_created_at gender mention_gender \n", "16 greta 2017-06-01 22:33:51+00:00 M F \n", "283 greta 2017-06-07 22:30:04+00:00 M F \n", "2202 greta 2017-06-07 22:16:34+00:00 M F \n", "15977 greta 2017-06-30 17:30:50+00:00 F F \n", "17258 greta 2017-06-28 21:59:41+00:00 M F " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_mention_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Functions for summarizing mentions by beltway journalists" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# Gender of beltway journalists mentioned by beltway journalists\n", "def journalist_mention_gender_summary(mention_df):\n", " gender_summary_df = pd.DataFrame({'count': mention_df.mention_gender.value_counts(), \n", " 'percentage': mention_df.mention_gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})\n", " gender_summary_df.reset_index(inplace=True)\n", " gender_summary_df['avg_mentions'] = gender_summary_df.apply(lambda row: row['count'] / journalist_gender_summary_df.loc[row['index']]['count'], axis=1) \n", " gender_summary_df.set_index('index', inplace=True, drop=True)\n", " return gender_summary_df\n", "\n", "def journalist_mention_summary(mention_df):\n", " # Mention count\n", " mention_count_df = pd.DataFrame(mention_df.mention_user_id.value_counts().rename('mention_count'))\n", "\n", " # Mentioning users. 
That is, the number of unique users mentioning each user.\n", " mention_user_id_per_user_df = mention_df[['mention_user_id', 'user_id']].drop_duplicates()\n", " mentioning_user_count_df = pd.DataFrame(mention_user_id_per_user_df.groupby('mention_user_id').size(), columns=['mentioning_count'])\n", " mentioning_user_count_df.index.name = 'user_id'\n", "\n", " # Join with user summary\n", " journalist_mention_summary_df = user_summary_df.join([mention_count_df, mentioning_user_count_df])\n", " journalist_mention_summary_df.fillna(0, inplace=True)\n", " journalist_mention_summary_df = journalist_mention_summary_df.sort_values(['mention_count', 'mentioning_count', 'followers_count'], ascending=False)\n", " return journalist_mention_summary_df\n", "\n", "# Gender of top journalists mentioned by beltway journalists\n", "def top_journalist_mention_gender_summary(mention_summary_df, mentioning_count_threshold=0, head=100):\n", " top_mention_summary_df = mention_summary_df[mention_summary_df.mentioning_count > mentioning_count_threshold].head(head)\n", " return pd.DataFrame({'count': top_mention_summary_df.gender.value_counts(), \n", " 'percentage': top_mention_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})\n", "\n", "\n", "# Fields for displaying journalist mention summaries\n", "journalist_mention_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'mention_count', 'mentioning_count']\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mentioned analysis\n", "*Note that for each of these, the complete list is being written to CSV in the output directory.*\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Original tweets (since mentions are extracted from original tweets)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of the original tweets, how many were posted by male journalists / female journalists?" 
] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
originalpercentageavg_original
gender
F83,251.0035.6%83.84
M150,675.0064.4%115.99
\n", "
" ], "text/plain": [ " original percentage avg_original\n", "gender \n", "F 83,251.00 35.6% 83.84\n", "M 150,675.00 64.4% 115.99" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "original_tweets_by_gender_df = user_summary_df[['gender', 'original']].groupby('gender').sum()\n", "original_tweets_by_gender_df['percentage'] = original_tweets_by_gender_df.original.div(user_summary_df.original.sum()).mul(100).round(1).astype(str) + '%'\n", "original_tweets_by_gender_df.reset_index(inplace=True)\n", "original_tweets_by_gender_df['avg_original'] = original_tweets_by_gender_df.apply(lambda row: row['original'] / journalist_gender_summary_df.loc[row['gender']]['count'], axis=1)\n", "original_tweets_by_gender_df.set_index('gender', inplace=True, drop=True)\n", "original_tweets_by_gender_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Who posted the most original tweets?" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_counttweet_countoriginaltweets_in_dataset
user_id
16187637ChadPergramPergram, ChadFox NewsM59305614612,693.002,693.00
31127446markknollerKnoller, MarkCBS NewsM3014741151321,858.002,089.00
16459325ryanbeckwithBeckwith, Ryan TeagueTime MagazineM20947922031,534.005,187.00
19580890LeeCampCamp, LeeRTTV AmericaM67601520511,517.003,708.00
18825339CahnEmilyCahn, EmilyMicF169801008031,440.008,196.00
593813785DonnaYoungDCYoung, DonnaS&P Global Market IntelligenceF5894499671,332.004,414.00
14529929jaketapperTapper, JakeCNNM13056801481431,316.005,078.00
21316253ZekeJMillerMiller, Zeke J.Time MagazineM1985171611481,271.002,106.00
36246939malbertnewsAlbert, MarkThe Voyage ReportM3575282301,078.001,151.00
117467779palbergoAlbergo, Paul F.Bloomberg BNAM1191180831,043.001,236.00
102171691rlocker12Locker, RayUSA TodayM3665411941,038.002,496.00
15486163SimonMarksFSNMarks, SimonFeature Story NewsM776741541984.003,432.00
275207082AlexParkerDCParker, Alexander M.Bloomberg BNAM3828142150972.003,983.00
190360266connorobrienNHO’Brien, ConnorPoliticoM615817242954.001,944.00
16031927gretaVan Susteren, GretaMSNBCF1186850116645907.004,792.00
300497193tackettdcTackett, R. MichaelNew York TimesM1685738620896.001,041.00
191964162SamLitzingerLitzinger, SamCBS NewsM232995236891.007,537.00
118130765dylanlscottScott, Dylan L.Stat NewsM2012242497885.003,960.00
3817401ericgellerGeller, EricPoliticoM58173208763871.0011,432.00
259395895JohnJHarwoodHarwood, JohnCNBCM14904078015846.006,377.00
27882000jamiedupreeDupree, JamieCox BroadcastingM14084846181841.002,108.00
407013776burgessevEverett, John B.PoliticoM3101027294836.001,673.00
104299137DavidMDruckerDrucker, DavidWashington ExaminerM35033104613824.004,907.00
63149389hbwxBernstein, HowardWUSA–TVM833748025822.001,604.00
13262862HowardMortmanMortman, HowardC–SPANM621138406819.001,289.00
\n", "
" ], "text/plain": [ " screen_name name \\\n", "user_id \n", "16187637 ChadPergram Pergram, Chad \n", "31127446 markknoller Knoller, Mark \n", "16459325 ryanbeckwith Beckwith, Ryan Teague \n", "19580890 LeeCamp Camp, Lee \n", "18825339 CahnEmily Cahn, Emily \n", "593813785 DonnaYoungDC Young, Donna \n", "14529929 jaketapper Tapper, Jake \n", "21316253 ZekeJMiller Miller, Zeke J. \n", "36246939 malbertnews Albert, Mark \n", "117467779 palbergo Albergo, Paul F. \n", "102171691 rlocker12 Locker, Ray \n", "15486163 SimonMarksFSN Marks, Simon \n", "275207082 AlexParkerDC Parker, Alexander M. \n", "190360266 connorobrienNH O’Brien, Connor \n", "16031927 greta Van Susteren, Greta \n", "300497193 tackettdc Tackett, R. Michael \n", "191964162 SamLitzinger Litzinger, Sam \n", "118130765 dylanlscott Scott, Dylan L. \n", "3817401 ericgeller Geller, Eric \n", "259395895 JohnJHarwood Harwood, John \n", "27882000 jamiedupree Dupree, Jamie \n", "407013776 burgessev Everett, John B. \n", "104299137 DavidMDrucker Drucker, David \n", "63149389 hbwx Bernstein, Howard \n", "13262862 HowardMortman Mortman, Howard \n", "\n", " organization gender followers_count \\\n", "user_id \n", "16187637 Fox News M 59305 \n", "31127446 CBS News M 301474 \n", "16459325 Time Magazine M 20947 \n", "19580890 RTTV America M 67601 \n", "18825339 Mic F 16980 \n", "593813785 S&P Global Market Intelligence F 5894 \n", "14529929 CNN M 1305680 \n", "21316253 Time Magazine M 198517 \n", "36246939 The Voyage Report M 3575 \n", "117467779 Bloomberg BNA M 1191 \n", "102171691 USA Today M 3665 \n", "15486163 Feature Story News M 7767 \n", "275207082 Bloomberg BNA M 3828 \n", "190360266 Politico M 6158 \n", "16031927 MSNBC F 1186850 \n", "300497193 New York Times M 16857 \n", "191964162 CBS News M 2329 \n", "118130765 Stat News M 20122 \n", "3817401 Politico M 58173 \n", "259395895 CNBC M 149040 \n", "27882000 Cox Broadcasting M 140848 \n", "407013776 Politico M 31010 \n", "104299137 Washington Examiner M 35033 \n", 
"63149389 WUSA–TV M 8337 \n", "13262862 C–SPAN M 6211 \n", "\n", " tweet_count original tweets_in_dataset \n", "user_id \n", "16187637 61461 2,693.00 2,693.00 \n", "31127446 115132 1,858.00 2,089.00 \n", "16459325 92203 1,534.00 5,187.00 \n", "19580890 52051 1,517.00 3,708.00 \n", "18825339 100803 1,440.00 8,196.00 \n", "593813785 49967 1,332.00 4,414.00 \n", "14529929 148143 1,316.00 5,078.00 \n", "21316253 161148 1,271.00 2,106.00 \n", "36246939 28230 1,078.00 1,151.00 \n", "117467779 18083 1,043.00 1,236.00 \n", "102171691 41194 1,038.00 2,496.00 \n", "15486163 41541 984.00 3,432.00 \n", "275207082 142150 972.00 3,983.00 \n", "190360266 17242 954.00 1,944.00 \n", "16031927 116645 907.00 4,792.00 \n", "300497193 38620 896.00 1,041.00 \n", "191964162 95236 891.00 7,537.00 \n", "118130765 42497 885.00 3,960.00 \n", "3817401 208763 871.00 11,432.00 \n", "259395895 78015 846.00 6,377.00 \n", "27882000 46181 841.00 2,108.00 \n", "407013776 27294 836.00 1,673.00 \n", "104299137 104613 824.00 4,907.00 \n", "63149389 48025 822.00 1,604.00 \n", "13262862 38406 819.00 1,289.00 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_summary_df[['screen_name', 'name', 'organization', 'gender', 'followers_count', 'tweet_count', 'original', 'tweets_in_dataset']].sort_values(['original'], ascending=False).head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mentions of all accounts (not just journalists)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists mentioning accounts, which are mentioned the most?\n", "This is based on screen name, which could have changed during the collection period. However, for the users that would be at the top of this list, this seems unlikely." ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mention_countmentioning_count
realDonaldTrump2876452
POTUS2265253
wusa9211141
AP1948143
USATODAY1235105
nbcwashington123070
WSJ1227152
dcexaminer103453
SHSanders45927148
nytimes829289
BloombergBNA75945
politico747181
SpeakerRyan700181
Scaramucci657198
PressSec654178
CNN628186
ABC7News60424
SenJohnMcCain599231
WTOP52943
BloombergLaw51715
VP506140
SteveScalise505150
MSNBC48692
Reuters48384
bpolitics43269
\n", "
" ], "text/plain": [ " mention_count mentioning_count\n", "realDonaldTrump 2876 452\n", "POTUS 2265 253\n", "wusa9 2111 41\n", "AP 1948 143\n", "USATODAY 1235 105\n", "nbcwashington 1230 70\n", "WSJ 1227 152\n", "dcexaminer 1034 53\n", "SHSanders45 927 148\n", "nytimes 829 289\n", "BloombergBNA 759 45\n", "politico 747 181\n", "SpeakerRyan 700 181\n", "Scaramucci 657 198\n", "PressSec 654 178\n", "CNN 628 186\n", "ABC7News 604 24\n", "SenJohnMcCain 599 231\n", "WTOP 529 43\n", "BloombergLaw 517 15\n", "VP 506 140\n", "SteveScalise 505 150\n", "MSNBC 486 92\n", "Reuters 483 84\n", "bpolitics 432 69" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mention count\n", "mention_count_screen_name_df = pd.DataFrame(mention_df.mention_screen_name.value_counts().rename('mention_count'))\n", "\n", "# Count of mentioning users\n", "mention_user_id_per_user_screen_name_df = mention_df[['mention_screen_name', 'user_id']].drop_duplicates()\n", "mentioning_count_screen_name_df = pd.DataFrame(mention_user_id_per_user_screen_name_df.groupby('mention_screen_name').size(), columns=['mentioning_count'])\n", "mentioning_count_screen_name_df.index.name = 'screen_name'\n", "\n", "all_mentioned_df = mention_count_screen_name_df.join(mentioning_count_screen_name_df)\n", "all_mentioned_df.to_csv('output/all_mentioned_by_journalists.csv')\n", "all_mentioned_df.head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Same, but ordered by the number of journalists mentioning the account" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mention_countmentioning_count
realDonaldTrump2876452
nytimes829289
POTUS2265253
SenJohnMcCain599231
Scaramucci657198
CNN628186
politico747181
SpeakerRyan700181
PressSec654178
washingtonpost413154
WSJ1227152
SteveScalise505150
SHSanders45927148
AP1948143
VP506140
SenateMajLdr412120
DonaldJTrumpJr199110
RandPaul206107
USATODAY1235105
LindseyGrahamSC253105
SenSchumer26597
NancyPelosi26695
MSNBC48692
CNNPolitics32991
MarkWarner20489
\n", "
" ], "text/plain": [ " mention_count mentioning_count\n", "realDonaldTrump 2876 452\n", "nytimes 829 289\n", "POTUS 2265 253\n", "SenJohnMcCain 599 231\n", "Scaramucci 657 198\n", "CNN 628 186\n", "politico 747 181\n", "SpeakerRyan 700 181\n", "PressSec 654 178\n", "washingtonpost 413 154\n", "WSJ 1227 152\n", "SteveScalise 505 150\n", "SHSanders45 927 148\n", "AP 1948 143\n", "VP 506 140\n", "SenateMajLdr 412 120\n", "DonaldJTrumpJr 199 110\n", "RandPaul 206 107\n", "USATODAY 1235 105\n", "LindseyGrahamSC 253 105\n", "SenSchumer 265 97\n", "NancyPelosi 266 95\n", "MSNBC 486 92\n", "CNNPolitics 329 91\n", "MarkWarner 204 89" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_mentioned_df.sort_values(['mentioning_count', 'mention_count'], ascending=False).head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Journalists mentioning journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists mentioning journalists, who is mentioned the most?" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countmention_countmentioning_count
user_id
325050734AllysonRaeWxBanks, AllysonWUSA–TVF6918330.007.00
28496589TenaciousTopperShutt, CharlesWUSA–TVM15868239.0013.00
63149389hbwxBernstein, HowardWUSA–TVM8337235.0010.00
407013776burgessevEverett, John B.PoliticoM31010212.0046.00
16018516jenhabHaberkorn, Jennifer A.PoliticoF20028200.0031.00
19186003seungminkimKim, Seung MinPoliticoF33980143.0041.00
14529929jaketapperTapper, JakeCNNM1305680127.0051.00
169586280WaPoSeanSullivan, SeanWashington PostM22860117.0020.00
997684836pkcapitolKane, PaulWashington PostM31300116.0047.00
108617810DanaBashCNNBash, DanaCNNF281861115.0055.00
82151660kelsey_snellSnell, KelseWashington PostF8108109.0022.00
123327472peterbakernytBaker, PeterNew York TimesM96956107.0043.00
13524182daveweigelWeigel, DavidWashington PostM332344106.0042.00
46557945StevenTDennisDennis, Steven T.Bloomberg NewsM55762105.0027.00
15931637jonkarlKarl, JonathanABC NewsM183467104.0040.00
33919343AshleyRParkerParker, AshleyWashington PostF122382100.0031.00
9126752reporterjoeGould, Joseph M.Sightline Media GroupM470298.0016.00
39155029mkrajuRaju, Manu K.CNNM8836695.0043.00
52392666ZoeTillmanTillman, ZoeBuzzFeedF1524687.0014.00
16930125edatpostO’Keefe, EdwardWashington PostM5867084.0041.00
26632935HopeSeckHodge Seck, HopeMilitary.comF458483.003.00
48802204HardballChrisMatthews, ChrisNBC NewsM71833080.009.00
19107878GlennThrushThrush, Glenn H.New York TimesM30818178.0037.00
217550862BresPoliticoBresnahan, JohnPoliticoM4056278.0027.00
24439201jameshohmannHohmann, James P.Washington PostM3870878.0027.00
\n", "
" ], "text/plain": [ " screen_name name organization \\\n", "user_id \n", "325050734 AllysonRaeWx Banks, Allyson WUSA–TV \n", "28496589 TenaciousTopper Shutt, Charles WUSA–TV \n", "63149389 hbwx Bernstein, Howard WUSA–TV \n", "407013776 burgessev Everett, John B. Politico \n", "16018516 jenhab Haberkorn, Jennifer A. Politico \n", "19186003 seungminkim Kim, Seung Min Politico \n", "14529929 jaketapper Tapper, Jake CNN \n", "169586280 WaPoSean Sullivan, Sean Washington Post \n", "997684836 pkcapitol Kane, Paul Washington Post \n", "108617810 DanaBashCNN Bash, Dana CNN \n", "82151660 kelsey_snell Snell, Kelse Washington Post \n", "123327472 peterbakernyt Baker, Peter New York Times \n", "13524182 daveweigel Weigel, David Washington Post \n", "46557945 StevenTDennis Dennis, Steven T. Bloomberg News \n", "15931637 jonkarl Karl, Jonathan ABC News \n", "33919343 AshleyRParker Parker, Ashley Washington Post \n", "9126752 reporterjoe Gould, Joseph M. Sightline Media Group \n", "39155029 mkraju Raju, Manu K. CNN \n", "52392666 ZoeTillman Tillman, Zoe BuzzFeed \n", "16930125 edatpost O’Keefe, Edward Washington Post \n", "26632935 HopeSeck Hodge Seck, Hope Military.com \n", "48802204 HardballChris Matthews, Chris NBC News \n", "19107878 GlennThrush Thrush, Glenn H. New York Times \n", "217550862 BresPolitico Bresnahan, John Politico \n", "24439201 jameshohmann Hohmann, James P. 
Washington Post \n", "\n", " gender followers_count mention_count mentioning_count \n", "user_id \n", "325050734 F 6918 330.00 7.00 \n", "28496589 M 15868 239.00 13.00 \n", "63149389 M 8337 235.00 10.00 \n", "407013776 M 31010 212.00 46.00 \n", "16018516 F 20028 200.00 31.00 \n", "19186003 F 33980 143.00 41.00 \n", "14529929 M 1305680 127.00 51.00 \n", "169586280 M 22860 117.00 20.00 \n", "997684836 M 31300 116.00 47.00 \n", "108617810 F 281861 115.00 55.00 \n", "82151660 F 8108 109.00 22.00 \n", "123327472 M 96956 107.00 43.00 \n", "13524182 M 332344 106.00 42.00 \n", "46557945 M 55762 105.00 27.00 \n", "15931637 M 183467 104.00 40.00 \n", "33919343 F 122382 100.00 31.00 \n", "9126752 M 4702 98.00 16.00 \n", "39155029 M 88366 95.00 43.00 \n", "52392666 F 15246 87.00 14.00 \n", "16930125 M 58670 84.00 41.00 \n", "26632935 F 4584 83.00 3.00 \n", "48802204 M 718330 80.00 9.00 \n", "19107878 M 308181 78.00 37.00 \n", "217550862 M 40562 78.00 27.00 \n", "24439201 M 38708 78.00 27.00 " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_mention_summary_df = journalist_mention_summary(journalists_mention_df)\n", "journalists_mention_summary_df.to_csv('output/journalists_mentioned_by_journalists.csv')\n", "journalists_mention_summary_df[journalist_mention_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Same, but ordered by number of journalists mentioning" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countmention_countmentioning_count
user_id
108617810DanaBashCNNBash, DanaCNNF281861115.0055.00
14529929jaketapperTapper, JakeCNNM1305680127.0051.00
997684836pkcapitolKane, PaulWashington PostM31300116.0047.00
407013776burgessevEverett, John B.PoliticoM31010212.0046.00
112526560kenvogelVogel, Kenneth P.PoliticoM5389467.0045.00
18227519morningmikaBrzezinski, MikaMSNBCF65303170.0044.00
123327472peterbakernytBaker, PeterNew York TimesM96956107.0043.00
39155029mkrajuRaju, Manu K.CNNM8836695.0043.00
13524182daveweigelWeigel, DavidWashington PostM332344106.0042.00
19186003seungminkimKim, Seung MinPoliticoF33980143.0041.00
16930125edatpostO’Keefe, EdwardWashington PostM5867084.0041.00
15931637jonkarlKarl, JonathanABC NewsM183467104.0040.00
22771961AcostaAcosta, JimCNNM35065061.0038.00
19107878GlennThrushThrush, Glenn H.New York TimesM30818178.0037.00
18678924jmartNYTMartin, JonathanNew York TimesM19732275.0037.00
61734492FahrentholdFahrenthold, DavidWashington PostM45177843.0032.00
16018516jenhabHaberkorn, Jennifer A.PoliticoF20028200.0031.00
33919343AshleyRParkerParker, AshleyWashington PostF122382100.0031.00
50325797chucktoddTodd, ChuckNBC NewsM178124740.0031.00
71294756wolfblitzerBlitzer, WolfCNNM128191456.0030.00
28181835jpaceDCPace, JulieAssociated PressF4601752.0030.00
12354832kasieHunt, KasieNBC NewsF18735767.0029.00
16031927gretaVan Susteren, GretaMSNBCF118685037.0028.00
46557945StevenTDennisDennis, Steven T.Bloomberg NewsM55762105.0027.00
217550862BresPoliticoBresnahan, JohnPoliticoM4056278.0027.00
\n", "
" ], "text/plain": [ " screen_name name organization gender \\\n", "user_id \n", "108617810 DanaBashCNN Bash, Dana CNN F \n", "14529929 jaketapper Tapper, Jake CNN M \n", "997684836 pkcapitol Kane, Paul Washington Post M \n", "407013776 burgessev Everett, John B. Politico M \n", "112526560 kenvogel Vogel, Kenneth P. Politico M \n", "18227519 morningmika Brzezinski, Mika MSNBC F \n", "123327472 peterbakernyt Baker, Peter New York Times M \n", "39155029 mkraju Raju, Manu K. CNN M \n", "13524182 daveweigel Weigel, David Washington Post M \n", "19186003 seungminkim Kim, Seung Min Politico F \n", "16930125 edatpost O’Keefe, Edward Washington Post M \n", "15931637 jonkarl Karl, Jonathan ABC News M \n", "22771961 Acosta Acosta, Jim CNN M \n", "19107878 GlennThrush Thrush, Glenn H. New York Times M \n", "18678924 jmartNYT Martin, Jonathan New York Times M \n", "61734492 Fahrenthold Fahrenthold, David Washington Post M \n", "16018516 jenhab Haberkorn, Jennifer A. Politico F \n", "33919343 AshleyRParker Parker, Ashley Washington Post F \n", "50325797 chucktodd Todd, Chuck NBC News M \n", "71294756 wolfblitzer Blitzer, Wolf CNN M \n", "28181835 jpaceDC Pace, Julie Associated Press F \n", "12354832 kasie Hunt, Kasie NBC News F \n", "16031927 greta Van Susteren, Greta MSNBC F \n", "46557945 StevenTDennis Dennis, Steven T. 
Bloomberg News M \n", "217550862 BresPolitico Bresnahan, John Politico M \n", "\n", " followers_count mention_count mentioning_count \n", "user_id \n", "108617810 281861 115.00 55.00 \n", "14529929 1305680 127.00 51.00 \n", "997684836 31300 116.00 47.00 \n", "407013776 31010 212.00 46.00 \n", "112526560 53894 67.00 45.00 \n", "18227519 653031 70.00 44.00 \n", "123327472 96956 107.00 43.00 \n", "39155029 88366 95.00 43.00 \n", "13524182 332344 106.00 42.00 \n", "19186003 33980 143.00 41.00 \n", "16930125 58670 84.00 41.00 \n", "15931637 183467 104.00 40.00 \n", "22771961 350650 61.00 38.00 \n", "19107878 308181 78.00 37.00 \n", "18678924 197322 75.00 37.00 \n", "61734492 451778 43.00 32.00 \n", "16018516 20028 200.00 31.00 \n", "33919343 122382 100.00 31.00 \n", "50325797 1781247 40.00 31.00 \n", "71294756 1281914 56.00 30.00 \n", "28181835 46017 52.00 30.00 \n", "12354832 187357 67.00 29.00 \n", "16031927 1186850 37.00 28.00 \n", "46557945 55762 105.00 27.00 \n", "217550862 40562 78.00 27.00 " ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_mention_summary_df[journalist_mention_summary_fields].sort_values(['mentioning_count', 'mention_count'], ascending=False).head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists mentioning other journalists, how many are male / female?" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentageavg_mentions
index
M829858.0%6.39
F600042.0%6.04
\n", "
" ], "text/plain": [ " count percentage avg_mentions\n", "index \n", "M 8298 58.0% 6.39\n", "F 6000 42.0% 6.04" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalist_mention_gender_summary(journalists_mention_df)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### On average, how many times are journalists mentioned by other journalists?" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mention_count
count2,292.00
mean6.24
std17.59
min0.00
25%0.00
50%1.00
75%5.00
max330.00
\n", "
" ], "text/plain": [ " mention_count\n", "count 2,292.00\n", "mean 6.24\n", "std 17.59\n", "min 0.00\n", "25% 0.00\n", "50% 1.00\n", "75% 5.00\n", "max 330.00" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_mention_summary_df[['mention_count']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Journalists mentioning female journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists mentioning female journalists, who do they mention the most?" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countmention_countmentioning_count
user_id
325050734AllysonRaeWxBanks, AllysonWUSA–TVF6918330.007.00
16018516jenhabHaberkorn, Jennifer A.PoliticoF20028200.0031.00
19186003seungminkimKim, Seung MinPoliticoF33980143.0041.00
108617810DanaBashCNNBash, DanaCNNF281861115.0055.00
82151660kelsey_snellSnell, KelseWashington PostF8108109.0022.00
33919343AshleyRParkerParker, AshleyWashington PostF122382100.0031.00
52392666ZoeTillmanTillman, ZoeBuzzFeedF1524687.0014.00
26632935HopeSeckHodge Seck, HopeMilitary.comF458483.003.00
16441088jesteiSteinhauer, JenniferNew York TimesF1345276.0026.00
18227519morningmikaBrzezinski, MikaMSNBCF65303170.0044.00
12354832kasieHunt, KasieNBC NewsF18735767.0029.00
139738464mj_leeLee, MJCNNF3194067.0027.00
204599219pw_cunninghamCunningham, PaigeWashington ExaminerF925567.0018.00
118747545eilperinEilperin, JulietWashington PostF2048367.0016.00
360080772FoxReportsFox, LaurenCNNF728265.0015.00
58869089margarettalevTalev, MargaretBloomberg NewsF1958858.0027.00
313545488LauraLitvanLitvan, LauraBloomberg NewsF446858.005.00
19734832sarahkliffKliff, Sarah L.Vox MediaF10009057.0027.00
381664207caitlinnowensOwens, Caitlin N.AxiosF574957.009.00
167024520rachaelmbadeBade, Rachel M.PoliticoF3016456.0026.00
247852986rachanadixitPradhan, Rachana D.PoliticoF617855.0014.00
237477771juliehdavisDavis, JulieNew York TimesF4982155.0010.00
36607254Oriana0214Pawlyk, OrianaMilitary.comF639755.004.00
28181835jpaceDCPace, JulieAssociated PressF4601752.0030.00
48144950JudyWoodruffWoodruff, JudyPBS NewsHourF6429449.007.00
\n", "
" ], "text/plain": [ " screen_name name organization gender \\\n", "user_id \n", "325050734 AllysonRaeWx Banks, Allyson WUSA–TV F \n", "16018516 jenhab Haberkorn, Jennifer A. Politico F \n", "19186003 seungminkim Kim, Seung Min Politico F \n", "108617810 DanaBashCNN Bash, Dana CNN F \n", "82151660 kelsey_snell Snell, Kelse Washington Post F \n", "33919343 AshleyRParker Parker, Ashley Washington Post F \n", "52392666 ZoeTillman Tillman, Zoe BuzzFeed F \n", "26632935 HopeSeck Hodge Seck, Hope Military.com F \n", "16441088 jestei Steinhauer, Jennifer New York Times F \n", "18227519 morningmika Brzezinski, Mika MSNBC F \n", "12354832 kasie Hunt, Kasie NBC News F \n", "139738464 mj_lee Lee, MJ CNN F \n", "204599219 pw_cunningham Cunningham, Paige Washington Examiner F \n", "118747545 eilperin Eilperin, Juliet Washington Post F \n", "360080772 FoxReports Fox, Lauren CNN F \n", "58869089 margarettalev Talev, Margaret Bloomberg News F \n", "313545488 LauraLitvan Litvan, Laura Bloomberg News F \n", "19734832 sarahkliff Kliff, Sarah L. Vox Media F \n", "381664207 caitlinnowens Owens, Caitlin N. Axios F \n", "167024520 rachaelmbade Bade, Rachel M. Politico F \n", "247852986 rachanadixit Pradhan, Rachana D. 
Politico F \n", "237477771 juliehdavis Davis, Julie New York Times F \n", "36607254 Oriana0214 Pawlyk, Oriana Military.com F \n", "28181835 jpaceDC Pace, Julie Associated Press F \n", "48144950 JudyWoodruff Woodruff, Judy PBS NewsHour F \n", "\n", " followers_count mention_count mentioning_count \n", "user_id \n", "325050734 6918 330.00 7.00 \n", "16018516 20028 200.00 31.00 \n", "19186003 33980 143.00 41.00 \n", "108617810 281861 115.00 55.00 \n", "82151660 8108 109.00 22.00 \n", "33919343 122382 100.00 31.00 \n", "52392666 15246 87.00 14.00 \n", "26632935 4584 83.00 3.00 \n", "16441088 13452 76.00 26.00 \n", "18227519 653031 70.00 44.00 \n", "12354832 187357 67.00 29.00 \n", "139738464 31940 67.00 27.00 \n", "204599219 9255 67.00 18.00 \n", "118747545 20483 67.00 16.00 \n", "360080772 7282 65.00 15.00 \n", "58869089 19588 58.00 27.00 \n", "313545488 4468 58.00 5.00 \n", "19734832 100090 57.00 27.00 \n", "381664207 5749 57.00 9.00 \n", "167024520 30164 56.00 26.00 \n", "247852986 6178 55.00 14.00 \n", "237477771 49821 55.00 10.00 \n", "36607254 6397 55.00 4.00 \n", "28181835 46017 52.00 30.00 \n", "48144950 64294 49.00 7.00 " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "female_journalists_mention_summary_df = journalists_mention_summary_df[journalists_mention_summary_df.gender == 'F']\n", "female_journalists_mention_summary_df.to_csv('output/female_journalists_mentioned_by_journalists.csv')\n", "female_journalists_mention_summary_df[journalist_mention_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### On average, how many times are female journalists mentioned by journalists?" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mention_count
count993.00
mean6.04
std17.95
min0.00
25%0.00
50%1.00
75%4.00
max330.00
\n", "
" ], "text/plain": [ " mention_count\n", "count 993.00\n", "mean 6.04\n", "std 17.95\n", "min 0.00\n", "25% 0.00\n", "50% 1.00\n", "75% 4.00\n", "max 330.00" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "female_journalists_mention_summary_df[['mention_count']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Journalists mentioning male journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists mentioning male journalists, who do they mention the most?" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countmention_countmentioning_count
user_id
28496589TenaciousTopperShutt, CharlesWUSA–TVM15868239.0013.00
63149389hbwxBernstein, HowardWUSA–TVM8337235.0010.00
407013776burgessevEverett, John B.PoliticoM31010212.0046.00
14529929jaketapperTapper, JakeCNNM1305680127.0051.00
169586280WaPoSeanSullivan, SeanWashington PostM22860117.0020.00
997684836pkcapitolKane, PaulWashington PostM31300116.0047.00
123327472peterbakernytBaker, PeterNew York TimesM96956107.0043.00
13524182daveweigelWeigel, DavidWashington PostM332344106.0042.00
46557945StevenTDennisDennis, Steven T.Bloomberg NewsM55762105.0027.00
15931637jonkarlKarl, JonathanABC NewsM183467104.0040.00
9126752reporterjoeGould, Joseph M.Sightline Media GroupM470298.0016.00
39155029mkrajuRaju, Manu K.CNNM8836695.0043.00
16930125edatpostO’Keefe, EdwardWashington PostM5867084.0041.00
48802204HardballChrisMatthews, ChrisNBC NewsM71833080.009.00
19107878GlennThrushThrush, Glenn H.New York TimesM30818178.0037.00
217550862BresPoliticoBresnahan, JohnPoliticoM4056278.0027.00
24439201jameshohmannHohmann, James P.Washington PostM3870878.0027.00
18678924jmartNYTMartin, JonathanNew York TimesM19732275.0037.00
22891564chrisgeidnerGeidner, ChrisBuzzFeedM8331673.0015.00
112526560kenvogelVogel, Kenneth P.PoliticoM5389467.0045.00
18646108BretBaierBaier, BretFox NewsM109518466.0018.00
22771961AcostaAcosta, JimCNNM35065061.0038.00
16067683pauldemkoDemko, Paul JeffreyPoliticoM817060.0013.00
59676104danbalzBalz, DanielWashington PostM9081957.0026.00
71294756wolfblitzerBlitzer, WolfCNNM128191456.0030.00
\n", "
" ], "text/plain": [ " screen_name name organization gender \\\n", "user_id \n", "28496589 TenaciousTopper Shutt, Charles WUSA–TV M \n", "63149389 hbwx Bernstein, Howard WUSA–TV M \n", "407013776 burgessev Everett, John B. Politico M \n", "14529929 jaketapper Tapper, Jake CNN M \n", "169586280 WaPoSean Sullivan, Sean Washington Post M \n", "997684836 pkcapitol Kane, Paul Washington Post M \n", "123327472 peterbakernyt Baker, Peter New York Times M \n", "13524182 daveweigel Weigel, David Washington Post M \n", "46557945 StevenTDennis Dennis, Steven T. Bloomberg News M \n", "15931637 jonkarl Karl, Jonathan ABC News M \n", "9126752 reporterjoe Gould, Joseph M. Sightline Media Group M \n", "39155029 mkraju Raju, Manu K. CNN M \n", "16930125 edatpost O’Keefe, Edward Washington Post M \n", "48802204 HardballChris Matthews, Chris NBC News M \n", "19107878 GlennThrush Thrush, Glenn H. New York Times M \n", "217550862 BresPolitico Bresnahan, John Politico M \n", "24439201 jameshohmann Hohmann, James P. Washington Post M \n", "18678924 jmartNYT Martin, Jonathan New York Times M \n", "22891564 chrisgeidner Geidner, Chris BuzzFeed M \n", "112526560 kenvogel Vogel, Kenneth P. 
Politico M \n", "18646108 BretBaier Baier, Bret Fox News M \n", "22771961 Acosta Acosta, Jim CNN M \n", "16067683 pauldemko Demko, Paul Jeffrey Politico M \n", "59676104 danbalz Balz, Daniel Washington Post M \n", "71294756 wolfblitzer Blitzer, Wolf CNN M \n", "\n", " followers_count mention_count mentioning_count \n", "user_id \n", "28496589 15868 239.00 13.00 \n", "63149389 8337 235.00 10.00 \n", "407013776 31010 212.00 46.00 \n", "14529929 1305680 127.00 51.00 \n", "169586280 22860 117.00 20.00 \n", "997684836 31300 116.00 47.00 \n", "123327472 96956 107.00 43.00 \n", "13524182 332344 106.00 42.00 \n", "46557945 55762 105.00 27.00 \n", "15931637 183467 104.00 40.00 \n", "9126752 4702 98.00 16.00 \n", "39155029 88366 95.00 43.00 \n", "16930125 58670 84.00 41.00 \n", "48802204 718330 80.00 9.00 \n", "19107878 308181 78.00 37.00 \n", "217550862 40562 78.00 27.00 \n", "24439201 38708 78.00 27.00 \n", "18678924 197322 75.00 37.00 \n", "22891564 83316 73.00 15.00 \n", "112526560 53894 67.00 45.00 \n", "18646108 1095184 66.00 18.00 \n", "22771961 350650 61.00 38.00 \n", "16067683 8170 60.00 13.00 \n", "59676104 90819 57.00 26.00 \n", "71294756 1281914 56.00 30.00 " ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "male_journalists_mention_summary_df = journalists_mention_summary_df[journalists_mention_summary_df.gender == 'M']\n", "male_journalists_mention_summary_df.to_csv('output/male_journalists_mentioned_by_journalists.csv')\n", "male_journalists_mention_summary_df[journalist_mention_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### On average, how many times are male journalists mentioned by journalists?" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mention_count
count1,299.00
mean6.39
std17.31
min0.00
25%0.00
50%1.00
75%5.00
max239.00
\n", "
" ], "text/plain": [ " mention_count\n", "count 1,299.00\n", "mean 6.39\n", "std 17.31\n", "min 0.00\n", "25% 0.00\n", "50% 1.00\n", "75% 5.00\n", "max 239.00" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "male_journalists_mention_summary_df[['mention_count']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Female journalists mentioning other journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of female journalists mentioning other journalists, who do they mention the most?" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countmention_countmentioning_count
user_id
407013776burgessevEverett, John B.PoliticoM31010164.0020.00
16018516jenhabHaberkorn, Jennifer A.PoliticoF20028116.0013.00
46557945StevenTDennisDennis, Steven T.Bloomberg NewsM5576279.0010.00
169586280WaPoSeanSullivan, SeanWashington PostM2286071.0011.00
48802204HardballChrisMatthews, ChrisNBC NewsM71833070.003.00
19186003seungminkimKim, Seung MinPoliticoF3398064.0016.00
22891564chrisgeidnerGeidner, ChrisBuzzFeedM8331661.006.00
108617810DanaBashCNNBash, DanaCNNF28186160.0026.00
16067683pauldemkoDemko, Paul JeffreyPoliticoM817057.0010.00
313545488LauraLitvanLitvan, LauraBloomberg NewsF446853.002.00
52392666ZoeTillmanTillman, ZoeBuzzFeedF1524652.008.00
33919343AshleyRParkerParker, AshleyWashington PostF12238249.0011.00
82151660kelsey_snellSnell, KelseWashington PostF810847.0010.00
247852986rachanadixitPradhan, Rachana D.PoliticoF617843.007.00
9126752reporterjoeGould, Joseph M.Sightline Media GroupM470243.007.00
14529929jaketapperTapper, JakeCNNM130568040.0021.00
16930125edatpostO’Keefe, EdwardWashington PostM5867040.0018.00
217550862BresPoliticoBresnahan, JohnPoliticoM4056237.0013.00
16149614jrovnerRovner, JulieKaiser Health NewsF2184435.0014.00
997684836pkcapitolKane, PaulWashington PostM3130035.0013.00
12354832kasieHunt, KasieNBC NewsF18735735.0012.00
158072303ValerieInsinnaInsinna, ValerieDefense NewsF457235.002.00
15931637jonkarlKarl, JonathanABC NewsM18346733.0018.00
342226913GregStohrStohr, GregBloomberg NewsM724532.002.00
297532865kwelkernbcWelker, KristenNBC NewsF9923431.009.00
\n", "
" ], "text/plain": [ " screen_name name organization \\\n", "user_id \n", "407013776 burgessev Everett, John B. Politico \n", "16018516 jenhab Haberkorn, Jennifer A. Politico \n", "46557945 StevenTDennis Dennis, Steven T. Bloomberg News \n", "169586280 WaPoSean Sullivan, Sean Washington Post \n", "48802204 HardballChris Matthews, Chris NBC News \n", "19186003 seungminkim Kim, Seung Min Politico \n", "22891564 chrisgeidner Geidner, Chris BuzzFeed \n", "108617810 DanaBashCNN Bash, Dana CNN \n", "16067683 pauldemko Demko, Paul Jeffrey Politico \n", "313545488 LauraLitvan Litvan, Laura Bloomberg News \n", "52392666 ZoeTillman Tillman, Zoe BuzzFeed \n", "33919343 AshleyRParker Parker, Ashley Washington Post \n", "82151660 kelsey_snell Snell, Kelse Washington Post \n", "247852986 rachanadixit Pradhan, Rachana D. Politico \n", "9126752 reporterjoe Gould, Joseph M. Sightline Media Group \n", "14529929 jaketapper Tapper, Jake CNN \n", "16930125 edatpost O’Keefe, Edward Washington Post \n", "217550862 BresPolitico Bresnahan, John Politico \n", "16149614 jrovner Rovner, Julie Kaiser Health News \n", "997684836 pkcapitol Kane, Paul Washington Post \n", "12354832 kasie Hunt, Kasie NBC News \n", "158072303 ValerieInsinna Insinna, Valerie Defense News \n", "15931637 jonkarl Karl, Jonathan ABC News \n", "342226913 GregStohr Stohr, Greg Bloomberg News \n", "297532865 kwelkernbc Welker, Kristen NBC News \n", "\n", " gender followers_count mention_count mentioning_count \n", "user_id \n", "407013776 M 31010 164.00 20.00 \n", "16018516 F 20028 116.00 13.00 \n", "46557945 M 55762 79.00 10.00 \n", "169586280 M 22860 71.00 11.00 \n", "48802204 M 718330 70.00 3.00 \n", "19186003 F 33980 64.00 16.00 \n", "22891564 M 83316 61.00 6.00 \n", "108617810 F 281861 60.00 26.00 \n", "16067683 M 8170 57.00 10.00 \n", "313545488 F 4468 53.00 2.00 \n", "52392666 F 15246 52.00 8.00 \n", "33919343 F 122382 49.00 11.00 \n", "82151660 F 8108 47.00 10.00 \n", "247852986 F 6178 43.00 7.00 \n", "9126752 M 
4702 43.00 7.00 \n", "14529929 M 1305680 40.00 21.00 \n", "16930125 M 58670 40.00 18.00 \n", "217550862 M 40562 37.00 13.00 \n", "16149614 F 21844 35.00 14.00 \n", "997684836 M 31300 35.00 13.00 \n", "12354832 F 187357 35.00 12.00 \n", "158072303 F 4572 35.00 2.00 \n", "15931637 M 183467 33.00 18.00 \n", "342226913 M 7245 32.00 2.00 \n", "297532865 F 99234 31.00 9.00 " ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_mentioned_by_female_summary_df = journalist_mention_summary(journalists_mention_df[journalists_mention_df.gender == 'F'])\n", "journalists_mentioned_by_female_summary_df.to_csv('output/journalists_mentioned_by_female_journalists.csv')\n", "journalists_mentioned_by_female_summary_df[journalist_mention_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of female journalists mentioning journalists, how many are male / female?" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentageavg_mentions
index
M316254.8%2.43
F260545.2%2.62
\n", "
" ], "text/plain": [ " count percentage avg_mentions\n", "index \n", "M 3162 54.8% 2.43\n", "F 2605 45.2% 2.62" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalist_mention_gender_summary(journalists_mention_df[journalists_mention_df.gender == 'F'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Male journalists mentioning other journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of male journalists mentioning other journalists, who do they mention the most?" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countmention_countmentioning_count
user_id
325050734AllysonRaeWxBanks, AllysonWUSA–TVF6918324.004.00
28496589TenaciousTopperShutt, CharlesWUSA–TVM15868225.007.00
63149389hbwxBernstein, HowardWUSA–TVM8337225.004.00
14529929jaketapperTapper, JakeCNNM130568087.0030.00
13524182daveweigelWeigel, DavidWashington PostM33234484.0030.00
16018516jenhabHaberkorn, Jennifer A.PoliticoF2002884.0018.00
997684836pkcapitolKane, PaulWashington PostM3130081.0034.00
19186003seungminkimKim, Seung MinPoliticoF3398079.0025.00
123327472peterbakernytBaker, PeterNew York TimesM9695678.0029.00
26632935HopeSeckHodge Seck, HopeMilitary.comF458476.001.00
15931637jonkarlKarl, JonathanABC NewsM18346771.0022.00
18678924jmartNYTMartin, JonathanNew York TimesM19732269.0031.00
39155029mkrajuRaju, Manu K.CNNM8836667.0027.00
19107878GlennThrushThrush, Glenn H.New York TimesM30818166.0029.00
16441088jesteiSteinhauer, JenniferNew York TimesF1345264.0017.00
82151660kelsey_snellSnell, KelseWashington PostF810862.0012.00
24439201jameshohmannHohmann, James P.Washington PostM3870859.0017.00
18646108BretBaierBaier, BretFox NewsM109518459.0014.00
108617810DanaBashCNNBash, DanaCNNF28186155.0029.00
9126752reporterjoeGould, Joseph M.Sightline Media GroupM470255.009.00
381664207caitlinnowensOwens, Caitlin N.AxiosF574955.007.00
33919343AshleyRParkerParker, AshleyWashington PostF12238251.0020.00
204599219pw_cunninghamCunningham, PaigeWashington ExaminerF925551.009.00
112526560kenvogelVogel, Kenneth P.PoliticoM5389450.0032.00
36607254Oriana0214Pawlyk, OrianaMilitary.comF639750.003.00
\n", "
" ], "text/plain": [ " screen_name name organization \\\n", "user_id \n", "325050734 AllysonRaeWx Banks, Allyson WUSA–TV \n", "28496589 TenaciousTopper Shutt, Charles WUSA–TV \n", "63149389 hbwx Bernstein, Howard WUSA–TV \n", "14529929 jaketapper Tapper, Jake CNN \n", "13524182 daveweigel Weigel, David Washington Post \n", "16018516 jenhab Haberkorn, Jennifer A. Politico \n", "997684836 pkcapitol Kane, Paul Washington Post \n", "19186003 seungminkim Kim, Seung Min Politico \n", "123327472 peterbakernyt Baker, Peter New York Times \n", "26632935 HopeSeck Hodge Seck, Hope Military.com \n", "15931637 jonkarl Karl, Jonathan ABC News \n", "18678924 jmartNYT Martin, Jonathan New York Times \n", "39155029 mkraju Raju, Manu K. CNN \n", "19107878 GlennThrush Thrush, Glenn H. New York Times \n", "16441088 jestei Steinhauer, Jennifer New York Times \n", "82151660 kelsey_snell Snell, Kelse Washington Post \n", "24439201 jameshohmann Hohmann, James P. Washington Post \n", "18646108 BretBaier Baier, Bret Fox News \n", "108617810 DanaBashCNN Bash, Dana CNN \n", "9126752 reporterjoe Gould, Joseph M. Sightline Media Group \n", "381664207 caitlinnowens Owens, Caitlin N. Axios \n", "33919343 AshleyRParker Parker, Ashley Washington Post \n", "204599219 pw_cunningham Cunningham, Paige Washington Examiner \n", "112526560 kenvogel Vogel, Kenneth P. 
Politico \n", "36607254 Oriana0214 Pawlyk, Oriana Military.com \n", "\n", " gender followers_count mention_count mentioning_count \n", "user_id \n", "325050734 F 6918 324.00 4.00 \n", "28496589 M 15868 225.00 7.00 \n", "63149389 M 8337 225.00 4.00 \n", "14529929 M 1305680 87.00 30.00 \n", "13524182 M 332344 84.00 30.00 \n", "16018516 F 20028 84.00 18.00 \n", "997684836 M 31300 81.00 34.00 \n", "19186003 F 33980 79.00 25.00 \n", "123327472 M 96956 78.00 29.00 \n", "26632935 F 4584 76.00 1.00 \n", "15931637 M 183467 71.00 22.00 \n", "18678924 M 197322 69.00 31.00 \n", "39155029 M 88366 67.00 27.00 \n", "19107878 M 308181 66.00 29.00 \n", "16441088 F 13452 64.00 17.00 \n", "82151660 F 8108 62.00 12.00 \n", "24439201 M 38708 59.00 17.00 \n", "18646108 M 1095184 59.00 14.00 \n", "108617810 F 281861 55.00 29.00 \n", "9126752 M 4702 55.00 9.00 \n", "381664207 F 5749 55.00 7.00 \n", "33919343 F 122382 51.00 20.00 \n", "204599219 F 9255 51.00 9.00 \n", "112526560 M 53894 50.00 32.00 \n", "36607254 F 6397 50.00 3.00 " ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_mentioned_by_male_summary_df = journalist_mention_summary(journalists_mention_df[journalists_mention_df.gender == 'M'])\n", "journalists_mentioned_by_male_summary_df.to_csv('output/journalists_mentioned_by_male_journalists.csv')\n", "journalists_mentioned_by_male_summary_df[journalist_mention_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of male journalists mentioning other journalists, how many are male / female?" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentageavg_mentions
index
M513660.2%3.95
F339539.8%3.42
\n", "
" ], "text/plain": [ " count percentage avg_mentions\n", "index \n", "M 5136 60.2% 3.95\n", "F 3395 39.8% 3.42" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalist_mention_gender_summary(journalists_mention_df[journalists_mention_df.gender == 'M'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Retweet data prep" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load retweets from tweets\n", "Including retweets and quotes" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz\n", "DEBUG:root:Loaded 50000\n", "DEBUG:root:Loaded 100000\n", "DEBUG:root:Loaded 150000\n", "DEBUG:root:Loaded 200000\n", "DEBUG:root:Loaded 250000\n", "INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz\n", "DEBUG:root:Loaded 300000\n", "DEBUG:root:Loaded 350000\n", "DEBUG:root:Loaded 400000\n", "DEBUG:root:Loaded 450000\n", "DEBUG:root:Loaded 500000\n", "INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz\n", "DEBUG:root:Loaded 550000\n", "DEBUG:root:Loaded 600000\n", "DEBUG:root:Loaded 650000\n", "DEBUG:root:Loaded 700000\n", "DEBUG:root:Loaded 750000\n", "DEBUG:root:Loaded 800000\n" ] }, { "data": { "text/plain": [ "tweet_id 456956\n", "user_id 456956\n", "screen_name 456956\n", "retweet_user_id 456956\n", "retweet_screen_name 456956\n", "tweet_created_at 456956\n", "dtype: int64" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Simply the tweet on load\n", "def retweet_transform(tweet):\n", " if tweet_type(tweet) in ('retweet', 'quote'):\n", " retweet = tweet.get('retweeted_status') or tweet.get('quoted_status')\n", " return {\n", " 'tweet_id': tweet['id_str'],\n", " 'user_id': tweet['user']['id_str'],\n", " 'screen_name': tweet['user']['screen_name'],\n", 
" 'retweet_user_id': retweet['user']['id_str'],\n", " 'retweet_screen_name': retweet['user']['screen_name'],\n", " 'tweet_created_at': date_parse(tweet['created_at']) \n", " }\n", " return None\n", "\n", "base_retweet_df = load_tweet_df(retweet_transform, ['tweet_id', 'user_id', 'screen_name', 'retweet_user_id',\n", " 'retweet_screen_name', 'tweet_created_at'],\n", " dedupe_columns=['tweet_id'])\n", "\n", "base_retweet_df.count()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_iduser_idscreen_nameretweet_user_idretweet_screen_nametweet_created_at
0872631046088601600327862439jonathanvswan93069110maggieNYT2017-06-08 01:47:08+00:00
1872610483647516673327862439jonathanvswan160951141TomNamako2017-06-08 00:25:26+00:00
2872609618626826240327862439jonathanvswan18678924jmartNYT2017-06-08 00:22:00+00:00
3872605974699311104327862439jonathanvswan93069110maggieNYT2017-06-08 00:07:31+00:00
4872603191518646276327862439jonathanvswan94784682JonathanTurley2017-06-07 23:56:27+00:00
\n", "
" ], "text/plain": [ " tweet_id user_id screen_name retweet_user_id \\\n", "0 872631046088601600 327862439 jonathanvswan 93069110 \n", "1 872610483647516673 327862439 jonathanvswan 160951141 \n", "2 872609618626826240 327862439 jonathanvswan 18678924 \n", "3 872605974699311104 327862439 jonathanvswan 93069110 \n", "4 872603191518646276 327862439 jonathanvswan 94784682 \n", "\n", " retweet_screen_name tweet_created_at \n", "0 maggieNYT 2017-06-08 01:47:08+00:00 \n", "1 TomNamako 2017-06-08 00:25:26+00:00 \n", "2 jmartNYT 2017-06-08 00:22:00+00:00 \n", "3 maggieNYT 2017-06-08 00:07:31+00:00 \n", "4 JonathanTurley 2017-06-07 23:56:27+00:00 " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_retweet_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add gender of retweeter" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tweet_id 456956\n", "user_id 456956\n", "screen_name 456956\n", "retweet_user_id 456956\n", "retweet_screen_name 456956\n", "tweet_created_at 456956\n", "gender 456956\n", "dtype: int64" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "retweet_df = base_retweet_df.join(user_summary_df['gender'], on='user_id')\n", "retweet_df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### How many users have been retweeted by journalists?" 
] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "49154" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "retweet_df['retweet_user_id'].unique().size" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Limit to retweeted journalists" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tweet_id 117048\n", "user_id 117048\n", "screen_name 117048\n", "retweet_user_id 117048\n", "retweet_screen_name 117048\n", "tweet_created_at 117048\n", "gender 117048\n", "retweet_gender 117048\n", "dtype: int64" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_retweet_df = retweet_df.join(user_summary_df['gender'], how='inner', on='retweet_user_id', rsuffix='_retweet')\n", "journalists_retweet_df.rename(columns = {'gender_retweet': 'retweet_gender'}, inplace=True)\n", "journalists_retweet_df.count()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet_iduser_idscreen_nameretweet_user_idretweet_screen_nametweet_created_atgenderretweet_gender
2872609618626826240327862439jonathanvswan18678924jmartNYT2017-06-08 00:22:00+00:00MM
435871437820044464128242169927colinwilhelm18678924jmartNYT2017-06-04 18:45:41+00:00MM
1406872620054889857024163589845PoliticoKevin18678924jmartNYT2017-06-08 01:03:28+00:00MM
1424872240756597174272163589845PoliticoKevin18678924jmartNYT2017-06-06 23:56:16+00:00MM
1455870749993279385601163589845PoliticoKevin18678924jmartNYT2017-06-02 21:12:30+00:00MM
\n", "
" ], "text/plain": [ " tweet_id user_id screen_name retweet_user_id \\\n", "2 872609618626826240 327862439 jonathanvswan 18678924 \n", "435 871437820044464128 242169927 colinwilhelm 18678924 \n", "1406 872620054889857024 163589845 PoliticoKevin 18678924 \n", "1424 872240756597174272 163589845 PoliticoKevin 18678924 \n", "1455 870749993279385601 163589845 PoliticoKevin 18678924 \n", "\n", " retweet_screen_name tweet_created_at gender retweet_gender \n", "2 jmartNYT 2017-06-08 00:22:00+00:00 M M \n", "435 jmartNYT 2017-06-04 18:45:41+00:00 M M \n", "1406 jmartNYT 2017-06-08 01:03:28+00:00 M M \n", "1424 jmartNYT 2017-06-06 23:56:16+00:00 M M \n", "1455 jmartNYT 2017-06-02 21:12:30+00:00 M M " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_retweet_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Functions for summarizing retweets by beltway journalists" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "# Gender of beltway journalists retweeted by beltway journalists\n", "def journalist_retweet_gender_summary(retweet_df):\n", " gender_summary_df = pd.DataFrame({'count':retweet_df.retweet_gender.value_counts(), \n", " 'percentage': retweet_df.retweet_gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})\n", " gender_summary_df.reset_index(inplace=True)\n", " gender_summary_df['avg_retweets'] = gender_summary_df.apply(lambda row: row['count'] / journalist_gender_summary_df.loc[row['index']]['count'], axis=1) \n", " gender_summary_df.set_index('index', inplace=True, drop=True)\n", " return gender_summary_df\n", "\n", "\n", "def journalist_retweet_summary(retweet_df):\n", " # Retweet count\n", " retweet_count_df = pd.DataFrame(retweet_df.retweet_user_id.value_counts().rename('retweet_count'))\n", "\n", " # Retweeting users. 
That is, the number of unique users retweeting each user.\n", " retweet_user_id_per_user_df = retweet_df[['retweet_user_id', 'user_id']].drop_duplicates()\n", " retweeting_user_count_df = pd.DataFrame(retweet_user_id_per_user_df.groupby('retweet_user_id').size(), columns=['retweeting_count'])\n", " retweeting_user_count_df.index.name = 'user_id'\n", "\n", " # Join with user summary\n", " journalist_retweet_summary_df = user_summary_df.join([retweet_count_df, retweeting_user_count_df])\n", " journalist_retweet_summary_df.fillna(0, inplace=True)\n", " journalist_retweet_summary_df = journalist_retweet_summary_df.sort_values(['retweet_count', 'retweeting_count', 'followers_count'], ascending=False)\n", " return journalist_retweet_summary_df\n", "\n", "# Gender of top journalists retweeted by beltway journalists\n", "def top_journalist_retweet_gender_summary(retweet_summary_df, retweeting_count_threshold=0, head=100):\n", " top_retweet_summary_df = retweet_summary_df[retweet_summary_df.retweeting_count > retweeting_count_threshold].head(head)\n", " return pd.DataFrame({'count': top_retweet_summary_df.gender.value_counts(), \n", " 'percentage': top_retweet_summary_df.gender.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'})\n", "\n", "# Fields for displaying journalist retweet summaries\n", "journalist_retweet_summary_fields = ['screen_name', 'name', 'organization', 'gender', 'followers_count', 'retweet_count', 'retweeting_count']\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Retweet analysis\n", "*Note that for each of these, the complete list is being written to CSV in the output directory.*\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Retweets of all accounts (not just journalists)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists retweeting other accounts, how many of the retweets are from males / females?\n", "That is, by gender of retweeter." 
] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
retweetquotetotalpercentageavg_retweets
gender
F134,606.0038,998.00173,604.0038.0%174.83
M210,660.0072,692.00283,352.0062.0%218.13
\n", "
" ], "text/plain": [ " retweet quote total \\\n", "gender \n", "F 134,606.00 38,998.00 173,604.00 \n", "M 210,660.00 72,692.00 283,352.00 \n", "\n", " percentage avg_retweets \n", "gender \n", "F 38.0% 174.83 \n", "M 62.0% 218.13 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "retweets_by_gender_df = user_summary_df[['gender', 'retweet', 'quote']].groupby('gender').sum()\n", "retweets_by_gender_df['total'] = retweets_by_gender_df.retweet + retweets_by_gender_df.quote\n", "retweets_by_gender_df['percentage'] = retweets_by_gender_df.total.div(retweets_by_gender_df.total.sum()).mul(100).round(1).astype(str) + '%'\n", "retweets_by_gender_df.reset_index(inplace=True)\n", "retweets_by_gender_df['avg_retweets'] = retweets_by_gender_df.apply(lambda row: row['total'] / journalist_gender_summary_df.loc[row['gender']]['count'], axis=1)\n", "retweets_by_gender_df.set_index('gender', inplace=True, drop=True)\n", "retweets_by_gender_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists retweeting other accounts, who retweets the most?" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_counttweet_countretweetquotetweets_in_datasetretweet_count
user_id
2453025128gloriaminottMinott, GloriaWPFW–FMF5866147321,524.000.0021,547.0021,524.00
304988603NeilWMcCabeMcCabe, NeilBreitbart NewsM18903646737,528.00625.009,370.008,153.00
18825339CahnEmilyCahn, EmilyMicF169801008034,449.001,834.008,196.006,283.00
191964162SamLitzingerLitzinger, SamCBS NewsM2329952366,017.00225.007,537.006,242.00
21612122HotlineJoshKraushaar, Josh P.National JournalM504381566104,881.00893.006,703.005,774.00
259395895JohnJHarwoodHarwood, JohnCNBCM149040780154,570.00822.006,377.005,392.00
16031927gretaVan Susteren, GretaMSNBCF1186850116645794.003,069.004,792.003,863.00
21810329sdonnanDonnan, ShawnFinancial TimesM12311791253,332.00449.004,537.003,781.00
47408060JonathanLandayLanday, JonathanMcClatchy NewspapersM11213810423,687.0080.004,285.003,767.00
13524182daveweigelWeigel, DavidWashington PostM3323441699082,703.00859.004,564.003,562.00
21696279brianbeutlerBeutler, Brian AlfredNew RepublicM74435990502,694.00684.004,560.003,378.00
104299137DavidMDruckerDrucker, DavidWashington ExaminerM350331046131,377.001,955.004,907.003,332.00
593813785DonnaYoungDCYoung, DonnaS&P Global Market IntelligenceF5894499671,740.001,327.004,414.003,067.00
456994513maria_e_recioRecio, MariaAustin American-StatesmanF1072408222,613.00336.003,370.002,949.00
19576571JaredRizziRizzi, JaredSirius XM Satellite RadioM13545416202,112.00828.005,567.002,940.00
16459325ryanbeckwithBeckwith, Ryan TeagueTime MagazineM20947922032,231.00521.005,187.002,752.00
14529929jaketapperTapper, JakeCNNM13056801481432,435.00287.005,078.002,722.00
61734492FahrentholdFahrenthold, DavidWashington PostM451778275732,505.00184.002,871.002,689.00
19545932kampeasKampeas, RonJewish Telegraphic AgencyM6977530531,988.00444.003,249.002,432.00
42352386rschlesSchlesinger, RobertU.S. News & World ReportM4553353751,644.00617.002,459.002,261.00
25702314EricMGarciaGarcia, Eric M.CQ Roll CallM309444783528.001,723.003,584.002,251.00
18646108BretBaierBaier, BretFox NewsM1095184522711,623.00615.002,379.002,238.00
15486163SimonMarksFSNMarks, SimonFeature Story NewsM7767415411,296.00934.003,432.002,230.00
18678924jmartNYTMartin, JonathanNew York TimesM1973221069701,665.00467.002,810.002,132.00
15730608edrosoEdroso, RoyUCGM4696380641,714.00379.002,883.002,093.00
\n", "
" ], "text/plain": [ " screen_name name \\\n", "user_id \n", "2453025128 gloriaminott Minott, Gloria \n", "304988603 NeilWMcCabe McCabe, Neil \n", "18825339 CahnEmily Cahn, Emily \n", "191964162 SamLitzinger Litzinger, Sam \n", "21612122 HotlineJosh Kraushaar, Josh P. \n", "259395895 JohnJHarwood Harwood, John \n", "16031927 greta Van Susteren, Greta \n", "21810329 sdonnan Donnan, Shawn \n", "47408060 JonathanLanday Landay, Jonathan \n", "13524182 daveweigel Weigel, David \n", "21696279 brianbeutler Beutler, Brian Alfred \n", "104299137 DavidMDrucker Drucker, David \n", "593813785 DonnaYoungDC Young, Donna \n", "456994513 maria_e_recio Recio, Maria \n", "19576571 JaredRizzi Rizzi, Jared \n", "16459325 ryanbeckwith Beckwith, Ryan Teague \n", "14529929 jaketapper Tapper, Jake \n", "61734492 Fahrenthold Fahrenthold, David \n", "19545932 kampeas Kampeas, Ron \n", "42352386 rschles Schlesinger, Robert \n", "25702314 EricMGarcia Garcia, Eric M. \n", "18646108 BretBaier Baier, Bret \n", "15486163 SimonMarksFSN Marks, Simon \n", "18678924 jmartNYT Martin, Jonathan \n", "15730608 edroso Edroso, Roy \n", "\n", " organization gender followers_count \\\n", "user_id \n", "2453025128 WPFW–FM F 586 \n", "304988603 Breitbart News M 18903 \n", "18825339 Mic F 16980 \n", "191964162 CBS News M 2329 \n", "21612122 National Journal M 50438 \n", "259395895 CNBC M 149040 \n", "16031927 MSNBC F 1186850 \n", "21810329 Financial Times M 12311 \n", "47408060 McClatchy Newspapers M 11213 \n", "13524182 Washington Post M 332344 \n", "21696279 New Republic M 74435 \n", "104299137 Washington Examiner M 35033 \n", "593813785 S&P Global Market Intelligence F 5894 \n", "456994513 Austin American-Statesman F 1072 \n", "19576571 Sirius XM Satellite Radio M 13545 \n", "16459325 Time Magazine M 20947 \n", "14529929 CNN M 1305680 \n", "61734492 Washington Post M 451778 \n", "19545932 Jewish Telegraphic Agency M 6977 \n", "42352386 U.S. 
News & World Report M 4553 \n", "25702314 CQ Roll Call M 3094 \n", "18646108 Fox News M 1095184 \n", "15486163 Feature Story News M 7767 \n", "18678924 New York Times M 197322 \n", "15730608 UCG M 4696 \n", "\n", " tweet_count retweet quote \\\n", "user_id \n", "2453025128 61473 21,524.00 0.00 \n", "304988603 64673 7,528.00 625.00 \n", "18825339 100803 4,449.00 1,834.00 \n", "191964162 95236 6,017.00 225.00 \n", "21612122 156610 4,881.00 893.00 \n", "259395895 78015 4,570.00 822.00 \n", "16031927 116645 794.00 3,069.00 \n", "21810329 79125 3,332.00 449.00 \n", "47408060 81042 3,687.00 80.00 \n", "13524182 169908 2,703.00 859.00 \n", "21696279 99050 2,694.00 684.00 \n", "104299137 104613 1,377.00 1,955.00 \n", "593813785 49967 1,740.00 1,327.00 \n", "456994513 40822 2,613.00 336.00 \n", "19576571 41620 2,112.00 828.00 \n", "16459325 92203 2,231.00 521.00 \n", "14529929 148143 2,435.00 287.00 \n", "61734492 27573 2,505.00 184.00 \n", "19545932 53053 1,988.00 444.00 \n", "42352386 35375 1,644.00 617.00 \n", "25702314 44783 528.00 1,723.00 \n", "18646108 52271 1,623.00 615.00 \n", "15486163 41541 1,296.00 934.00 \n", "18678924 106970 1,665.00 467.00 \n", "15730608 38064 1,714.00 379.00 \n", "\n", " tweets_in_dataset retweet_count \n", "user_id \n", "2453025128 21,547.00 21,524.00 \n", "304988603 9,370.00 8,153.00 \n", "18825339 8,196.00 6,283.00 \n", "191964162 7,537.00 6,242.00 \n", "21612122 6,703.00 5,774.00 \n", "259395895 6,377.00 5,392.00 \n", "16031927 4,792.00 3,863.00 \n", "21810329 4,537.00 3,781.00 \n", "47408060 4,285.00 3,767.00 \n", "13524182 4,564.00 3,562.00 \n", "21696279 4,560.00 3,378.00 \n", "104299137 4,907.00 3,332.00 \n", "593813785 4,414.00 3,067.00 \n", "456994513 3,370.00 2,949.00 \n", "19576571 5,567.00 2,940.00 \n", "16459325 5,187.00 2,752.00 \n", "14529929 5,078.00 2,722.00 \n", "61734492 2,871.00 2,689.00 \n", "19545932 3,249.00 2,432.00 \n", "42352386 2,459.00 2,261.00 \n", "25702314 3,584.00 2,251.00 \n", "18646108 2,379.00 2,238.00 
\n", "15486163 3,432.00 2,230.00 \n", "18678924 2,810.00 2,132.00 \n", "15730608 2,883.00 2,093.00 " ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "retweet_user_summary_df = user_summary_df.loc[:,('screen_name', 'name', 'organization', 'gender', 'followers_count', 'tweet_count', 'retweet', 'quote', 'tweets_in_dataset')]\n", "retweet_user_summary_df['retweet_count'] = retweet_user_summary_df.retweet + retweet_user_summary_df.quote\n", "retweet_user_summary_df.sort_values(['retweet_count'], ascending=False).head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists retweeting other accounts, who is retweeted the most?\n", "This is based on screen name, which could have changed during the collection period. However, for the users at the top of this list, that seems unlikely." ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
retweet_countretweeting_count
realDonaldTrump6650807
thehill5424457
BraddJaffy3564554
maggieNYT3024530
business3000229
washingtonpost2638498
AP2480581
politico2335334
nytimes2268485
WSJ1949213
burgessev1836289
kylegriffin11803429
ZekeJMiller1723387
CNN1602366
GlennThrush1577451
Reuters1487265
jaketapper1459397
TheEconomist145886
StevenTDennis1403280
FoxNews1400258
seungminkim1393327
mkraju1359341
PhilipRucker1349365
markknoller1343341
MEPFuller1324286
\n", "
" ], "text/plain": [ " retweet_count retweeting_count\n", "realDonaldTrump 6650 807\n", "thehill 5424 457\n", "BraddJaffy 3564 554\n", "maggieNYT 3024 530\n", "business 3000 229\n", "washingtonpost 2638 498\n", "AP 2480 581\n", "politico 2335 334\n", "nytimes 2268 485\n", "WSJ 1949 213\n", "burgessev 1836 289\n", "kylegriffin1 1803 429\n", "ZekeJMiller 1723 387\n", "CNN 1602 366\n", "GlennThrush 1577 451\n", "Reuters 1487 265\n", "jaketapper 1459 397\n", "TheEconomist 1458 86\n", "StevenTDennis 1403 280\n", "FoxNews 1400 258\n", "seungminkim 1393 327\n", "mkraju 1359 341\n", "PhilipRucker 1349 365\n", "markknoller 1343 341\n", "MEPFuller 1324 286" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Retweet count\n", "retweet_count_screen_name_df = pd.DataFrame(retweet_df.retweet_screen_name.value_counts().rename('retweet_count'))\n", "\n", "# Count of retweeting users\n", "retweet_user_id_per_user_screen_name_df = retweet_df[['retweet_screen_name', 'user_id']].drop_duplicates()\n", "retweeting_count_screen_name_df = pd.DataFrame(retweet_user_id_per_user_screen_name_df.groupby('retweet_screen_name').size(), columns=['retweeting_count'])\n", "retweeting_count_screen_name_df.index.name = 'screen_name'\n", "\n", "all_retweeted_df = retweet_count_screen_name_df.join(retweeting_count_screen_name_df)\n", "all_retweeted_df.to_csv('output/all_retweeted_by_journalists.csv')\n", "all_retweeted_df.head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Journalists retweeting other journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists retweeting other journalists, who is retweeted the most?" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countretweet_countretweeting_count
user_id
407013776burgessevEverett, John B.PoliticoM310101,836.00289.00
21316253ZekeJMillerMiller, Zeke J.Time MagazineM1985171,723.00387.00
19107878GlennThrushThrush, Glenn H.New York TimesM3081811,577.00451.00
14529929jaketapperTapper, JakeCNNM13056801,459.00397.00
46557945StevenTDennisDennis, Steven T.Bloomberg NewsM557621,403.00280.00
19186003seungminkimKim, Seung MinPoliticoF339801,393.00327.00
39155029mkrajuRaju, Manu K.CNNM883661,359.00341.00
31127446markknollerKnoller, MarkCBS NewsM3014741,343.00341.00
398088661MEPFullerFuller, Matt E.Huffington PostM779191,324.00286.00
13524182daveweigelWeigel, DavidWashington PostM3323441,221.00306.00
14007532frankthorpThorp, FrankNBC NewsM397981,207.00334.00
19847765sahilkapurKapur, SahilBloomberg NewsM690861,186.00296.00
16187637ChadPergramPergram, ChadFox NewsM593051,177.00297.00
104914594Phil_MattinglyMattingly, PhilCNNM401191,120.00314.00
16006592BenjySarlinSarlin, BenjaminNBC NewsM780751,039.00215.00
259395895JohnJHarwoodHarwood, JohnCNBCM1490401,011.00277.00
21252618JakeShermanSherman, Jacob S.PoliticoM81762943.00281.00
33653195ericawernerWerner, EricaAssociated PressF14049939.00281.00
18678924jmartNYTMartin, JonathanNew York TimesM197322916.00247.00
12354832kasieHunt, KasieNBC NewsF187357909.00388.00
70511174Hadas_GoldGold, HadasPoliticoF45221849.00306.00
22771961AcostaAcosta, JimCNNM350650829.00315.00
104299137DavidMDruckerDrucker, DavidWashington ExaminerM35033770.00193.00
593813785DonnaYoungDCYoung, DonnaS&P Global Market IntelligenceF5894708.0013.00
118130765dylanlscottScott, Dylan L.Stat NewsM20122705.00155.00
\n", "
" ], "text/plain": [ " screen_name name organization \\\n", "user_id \n", "407013776 burgessev Everett, John B. Politico \n", "21316253 ZekeJMiller Miller, Zeke J. Time Magazine \n", "19107878 GlennThrush Thrush, Glenn H. New York Times \n", "14529929 jaketapper Tapper, Jake CNN \n", "46557945 StevenTDennis Dennis, Steven T. Bloomberg News \n", "19186003 seungminkim Kim, Seung Min Politico \n", "39155029 mkraju Raju, Manu K. CNN \n", "31127446 markknoller Knoller, Mark CBS News \n", "398088661 MEPFuller Fuller, Matt E. Huffington Post \n", "13524182 daveweigel Weigel, David Washington Post \n", "14007532 frankthorp Thorp, Frank NBC News \n", "19847765 sahilkapur Kapur, Sahil Bloomberg News \n", "16187637 ChadPergram Pergram, Chad Fox News \n", "104914594 Phil_Mattingly Mattingly, Phil CNN \n", "16006592 BenjySarlin Sarlin, Benjamin NBC News \n", "259395895 JohnJHarwood Harwood, John CNBC \n", "21252618 JakeSherman Sherman, Jacob S. Politico \n", "33653195 ericawerner Werner, Erica Associated Press \n", "18678924 jmartNYT Martin, Jonathan New York Times \n", "12354832 kasie Hunt, Kasie NBC News \n", "70511174 Hadas_Gold Gold, Hadas Politico \n", "22771961 Acosta Acosta, Jim CNN \n", "104299137 DavidMDrucker Drucker, David Washington Examiner \n", "593813785 DonnaYoungDC Young, Donna S&P Global Market Intelligence \n", "118130765 dylanlscott Scott, Dylan L. 
Stat News \n", "\n", " gender followers_count retweet_count retweeting_count \n", "user_id \n", "407013776 M 31010 1,836.00 289.00 \n", "21316253 M 198517 1,723.00 387.00 \n", "19107878 M 308181 1,577.00 451.00 \n", "14529929 M 1305680 1,459.00 397.00 \n", "46557945 M 55762 1,403.00 280.00 \n", "19186003 F 33980 1,393.00 327.00 \n", "39155029 M 88366 1,359.00 341.00 \n", "31127446 M 301474 1,343.00 341.00 \n", "398088661 M 77919 1,324.00 286.00 \n", "13524182 M 332344 1,221.00 306.00 \n", "14007532 M 39798 1,207.00 334.00 \n", "19847765 M 69086 1,186.00 296.00 \n", "16187637 M 59305 1,177.00 297.00 \n", "104914594 M 40119 1,120.00 314.00 \n", "16006592 M 78075 1,039.00 215.00 \n", "259395895 M 149040 1,011.00 277.00 \n", "21252618 M 81762 943.00 281.00 \n", "33653195 F 14049 939.00 281.00 \n", "18678924 M 197322 916.00 247.00 \n", "12354832 F 187357 909.00 388.00 \n", "70511174 F 45221 849.00 306.00 \n", "22771961 M 350650 829.00 315.00 \n", "104299137 M 35033 770.00 193.00 \n", "593813785 F 5894 708.00 13.00 \n", "118130765 M 20122 705.00 155.00 " ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_retweet_summary_df = journalist_retweet_summary(journalists_retweet_df)\n", "journalists_retweet_summary_df.to_csv('output/journalists_retweeted_by_journalists.csv')\n", "journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists retweeting other journalists, how many of the retweets are of males / females?" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentageavg_retweets
index
M8063468.9%62.07
F3641431.1%36.67
\n", "
" ], "text/plain": [ " count percentage avg_retweets\n", "index \n", "M 80634 68.9% 62.07\n", "F 36414 31.1% 36.67" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalist_retweet_gender_summary(journalists_retweet_df)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### On average, how many times are journalists retweeted by other journalists?" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
retweet_count
count2,292.00
mean51.07
std149.06
min0.00
25%0.00
50%6.00
75%33.00
max1,836.00
\n", "
" ], "text/plain": [ " retweet_count\n", "count 2,292.00\n", "mean 51.07\n", "std 149.06\n", "min 0.00\n", "25% 0.00\n", "50% 6.00\n", "75% 33.00\n", "max 1,836.00" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_retweet_summary_df[['retweet_count']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Journalists retweeting female journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists retweeting female journalists, who is retweeted the most?" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countretweet_countretweeting_count
user_id
19186003seungminkimKim, Seung MinPoliticoF339801,393.00327.00
33653195ericawernerWerner, EricaAssociated PressF14049939.00281.00
12354832kasieHunt, KasieNBC NewsF187357909.00388.00
70511174Hadas_GoldGold, HadasPoliticoF45221849.00306.00
593813785DonnaYoungDCYoung, DonnaS&P Global Market IntelligenceF5894708.0013.00
167024520rachaelmbadeBade, Rachel M.PoliticoF30164614.00161.00
33919343AshleyRParkerParker, AshleyWashington PostF122382539.00268.00
139738464mj_leeLee, MJCNNF31940518.00189.00
16018516jenhabHaberkorn, Jennifer A.PoliticoF20028474.00136.00
18825339CahnEmilyCahn, EmilyMicF16980444.00118.00
45399148jenepsEpstein, JenniferBloomberg NewsF61242443.00189.00
705706292rebeccaballhausBallhaus, RebeccaWall Street Journal / Dow JonesF24638409.00154.00
19734832sarahkliffKliff, Sarah L.Vox MediaF100090392.00136.00
163995093AlexNBCNewsMoe, AlexandraNBC NewsF21689388.00134.00
237477771juliehdavisDavis, JulieNew York TimesF49821375.00194.00
16149614jrovnerRovner, JulieKaiser Health NewsF21844351.00137.00
116341480RosieGrayGray, RosieThe AtlanticF96935345.00125.00
28181835jpaceDCPace, JulieAssociated PressF46017328.00132.00
52392666ZoeTillmanTillman, ZoeBuzzFeedF15246312.0070.00
906734342KimberlyRobinsnRobinson, Kimberly S.Bloomberg BNAF7170308.0038.00
188857501alexis_levinsonLevinson, Alexis R.BuzzFeedF25375288.00111.00
56552341LACaldwellDCCaldwell, Leigh AnnNBC NewsF8464282.0098.00
151444950DaviSusanDavis, SusanNational Public RadioF27297270.00150.00
360080772FoxReportsFox, LaurenCNNF7282269.00116.00
313545488LauraLitvanLitvan, LauraBloomberg NewsF4468269.00115.00
\n", "
" ], "text/plain": [ " screen_name name \\\n", "user_id \n", "19186003 seungminkim Kim, Seung Min \n", "33653195 ericawerner Werner, Erica \n", "12354832 kasie Hunt, Kasie \n", "70511174 Hadas_Gold Gold, Hadas \n", "593813785 DonnaYoungDC Young, Donna \n", "167024520 rachaelmbade Bade, Rachel M. \n", "33919343 AshleyRParker Parker, Ashley \n", "139738464 mj_lee Lee, MJ \n", "16018516 jenhab Haberkorn, Jennifer A. \n", "18825339 CahnEmily Cahn, Emily \n", "45399148 jeneps Epstein, Jennifer \n", "705706292 rebeccaballhaus Ballhaus, Rebecca \n", "19734832 sarahkliff Kliff, Sarah L. \n", "163995093 AlexNBCNews Moe, Alexandra \n", "237477771 juliehdavis Davis, Julie \n", "16149614 jrovner Rovner, Julie \n", "116341480 RosieGray Gray, Rosie \n", "28181835 jpaceDC Pace, Julie \n", "52392666 ZoeTillman Tillman, Zoe \n", "906734342 KimberlyRobinsn Robinson, Kimberly S. \n", "188857501 alexis_levinson Levinson, Alexis R. \n", "56552341 LACaldwellDC Caldwell, Leigh Ann \n", "151444950 DaviSusan Davis, Susan \n", "360080772 FoxReports Fox, Lauren \n", "313545488 LauraLitvan Litvan, Laura \n", "\n", " organization gender followers_count \\\n", "user_id \n", "19186003 Politico F 33980 \n", "33653195 Associated Press F 14049 \n", "12354832 NBC News F 187357 \n", "70511174 Politico F 45221 \n", "593813785 S&P Global Market Intelligence F 5894 \n", "167024520 Politico F 30164 \n", "33919343 Washington Post F 122382 \n", "139738464 CNN F 31940 \n", "16018516 Politico F 20028 \n", "18825339 Mic F 16980 \n", "45399148 Bloomberg News F 61242 \n", "705706292 Wall Street Journal / Dow Jones F 24638 \n", "19734832 Vox Media F 100090 \n", "163995093 NBC News F 21689 \n", "237477771 New York Times F 49821 \n", "16149614 Kaiser Health News F 21844 \n", "116341480 The Atlantic F 96935 \n", "28181835 Associated Press F 46017 \n", "52392666 BuzzFeed F 15246 \n", "906734342 Bloomberg BNA F 7170 \n", "188857501 BuzzFeed F 25375 \n", "56552341 NBC News F 8464 \n", "151444950 National Public Radio 
F 27297 \n", "360080772 CNN F 7282 \n", "313545488 Bloomberg News F 4468 \n", "\n", " retweet_count retweeting_count \n", "user_id \n", "19186003 1,393.00 327.00 \n", "33653195 939.00 281.00 \n", "12354832 909.00 388.00 \n", "70511174 849.00 306.00 \n", "593813785 708.00 13.00 \n", "167024520 614.00 161.00 \n", "33919343 539.00 268.00 \n", "139738464 518.00 189.00 \n", "16018516 474.00 136.00 \n", "18825339 444.00 118.00 \n", "45399148 443.00 189.00 \n", "705706292 409.00 154.00 \n", "19734832 392.00 136.00 \n", "163995093 388.00 134.00 \n", "237477771 375.00 194.00 \n", "16149614 351.00 137.00 \n", "116341480 345.00 125.00 \n", "28181835 328.00 132.00 \n", "52392666 312.00 70.00 \n", "906734342 308.00 38.00 \n", "188857501 288.00 111.00 \n", "56552341 282.00 98.00 \n", "151444950 270.00 150.00 \n", "360080772 269.00 116.00 \n", "313545488 269.00 115.00 " ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "female_journalists_retweet_summary_df = journalists_retweet_summary_df[journalists_retweet_summary_df.gender == 'F']\n", "female_journalists_retweet_summary_df.to_csv('output/female_journalists_retweeted_by_journalists.csv')\n", "female_journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### On average, how many times are female journalists retweeted by other journalists?" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
retweet_count
count993.00
mean36.67
std97.34
min0.00
25%0.00
50%5.00
75%25.00
max1,393.00
\n", "
" ], "text/plain": [ " retweet_count\n", "count 993.00\n", "mean 36.67\n", "std 97.34\n", "min 0.00\n", "25% 0.00\n", "50% 5.00\n", "75% 25.00\n", "max 1,393.00" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "female_journalists_retweet_summary_df[['retweet_count']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Journalists retweeting male journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of journalists retweeting male journalists, who is retweeted the most?" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countretweet_countretweeting_count
user_id
407013776burgessevEverett, John B.PoliticoM310101,836.00289.00
21316253ZekeJMillerMiller, Zeke J.Time MagazineM1985171,723.00387.00
19107878GlennThrushThrush, Glenn H.New York TimesM3081811,577.00451.00
14529929jaketapperTapper, JakeCNNM13056801,459.00397.00
46557945StevenTDennisDennis, Steven T.Bloomberg NewsM557621,403.00280.00
39155029mkrajuRaju, Manu K.CNNM883661,359.00341.00
31127446markknollerKnoller, MarkCBS NewsM3014741,343.00341.00
398088661MEPFullerFuller, Matt E.Huffington PostM779191,324.00286.00
13524182daveweigelWeigel, DavidWashington PostM3323441,221.00306.00
14007532frankthorpThorp, FrankNBC NewsM397981,207.00334.00
19847765sahilkapurKapur, SahilBloomberg NewsM690861,186.00296.00
16187637ChadPergramPergram, ChadFox NewsM593051,177.00297.00
104914594Phil_MattinglyMattingly, PhilCNNM401191,120.00314.00
16006592BenjySarlinSarlin, BenjaminNBC NewsM780751,039.00215.00
259395895JohnJHarwoodHarwood, JohnCNBCM1490401,011.00277.00
21252618JakeShermanSherman, Jacob S.PoliticoM81762943.00281.00
18678924jmartNYTMartin, JonathanNew York TimesM197322916.00247.00
22771961AcostaAcosta, JimCNNM350650829.00315.00
104299137DavidMDruckerDrucker, DavidWashington ExaminerM35033770.00193.00
118130765dylanlscottScott, Dylan L.Stat NewsM20122705.00155.00
3817401ericgellerGeller, EricPoliticoM58173704.00225.00
217550862BresPoliticoBresnahan, JohnPoliticoM40562699.00223.00
22129280jimsciuttoSciutto, JamesCNNM172012688.00242.00
61734492FahrentholdFahrenthold, DavidWashington PostM451778654.00284.00
15463671samsteinStein, SamHuffington PostM313211642.00229.00
\n", "
" ], "text/plain": [ " screen_name name organization gender \\\n", "user_id \n", "407013776 burgessev Everett, John B. Politico M \n", "21316253 ZekeJMiller Miller, Zeke J. Time Magazine M \n", "19107878 GlennThrush Thrush, Glenn H. New York Times M \n", "14529929 jaketapper Tapper, Jake CNN M \n", "46557945 StevenTDennis Dennis, Steven T. Bloomberg News M \n", "39155029 mkraju Raju, Manu K. CNN M \n", "31127446 markknoller Knoller, Mark CBS News M \n", "398088661 MEPFuller Fuller, Matt E. Huffington Post M \n", "13524182 daveweigel Weigel, David Washington Post M \n", "14007532 frankthorp Thorp, Frank NBC News M \n", "19847765 sahilkapur Kapur, Sahil Bloomberg News M \n", "16187637 ChadPergram Pergram, Chad Fox News M \n", "104914594 Phil_Mattingly Mattingly, Phil CNN M \n", "16006592 BenjySarlin Sarlin, Benjamin NBC News M \n", "259395895 JohnJHarwood Harwood, John CNBC M \n", "21252618 JakeSherman Sherman, Jacob S. Politico M \n", "18678924 jmartNYT Martin, Jonathan New York Times M \n", "22771961 Acosta Acosta, Jim CNN M \n", "104299137 DavidMDrucker Drucker, David Washington Examiner M \n", "118130765 dylanlscott Scott, Dylan L. 
Stat News M \n", "3817401 ericgeller Geller, Eric Politico M \n", "217550862 BresPolitico Bresnahan, John Politico M \n", "22129280 jimsciutto Sciutto, James CNN M \n", "61734492 Fahrenthold Fahrenthold, David Washington Post M \n", "15463671 samstein Stein, Sam Huffington Post M \n", "\n", " followers_count retweet_count retweeting_count \n", "user_id \n", "407013776 31010 1,836.00 289.00 \n", "21316253 198517 1,723.00 387.00 \n", "19107878 308181 1,577.00 451.00 \n", "14529929 1305680 1,459.00 397.00 \n", "46557945 55762 1,403.00 280.00 \n", "39155029 88366 1,359.00 341.00 \n", "31127446 301474 1,343.00 341.00 \n", "398088661 77919 1,324.00 286.00 \n", "13524182 332344 1,221.00 306.00 \n", "14007532 39798 1,207.00 334.00 \n", "19847765 69086 1,186.00 296.00 \n", "16187637 59305 1,177.00 297.00 \n", "104914594 40119 1,120.00 314.00 \n", "16006592 78075 1,039.00 215.00 \n", "259395895 149040 1,011.00 277.00 \n", "21252618 81762 943.00 281.00 \n", "18678924 197322 916.00 247.00 \n", "22771961 350650 829.00 315.00 \n", "104299137 35033 770.00 193.00 \n", "118130765 20122 705.00 155.00 \n", "3817401 58173 704.00 225.00 \n", "217550862 40562 699.00 223.00 \n", "22129280 172012 688.00 242.00 \n", "61734492 451778 654.00 284.00 \n", "15463671 313211 642.00 229.00 " ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "male_journalists_retweet_summary_df = journalists_retweet_summary_df[journalists_retweet_summary_df.gender == 'M']\n", "male_journalists_retweet_summary_df.to_csv('output/male_journalists_retweeted_by_journalists.csv')\n", "male_journalists_retweet_summary_df[journalist_retweet_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### On average, how many times are male journalists retweeted by other journalists?" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
retweet_count
count1,299.00
mean62.07
std178.04
min0.00
25%1.00
50%8.00
75%39.50
max1,836.00
\n", "
" ], "text/plain": [ " retweet_count\n", "count 1,299.00\n", "mean 62.07\n", "std 178.04\n", "min 0.00\n", "25% 1.00\n", "50% 8.00\n", "75% 39.50\n", "max 1,836.00" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "male_journalists_retweet_summary_df[['retweet_count']].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Female journalists retweeting other journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of female journalists retweeting other journalists, who is retweeted the most?" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countretweet_countretweeting_count
user_id
407013776burgessevEverett, John B.PoliticoM31010748.00122.00
593813785DonnaYoungDCYoung, DonnaS&P Global Market IntelligenceF5894704.009.00
19186003seungminkimKim, Seung MinPoliticoF33980572.00142.00
31127446markknollerKnoller, MarkCBS NewsM301474549.00140.00
21316253ZekeJMillerMiller, Zeke J.Time MagazineM198517516.00149.00
46557945StevenTDennisDennis, Steven T.Bloomberg NewsM55762503.0097.00
14007532frankthorpThorp, FrankNBC NewsM39798470.00140.00
19107878GlennThrushThrush, Glenn H.New York TimesM308181463.00165.00
33653195ericawernerWerner, EricaAssociated PressF14049452.00119.00
398088661MEPFullerFuller, Matt E.Huffington PostM77919447.00116.00
39155029mkrajuRaju, Manu K.CNNM88366403.00132.00
14529929jaketapperTapper, JakeCNNM1305680388.00158.00
104914594Phil_MattinglyMattingly, PhilCNNM40119372.00129.00
118130765dylanlscottScott, Dylan L.Stat NewsM20122367.0067.00
16187637ChadPergramPergram, ChadFox NewsM59305365.00122.00
12354832kasieHunt, KasieNBC NewsF187357344.00164.00
19847765sahilkapurKapur, SahilBloomberg NewsM69086338.00103.00
167024520rachaelmbadeBade, Rachel M.PoliticoF30164303.0059.00
21252618JakeShermanSherman, Jacob S.PoliticoM81762302.00106.00
22891564chrisgeidnerGeidner, ChrisBuzzFeedM83316287.0061.00
70511174Hadas_GoldGold, HadasPoliticoF45221279.00111.00
22771961AcostaAcosta, JimCNNM350650265.00119.00
139738464mj_leeLee, MJCNNF31940259.0079.00
217550862BresPoliticoBresnahan, JohnPoliticoM40562256.0082.00
61734492FahrentholdFahrenthold, DavidWashington PostM451778253.00115.00
\n", "
" ], "text/plain": [ " screen_name name organization \\\n", "user_id \n", "407013776 burgessev Everett, John B. Politico \n", "593813785 DonnaYoungDC Young, Donna S&P Global Market Intelligence \n", "19186003 seungminkim Kim, Seung Min Politico \n", "31127446 markknoller Knoller, Mark CBS News \n", "21316253 ZekeJMiller Miller, Zeke J. Time Magazine \n", "46557945 StevenTDennis Dennis, Steven T. Bloomberg News \n", "14007532 frankthorp Thorp, Frank NBC News \n", "19107878 GlennThrush Thrush, Glenn H. New York Times \n", "33653195 ericawerner Werner, Erica Associated Press \n", "398088661 MEPFuller Fuller, Matt E. Huffington Post \n", "39155029 mkraju Raju, Manu K. CNN \n", "14529929 jaketapper Tapper, Jake CNN \n", "104914594 Phil_Mattingly Mattingly, Phil CNN \n", "118130765 dylanlscott Scott, Dylan L. Stat News \n", "16187637 ChadPergram Pergram, Chad Fox News \n", "12354832 kasie Hunt, Kasie NBC News \n", "19847765 sahilkapur Kapur, Sahil Bloomberg News \n", "167024520 rachaelmbade Bade, Rachel M. Politico \n", "21252618 JakeSherman Sherman, Jacob S. 
Politico \n", "22891564 chrisgeidner Geidner, Chris BuzzFeed \n", "70511174 Hadas_Gold Gold, Hadas Politico \n", "22771961 Acosta Acosta, Jim CNN \n", "139738464 mj_lee Lee, MJ CNN \n", "217550862 BresPolitico Bresnahan, John Politico \n", "61734492 Fahrenthold Fahrenthold, David Washington Post \n", "\n", " gender followers_count retweet_count retweeting_count \n", "user_id \n", "407013776 M 31010 748.00 122.00 \n", "593813785 F 5894 704.00 9.00 \n", "19186003 F 33980 572.00 142.00 \n", "31127446 M 301474 549.00 140.00 \n", "21316253 M 198517 516.00 149.00 \n", "46557945 M 55762 503.00 97.00 \n", "14007532 M 39798 470.00 140.00 \n", "19107878 M 308181 463.00 165.00 \n", "33653195 F 14049 452.00 119.00 \n", "398088661 M 77919 447.00 116.00 \n", "39155029 M 88366 403.00 132.00 \n", "14529929 M 1305680 388.00 158.00 \n", "104914594 M 40119 372.00 129.00 \n", "118130765 M 20122 367.00 67.00 \n", "16187637 M 59305 365.00 122.00 \n", "12354832 F 187357 344.00 164.00 \n", "19847765 M 69086 338.00 103.00 \n", "167024520 F 30164 303.00 59.00 \n", "21252618 M 81762 302.00 106.00 \n", "22891564 M 83316 287.00 61.00 \n", "70511174 F 45221 279.00 111.00 \n", "22771961 M 350650 265.00 119.00 \n", "139738464 F 31940 259.00 79.00 \n", "217550862 M 40562 256.00 82.00 \n", "61734492 M 451778 253.00 115.00 " ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_retweeted_by_female_summary_df = journalist_retweet_summary(journalists_retweet_df[journalists_retweet_df.gender == 'F'])\n", "journalists_retweeted_by_female_summary_df.to_csv('output/journalists_retweeted_by_female_journalists.csv')\n", "journalists_retweeted_by_female_summary_df[journalist_retweet_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of female journalists retweeting other journalists, how many of the retweets are of males / females?\n", "The average is the number of retweets each male / 
female journalist receives from female journalists." ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentageavg_retweets
index
M2541059.6%19.56
F1722840.4%17.35
\n", "
" ], "text/plain": [ " count percentage avg_retweets\n", "index \n", "M 25410 59.6% 19.56\n", "F 17228 40.4% 17.35" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalist_retweet_gender_summary(journalists_retweet_df[journalists_retweet_df.gender == 'F'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### On average, how many times do female journalists retweet male / female / all journalists?\n", "That is, the number of retweets made by each female journalist, split by the gender of the journalist being retweeted." ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FMall
count993.00993.00993.00
mean17.3525.5942.94
std45.3474.55113.79
min0.000.000.00
25%0.001.002.00
50%4.006.0010.00
75%16.0022.0039.00
max857.001,779.002,385.00
\n", "
" ], "text/plain": [ " F M all\n", "count 993.00 993.00 993.00\n", "mean 17.35 25.59 42.94\n", "std 45.34 74.55 113.79\n", "min 0.00 0.00 0.00\n", "25% 0.00 1.00 2.00\n", "50% 4.00 6.00 10.00\n", "75% 16.00 22.00 39.00\n", "max 857.00 1,779.00 2,385.00" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "female_journalists_retweet_df = journalists_retweet_df[journalists_retweet_df.gender == 'F']\n", "female_journalists_retweet_by_gender_df = pd.merge(user_summary_df[user_summary_df.gender == 'F'], female_journalists_retweet_df.groupby(['user_id', 'retweet_gender']).size().unstack(), how='left', left_index=True, right_index=True)[['F', 'M']]\n", "female_journalists_retweet_by_gender_df.fillna(0, inplace=True)\n", "female_journalists_retweet_by_gender_df['all'] = female_journalists_retweet_by_gender_df.F + female_journalists_retweet_by_gender_df.M\n", "female_journalists_retweet_by_gender_df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Male journalists retweeting other journalists" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of male journalists retweeting other journalists, who is retweeted the most?" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
screen_namenameorganizationgenderfollowers_countretweet_countretweeting_count
user_id
21316253ZekeJMillerMiller, Zeke J.Time MagazineM1985171,207.00238.00
19107878GlennThrushThrush, Glenn H.New York TimesM3081811,114.00286.00
407013776burgessevEverett, John B.PoliticoM310101,088.00167.00
14529929jaketapperTapper, JakeCNNM13056801,071.00239.00
13524182daveweigelWeigel, DavidWashington PostM332344975.00209.00
39155029mkrajuRaju, Manu K.CNNM88366956.00209.00
46557945StevenTDennisDennis, Steven T.Bloomberg NewsM55762900.00183.00
398088661MEPFullerFuller, Matt E.Huffington PostM77919877.00170.00
19847765sahilkapurKapur, SahilBloomberg NewsM69086848.00193.00
16006592BenjySarlinSarlin, BenjaminNBC NewsM78075828.00141.00
19186003seungminkimKim, Seung MinPoliticoF33980821.00185.00
16187637ChadPergramPergram, ChadFox NewsM59305812.00175.00
31127446markknollerKnoller, MarkCBS NewsM301474794.00201.00
259395895JohnJHarwoodHarwood, JohnCNBCM149040777.00196.00
104914594Phil_MattinglyMattingly, PhilCNNM40119748.00185.00
14007532frankthorpThorp, FrankNBC NewsM39798737.00194.00
18678924jmartNYTMartin, JonathanNew York TimesM197322726.00167.00
21252618JakeShermanSherman, Jacob S.PoliticoM81762641.00175.00
104299137DavidMDruckerDrucker, DavidWashington ExaminerM35033583.00127.00
70511174Hadas_GoldGold, HadasPoliticoF45221570.00195.00
12354832kasieHunt, KasieNBC NewsF187357565.00224.00
22771961AcostaAcosta, JimCNNM350650564.00196.00
19580890LeeCampCamp, LeeRTTV AmericaM67601560.006.00
3817401ericgellerGeller, EricPoliticoM58173524.00149.00
22129280jimsciuttoSciutto, JamesCNNM172012507.00151.00
\n", "
" ], "text/plain": [ " screen_name name organization gender \\\n", "user_id \n", "21316253 ZekeJMiller Miller, Zeke J. Time Magazine M \n", "19107878 GlennThrush Thrush, Glenn H. New York Times M \n", "407013776 burgessev Everett, John B. Politico M \n", "14529929 jaketapper Tapper, Jake CNN M \n", "13524182 daveweigel Weigel, David Washington Post M \n", "39155029 mkraju Raju, Manu K. CNN M \n", "46557945 StevenTDennis Dennis, Steven T. Bloomberg News M \n", "398088661 MEPFuller Fuller, Matt E. Huffington Post M \n", "19847765 sahilkapur Kapur, Sahil Bloomberg News M \n", "16006592 BenjySarlin Sarlin, Benjamin NBC News M \n", "19186003 seungminkim Kim, Seung Min Politico F \n", "16187637 ChadPergram Pergram, Chad Fox News M \n", "31127446 markknoller Knoller, Mark CBS News M \n", "259395895 JohnJHarwood Harwood, John CNBC M \n", "104914594 Phil_Mattingly Mattingly, Phil CNN M \n", "14007532 frankthorp Thorp, Frank NBC News M \n", "18678924 jmartNYT Martin, Jonathan New York Times M \n", "21252618 JakeSherman Sherman, Jacob S. 
Politico M \n", "104299137 DavidMDrucker Drucker, David Washington Examiner M \n", "70511174 Hadas_Gold Gold, Hadas Politico F \n", "12354832 kasie Hunt, Kasie NBC News F \n", "22771961 Acosta Acosta, Jim CNN M \n", "19580890 LeeCamp Camp, Lee RTTV America M \n", "3817401 ericgeller Geller, Eric Politico M \n", "22129280 jimsciutto Sciutto, James CNN M \n", "\n", " followers_count retweet_count retweeting_count \n", "user_id \n", "21316253 198517 1,207.00 238.00 \n", "19107878 308181 1,114.00 286.00 \n", "407013776 31010 1,088.00 167.00 \n", "14529929 1305680 1,071.00 239.00 \n", "13524182 332344 975.00 209.00 \n", "39155029 88366 956.00 209.00 \n", "46557945 55762 900.00 183.00 \n", "398088661 77919 877.00 170.00 \n", "19847765 69086 848.00 193.00 \n", "16006592 78075 828.00 141.00 \n", "19186003 33980 821.00 185.00 \n", "16187637 59305 812.00 175.00 \n", "31127446 301474 794.00 201.00 \n", "259395895 149040 777.00 196.00 \n", "104914594 40119 748.00 185.00 \n", "14007532 39798 737.00 194.00 \n", "18678924 197322 726.00 167.00 \n", "21252618 81762 641.00 175.00 \n", "104299137 35033 583.00 127.00 \n", "70511174 45221 570.00 195.00 \n", "12354832 187357 565.00 224.00 \n", "22771961 350650 564.00 196.00 \n", "19580890 67601 560.00 6.00 \n", "3817401 58173 524.00 149.00 \n", "22129280 172012 507.00 151.00 " ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalists_retweeted_by_male_summary_df = journalist_retweet_summary(journalists_retweet_df[journalists_retweet_df.gender == 'M'])\n", "journalists_retweeted_by_male_summary_df.to_csv('output/journalists_retweeted_by_male_journalists.csv')\n", "journalists_retweeted_by_male_summary_df[journalist_retweet_summary_fields].head(25)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Of male journalists retweeting other journalists, how many are male / female?\n", "The average is the mean number of retweets that each male / 
female journalist receives from male journalists." ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpercentageavg_retweets
index
M5522474.2%42.51
F1918625.8%19.32
\n", "
" ], "text/plain": [ " count percentage avg_retweets\n", "index \n", "M 55224 74.2% 42.51\n", "F 19186 25.8% 19.32" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "journalist_retweet_gender_summary(journalists_retweet_df[journalists_retweet_df.gender == 'M'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### On average, how many times do male journalists retweet male / female / all journalists?\n", "That is, retweets per male journalist. " ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FMall
count1,299.001,299.001,299.00
mean14.7742.5157.28
std33.50106.87136.92
min0.000.000.00
25%0.001.001.00
50%3.007.0011.00
75%14.0035.0050.00
max442.001,414.001,766.00
\n", "
" ], "text/plain": [ " F M all\n", "count 1,299.00 1,299.00 1,299.00\n", "mean 14.77 42.51 57.28\n", "std 33.50 106.87 136.92\n", "min 0.00 0.00 0.00\n", "25% 0.00 1.00 1.00\n", "50% 3.00 7.00 11.00\n", "75% 14.00 35.00 50.00\n", "max 442.00 1,414.00 1,766.00" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "male_journalists_retweet_df = journalists_retweet_df[journalists_retweet_df.gender == 'M']\n", "male_journalists_retweet_by_gender_df = pd.merge(user_summary_df[user_summary_df.gender == 'M'], male_journalists_retweet_df.groupby(['user_id', 'retweet_gender']).size().unstack(), how='left', left_index=True, right_index=True)[['F', 'M']]\n", "male_journalists_retweet_by_gender_df.fillna(0, inplace=True)\n", "male_journalists_retweet_by_gender_df['all'] = male_journalists_retweet_by_gender_df.F + male_journalists_retweet_by_gender_df.M\n", "male_journalists_retweet_by_gender_df.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" }, "toc": { "nav_menu": { "height": "512px", "width": "252px" }, "number_sections": true, "sideBar": true, "skip_h1_title": false, "toc_cell": true, "toc_position": { "height": "674px", "left": "0px", "right": "1254px", "top": "112px", "width": "282px" }, "toc_section_display": "block", "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }