Alessandro D. Gagliardi
# standard library:
import os
from pprint import pprint
# other modules:
import matplotlib.pyplot as plt
import pandas as pd
import twitter
import yaml
from pymongo import MongoClient
credentials = yaml.load(open(os.path.expanduser('~/api_cred.yml')))
auth = twitter.oauth.OAuth(credentials['ACCESS_TOKEN'],
credentials['ACCESS_TOKEN_SECRET'],
credentials['API_KEY'],
credentials['API_SECRET'])
twitter_api = twitter.Twitter(auth=auth)
# The Yahoo! Where On Earth ID for the entire world is 1.
# See https://dev.twitter.com/docs/api/1.1/get/trends/place and
# http://developer.yahoo.com/geo/geoplanet/
WORLD_WOE_ID = 1
US_WOE_ID = 23424977
# Prefix ID with the underscore for query string parameterization.
# Without the underscore, the twitter package appends the ID value
# to the URL itself as a special case keyword argument.
world_trends = twitter_api.trends.place(_id=WORLD_WOE_ID)
us_trends = twitter_api.trends.place(_id=US_WOE_ID)
pprint(world_trends)
[{u'as_of': u'2014-04-20T02:17:08Z', u'created_at': u'2014-04-20T02:12:36Z', u'locations': [{u'name': u'Worldwide', u'woeid': 1}], u'trends': [{u'name': u'#SiempreMePasaEstoDe', u'promoted_content': None, u'query': u'%23SiempreMePasaEstoDe', u'url': u'http://twitter.com/search?q=%23SiempreMePasaEstoDe'}, {u'name': u'#MahoganyRT', u'promoted_content': None, u'query': u'%23MahoganyRT', u'url': u'http://twitter.com/search?q=%23MahoganyRT'}, {u'name': u'#ClanessaNoAltasHoras', u'promoted_content': None, u'query': u'%23ClanessaNoAltasHoras', u'url': u'http://twitter.com/search?q=%23ClanessaNoAltasHoras'}, {u'name': u'Happy Easter Sunday', u'promoted_content': None, u'query': u'%22Happy+Easter+Sunday%22', u'url': u'http://twitter.com/search?q=%22Happy+Easter+Sunday%22'}, {u'name': u'#KeepUpIndonesia', u'promoted_content': None, u'query': u'%23KeepUpIndonesia', u'url': u'http://twitter.com/search?q=%23KeepUpIndonesia'}, {u'name': u'#EsDePocoHombre', u'promoted_content': None, u'query': u'%23EsDePocoHombre', u'url': u'http://twitter.com/search?q=%23EsDePocoHombre'}, {u'name': u'Mark Herberholz', u'promoted_content': None, u'query': u'%22Mark+Herberholz%22', u'url': u'http://twitter.com/search?q=%22Mark+Herberholz%22'}, {u'name': u'Jesus is Alive', u'promoted_content': None, u'query': u'%22Jesus+is+Alive%22', u'url': u'http://twitter.com/search?q=%22Jesus+is+Alive%22'}, {u'name': u'30-27 Werdum', u'promoted_content': None, u'query': u'%2230-27+Werdum%22', u'url': u'http://twitter.com/search?q=%2230-27+Werdum%22'}, {u'name': u'1458 Espectadores', u'promoted_content': None, u'query': u'%221458+Espectadores%22', u'url': u'http://twitter.com/search?q=%221458+Espectadores%22'}]}]
pprint(us_trends)
[{u'as_of': u'2014-04-20T02:17:09Z', u'created_at': u'2014-04-20T02:12:36Z', u'locations': [{u'name': u'United States', u'woeid': 23424977}], u'trends': [{u'name': u'#MahoganyRT', u'promoted_content': None, u'query': u'%23MahoganyRT', u'url': u'http://twitter.com/search?q=%23MahoganyRT'}, {u'name': u'#romeoplaymmmyeah', u'promoted_content': None, u'query': u'%23romeoplaymmmyeah', u'url': u'http://twitter.com/search?q=%23romeoplaymmmyeah'}, {u'name': u'#ADayLateAndADollarShort', u'promoted_content': None, u'query': u'%23ADayLateAndADollarShort', u'url': u'http://twitter.com/search?q=%23ADayLateAndADollarShort'}, {u'name': u'Westbrook and Durant', u'promoted_content': None, u'query': u'%22Westbrook+and+Durant%22', u'url': u'http://twitter.com/search?q=%22Westbrook+and+Durant%22'}, {u'name': u'#WerdumvsBrowne', u'promoted_content': None, u'query': u'%23WerdumvsBrowne', u'url': u'http://twitter.com/search?q=%23WerdumvsBrowne'}, {u'name': u'#TenCommandments', u'promoted_content': None, u'query': u'%23TenCommandments', u'url': u'http://twitter.com/search?q=%23TenCommandments'}, {u'name': u'Travis Browne', u'promoted_content': None, u'query': u'%22Travis+Browne%22', u'url': u'http://twitter.com/search?q=%22Travis+Browne%22'}, {u'name': u'Tomorrow is Easter', u'promoted_content': None, u'query': u'%22Tomorrow+is+Easter%22', u'url': u'http://twitter.com/search?q=%22Tomorrow+is+Easter%22'}, {u'name': u'The Devil Wears Prada', u'promoted_content': None, u'query': u'%22The+Devil+Wears+Prada%22', u'url': u'http://twitter.com/search?q=%22The+Devil+Wears+Prada%22'}, {u'name': u'Derek Fisher', u'promoted_content': None, u'query': u'%22Derek+Fisher%22', u'url': u'http://twitter.com/search?q=%22Derek+Fisher%22'}]}]
world_trends_set = set([trend['name'] for trends in world_trends
for trend in trends['trends']])
us_trends_set = set([trend['name'] for trends in us_trends
for trend in trends['trends']])
world_trends_set.intersection(us_trends_set)
{u'#MahoganyRT'}
q = world_trends[0]['trends'][0]['name']
count = 100
# See https://dev.twitter.com/docs/api/1.1/get/search/tweets
search_results = twitter_api.search.tweets(q=q, count=count)
statuses = search_results['statuses']
len(statuses)
100
search_results['search_metadata'].get('next_results')
u'?max_id=457704145387257855&q=%23SiempreMePasaEstoDe&count=100&include_entities=1'
if 'next_results' in search_results['search_metadata']:
next_results = search_results['search_metadata']['next_results']
kwargs = dict([ kv.split('=') for kv in next_results[1:].split("&") ])
search_results = twitter_api.search.tweets(**kwargs)
statuses += search_results['statuses']
len(statuses)
200
from the command line:
$ mkdir -p data/db
$ mongod --dbpath data/db
c = MongoClient()
twitter
:¶db = c.twitter
tweets
:¶statuses_ids = db.tweets.insert(statuses)
statuses_ids[:5]
[ObjectId('53532e26bcdefe1199f65732'), ObjectId('53532e26bcdefe1199f65733'), ObjectId('53532e26bcdefe1199f65734'), ObjectId('53532e26bcdefe1199f65735'), ObjectId('53532e26bcdefe1199f65736')]
ObjectId
s¶ObjectId is a 12-byte BSON type, constructed using:
In MongoDB, documents stored in a collection require a unique _id
field that acts as a primary key. Because ObjectIds are small, most likely unique, and fast to generate, MongoDB uses ObjectIds as the default value for the _id
field if the _id
field is not specified.
c.database_names()
[u'local', u'twitter']
db = c.twitter
db.collection_names()
[u'system.indexes', u'tweets', u'canada', u'us']
db.tweets.find_one()
{u'_id': ObjectId('53531654bcdefe0e658cbf07'), u'contributors': None, u'coordinates': None, u'created_at': u'Sun Apr 20 00:34:04 +0000 2014', u'entities': {u'hashtags': [{u'indices': [11, 22], u'text': u'NashTo2Mil'}, {u'indices': [23, 34], u'text': u'NashTo2Mil'}, {u'indices': [35, 46], u'text': u'Nash2ToMil'}, {u'indices': [47, 58], u'text': u'NashTo2Mil'}, {u'indices': [59, 70], u'text': u'NashTo2Mil'}, {u'indices': [71, 82], u'text': u'Nash2ToMil'}, {u'indices': [83, 94], u'text': u'NashTo2Mil'}, {u'indices': [95, 106], u'text': u'NashTo2Mil'}, {u'indices': [107, 118], u'text': u'Nash2ToMil'}], u'symbols': [], u'urls': [], u'user_mentions': [{u'id': 310072711, u'id_str': u'310072711', u'indices': [0, 10], u'name': u'Nash', u'screen_name': u'Nashgrier'}]}, u'favorite_count': 0, u'favorited': False, u'geo': None, u'id': 457678549789708288L, u'id_str': u'457678549789708288', u'in_reply_to_screen_name': u'Nashgrier', u'in_reply_to_status_id': None, u'in_reply_to_status_id_str': None, u'in_reply_to_user_id': 310072711, u'in_reply_to_user_id_str': u'310072711', u'lang': u'it', u'metadata': {u'iso_language_code': u'it', u'result_type': u'recent'}, u'place': None, u'retweet_count': 0, u'retweeted': False, u'source': u'<a href="https://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', u'text': u'@Nashgrier #NashTo2Mil #NashTo2Mil #Nash2ToMil #NashTo2Mil #NashTo2Mil #Nash2ToMil #NashTo2Mil #NashTo2Mil #Nash2ToMil x85', u'truncated': False, u'user': {u'contributors_enabled': False, u'created_at': u'Fri Aug 16 23:27:37 +0000 2013', u'default_profile': True, u'default_profile_image': False, u'description': u'one direction \u2665_\u2665', u'entities': {u'description': {u'urls': []}}, u'favourites_count': 30, u'follow_request_sent': False, u'followers_count': 61, u'following': False, u'friends_count': 86, u'geo_enabled': False, u'id': 1676879275, u'id_str': u'1676879275', u'is_translation_enabled': False, u'is_translator': False, u'lang': u'en', u'listed_count': 1, u'location': u'ireland', u'name': u'niamh', u'notifications': False, u'profile_background_color': u'C0DEED', u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png', u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png', u'profile_background_tile': False, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/1676879275/1393193679', u'profile_image_url': u'http://pbs.twimg.com/profile_images/437708680042405889/1Loj3lOe_normal.jpeg', u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/437708680042405889/1Loj3lOe_normal.jpeg', u'profile_link_color': u'0084B4', u'profile_sidebar_border_color': u'C0DEED', u'profile_sidebar_fill_color': u'DDEEF6', u'profile_text_color': u'333333', u'profile_use_background_image': True, u'protected': False, u'screen_name': u'radicalzniall', u'statuses_count': 377, u'time_zone': u'Atlantic Time (Canada)', u'url': None, u'utc_offset': -10800, u'verified': False}}
Now that we have our data in MongoDB, we can use some of it's search functionality. For example:
popular_tweets = db.tweets.find({'retweet_count': {"$gte": 3}})
popular_tweets.count()
52
pd.DataFrame(db.tweets.find(fields=['created_at', 'retweet_count', 'favorite_count']))
--------------------------------------------------------------------------- PandasError Traceback (most recent call last) <ipython-input-17-87a1cfed5405> in <module>() ----> 1 pd.DataFrame(db.tweets.find(fields=['created_at', 'retweet_count', 'favorite_count'])) /Users/alessandro.gagliardi/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy) 271 copy=False) 272 else: --> 273 raise PandasError('DataFrame constructor not properly called!') 274 275 NDFrame.__init__(self, mgr, fastpath=True) PandasError: DataFrame constructor not properly called!
retweet_favorites = pd.DataFrame(list(db.tweets.find(fields=['created_at','retweet_count','favorite_count'])))
retweet_favorites.head()
_id | created_at | favorite_count | retweet_count | |
---|---|---|---|---|
0 | 53531654bcdefe0e658cbf07 | Sun Apr 20 00:34:04 +0000 2014 | 0 | 0 |
1 | 53531654bcdefe0e658cbf08 | Sun Apr 20 00:34:04 +0000 2014 | 0 | 22327 |
2 | 53531654bcdefe0e658cbf09 | Sun Apr 20 00:34:04 +0000 2014 | 0 | 0 |
3 | 53531654bcdefe0e658cbf0a | Sun Apr 20 00:34:04 +0000 2014 | 0 | 0 |
4 | 53531654bcdefe0e658cbf0b | Sun Apr 20 00:34:04 +0000 2014 | 0 | 0 |
5 rows × 4 columns
.describe()
is a useful method to get the gist of our data.
retweet_favorites.describe()
favorite_count | retweet_count | |
---|---|---|
count | 400.0000 | 400.000000 |
mean | 0.0150 | 125.025000 |
std | 0.1408 | 1578.105185 |
min | 0.0000 | 0.000000 |
25% | 0.0000 | 0.000000 |
50% | 0.0000 | 0.000000 |
75% | 0.0000 | 1.000000 |
max | 2.0000 | 22327.000000 |
8 rows × 2 columns
However, when applied to a DataFrame, it only describes numeric columns.
retweet_favorites.dtypes
_id object created_at object favorite_count int64 retweet_count int64 dtype: object
.describe()
can be called on individual columns (i.e. Series
), even if they are not numeric.
retweet_favorites.created_at.describe()
count 400 unique 117 top Sun Apr 20 00:33:55 +0000 2014 freq 16 Name: created_at, dtype: object
However, in this case created_at
is being treated as a string, which is not very helpful. We can fix that with pandas.to_datetime
:
retweet_favorites.created_at.map(pd.to_datetime).describe()
count 400 unique 117 first 2014-04-20 00:33:45 last 2014-04-20 02:16:53 top 2014-04-20 00:33:55 freq 16 Name: created_at, dtype: object
Mongo allows us to access subfields directly.
mentions_followers = list(db.tweets.find(fields=['entities.user_mentions', 'user.followers_count']))
pd.DataFrame(mentions_followers).head()
_id | entities | user | |
---|---|---|---|
0 | 53531654bcdefe0e658cbf07 | {u'user_mentions': [{u'indices': [0, 10], u'sc... | {u'followers_count': 61} |
1 | 53531654bcdefe0e658cbf08 | {u'user_mentions': [{u'indices': [3, 13], u'sc... | {u'followers_count': 635} |
2 | 53531654bcdefe0e658cbf09 | {u'user_mentions': [{u'indices': [0, 10], u'sc... | {u'followers_count': 32} |
3 | 53531654bcdefe0e658cbf0a | {u'user_mentions': [{u'indices': [0, 10], u'sc... | {u'followers_count': 14} |
4 | 53531654bcdefe0e658cbf0b | {u'user_mentions': []} | {u'followers_count': 18} |
5 rows × 3 columns
Pandas doesn't know how to parse the sub-documents however, so we must tell it explicitly:
mentions_followers_df = pd.DataFrame({'user_mentions': len(tweet['entities']['user_mentions']),
'followers_count': tweet['user'].get('followers_count')} for tweet in mentions_followers)
mentions_followers_df.head()
followers_count | user_mentions | |
---|---|---|
0 | 61 | 1 |
1 | 635 | 1 |
2 | 32 | 1 |
3 | 14 | 1 |
4 | 18 | 0 |
5 rows × 2 columns
Perhaps user_mentions
and followers_count
are correlated?
plt.scatter(mentions_followers_df.user_mentions, mentions_followers_df.followers_count)
mentions_followers_df.corr()
followers_count | user_mentions | |
---|---|---|
followers_count | 1.000000 | -0.065969 |
user_mentions | -0.065969 | 1.000000 |
2 rows × 2 columns
Perhaps not.
The Yahoo! Where On Earth ID of Canada is 23424775.
Use it to find Twitter trends in Canada and compare it to US trends. What's the difference between them?
canada
collection. us
collections. follower_count
?Find tweets where 'retweet_count': {"$gt": 0}
.
What is their count
?
Is there anything about them that stands out as retweetable?