{ "cells": [ { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "51565" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Import pymongo module\n",
"from pymongo import MongoClient\n",
"\n",
"# Create client, an instance and the collection\n",
"client = MongoClient('localhost', 27017)\n",
"db = client['mgo_db']\n",
"collection = db['time100_collection']\n",
"\n",
"# Show how many documents are in the collection.\n",
"# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0, so prefer\n",
"# count_documents() and fall back to count() only on drivers that predate it.\n",
"# The check is made on the class because a Collection instance resolves any unknown\n",
"# attribute to a sub-collection, which would make hasattr(collection, ...) always True.\n",
"n_documents = (collection.count_documents({})\n",
"               if hasattr(type(collection), 'count_documents') else collection.count())\n",
"n_documents"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "NOTE: THE PREVIOUS CELL CAN BE COMMENTED OUT WHEN IT IS NOT NEEDED. IT DOES NOT NEED TO BE RUN EACH TIME UNLESS YOU WANT TO GET THE LATEST TWEETS." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# # Import pandas module\n", "# import pandas as pd\n", "\n", "# # Load all the collection into \n", "# tweets = collection.find()\n", "\n", "# # Import json_normalize and use this to normalize or flatten the JSON into a pandas dataframe\n", "# from pandas.io.json import json_normalize\n", "# df_tweets = json_normalize(tweets)\n", "\n", "# # Leave out the noise\n", "# df_tweets = df_tweets[['created_at', 'text', 'favorite_count', 'retweet_count', 'entities.hashtags','lang','user.screen_name',\n", "# 'user.name', 'user.followers_count', 'user.friends_count', 'user.time_zone', 'user.location', \n", "# 'user.verified', 'user.profile_image_url', 'coordinates', 'coordinates.type', 'coordinates.coordinates',\n", "# 'place', 'place.place_type', 'place.name', 'place.full_name', 'place.country_code', 'place.country',\n", " \n", "# 'retweeted_status.created_at', 'retweeted_status.text', 'retweeted_status.favorite_count', \n", "# 'retweeted_status.retweet_count', 'retweeted_status.entities.hashtags', 'retweeted_status.lang',\n", "# 'retweeted_status.user.screen_name', 'retweeted_status.user.name', 
'retweeted_status.user.followers_count', \n", "# 'retweeted_status.user.friends_count', 'retweeted_status.user.time_zone', 'retweeted_status.user.location',\n", "# 'retweeted_status.coordinates', 'retweeted_status.coordinates.type', 'retweeted_status.coordinates.coordinates', \n", "# 'retweeted_status.place', 'retweeted_status.place.place_type', 'retweeted_status.place.name', \n", "# 'retweeted_status.place.full_name', 'retweeted_status.place.country_code', 'retweeted_status.place.country',\n", " \n", "# 'quoted_status.created_at', 'quoted_status.text', 'quoted_status.favorite_count', \n", "# 'quoted_status.retweet_count', 'quoted_status.entities.hashtags', 'quoted_status.lang',\n", "# 'quoted_status.user.screen_name', 'quoted_status.user.name', 'quoted_status.user.followers_count', \n", "# 'quoted_status.user.friends_count', 'quoted_status.user.time_zone', 'quoted_status.user.location',\n", "# 'quoted_status.coordinates', 'quoted_status.coordinates.type', 'quoted_status.coordinates.coordinates', \n", "# 'quoted_status.place', 'quoted_status.place.place_type', 'quoted_status.place.name', \n", "# 'quoted_status.place.full_name', 'quoted_status.place.country_code', 'quoted_status.place.country'\n", "# ]]\n", "\n", "### SAVE A COPY IN CSV, SO THAT WE CAN COMMENT OUT AND SKIP THE PREVIOUS STEPS WHEN IT IS NOT NECESSARY. \n", "# df_tweets.to_csv('data/time100_tweets_0430_730PM.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "NOTE: THE PREVIOUS BLOCK OF CELL CAN BE COMMENTED OUT WHEN IT IS NOT NECESSARY. IT NEED NOT TO BE RAN EACH TIME UNLESS YOU RAN THE FIRST BLOCK." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textretweeted_status.textquoted_status.text
0RT @BillGates: Melinda’s work on behalf of wom...Melinda’s work on behalf of women and girls in...NaN
1RT @TIME: Colin Kaepernick’s “willingness to t...Colin Kaepernick’s “willingness to take a posi...NaN
2WWVDD? What Would Viola Davis Do?! Congratulat...NaNMeryl Streep on @ViolaDavis: “Her gifts as an ...
3RT @RitaPanahi: You're honoured to write about...You're honoured to write about an anti-Semitic...The four exceptional organizers of the @womens...
4RT @TIME: See why Thailand's new king Maha Vaj...See why Thailand's new king Maha Vajiralongkor...NaN
\n", "
" ], "text/plain": [ " text \\\n", "0 RT @BillGates: Melinda’s work on behalf of wom... \n", "1 RT @TIME: Colin Kaepernick’s “willingness to t... \n", "2 WWVDD? What Would Viola Davis Do?! Congratulat... \n", "3 RT @RitaPanahi: You're honoured to write about... \n", "4 RT @TIME: See why Thailand's new king Maha Vaj... \n", "\n", " retweeted_status.text \\\n", "0 Melinda’s work on behalf of women and girls in... \n", "1 Colin Kaepernick’s “willingness to take a posi... \n", "2 NaN \n", "3 You're honoured to write about an anti-Semitic... \n", "4 See why Thailand's new king Maha Vajiralongkor... \n", "\n", " quoted_status.text \n", "0 NaN \n", "1 NaN \n", "2 Meryl Streep on @ViolaDavis: “Her gifts as an ... \n", "3 The four exceptional organizers of the @womens... \n", "4 NaN " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "df_tweets = pd.read_csv('data/time100_tweets_0430_530PM.csv', encoding ='latin1', low_memory=False)\n", "# Since we are only after the mentions of the Time 100 names in the tweets, select only the text \n", "# (or tweets or re-tweets or quated tweets. I might as well include all of these for larger data)\n", "df_tweets = df_tweets[['text', 'retweeted_status.text', 'quoted_status.text']]\n", "df_tweets.head()\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Import NLTK for processing tweets\n", "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "import string\n", "import re\n", "\n", "# We also parse emoticon as word, but since we are only after the names, this will just be ignored.\n", "emoticons_str = r\"\"\"\n", " (?:\n", " [:=;] # Eyes\n", " [oO\\-]? 
# Nose (optional)\n", " [D\\)\\]\\(\\]/\\\\OpP] # Mouth\n", " )\"\"\"\n", "\n",
"# Token patterns, tried in this order: the first alternative that matches at a position wins.\n",
"# As said, since we are only after the names, most of these tokens end up ignored.\n",
"regex_str = [\n",
"    emoticons_str,\n",
"    r'<[^>]+>', # HTML tags\n",
"    r'(?:@[\\w_]+)', # @-mentions\n",
"    r\"(?:\\#+[\\w_]+[\\w\\'_\\-]*[\\w_]+)\", # hash-tags\n",
"    # URLs. This alternative already matches every https://t.co/ link, so no separate\n",
"    # t.co pattern is listed after it (one placed later could never be reached).\n",
"    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-f][0-9a-f]))+',\n",
"    r'(?:(?:\\d+,?)+(?:\\.?\\d+)?)', # numbers\n",
"    r\"(?:[a-z][a-z'\\-_]+[a-z])\", # words with - and '\n",
"    r'(?:[\\w_]+)', # other words\n",
"    r'(?:\\S)' # anything else\n",
"]\n",
"tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)\n",
"# Matches a token that is an emoticon and nothing else; preprocess() uses it to leave\n",
"# emoticons such as :D untouched when lowercasing.\n",
"emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)\n",
"\n",
"def tokenize(s):\n",
"    \"\"\"Return the list of tokens that tokens_re finds in the string s.\"\"\"\n",
"    return tokens_re.findall(s)\n",
"\n",
"def preprocess(s, lowercase=False):\n",
"    \"\"\"Tokenize s; when lowercase is True, lowercase every token except emoticons.\"\"\"\n",
"    tokens = tokenize(s)\n",
"    if lowercase:\n",
"        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]\n",
"    return tokens\n",
"\n",
"punctuation = list(string.punctuation)\n",
"\n",
"\n",
"# stop = stopwords.words('english')\n",
"stop = [] # Left empty: stopwords are also ignored for now because we are only after the names"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "NOTE: THE PREVIOUS CELL DEFINES NLTK PREPROCESSING VARIABLES THAT ARE IGNORED FOR NOW, SINCE WE ARE ONLY MAKING A WORD CLOUD OF NAMES. \n", "\n", "ALSO, WE DO NOT NEED TO WORRY ABOUT STOPWORDS FOR NOW. UNCOMMENT THE ASSIGNMENT OF stop WHEN YOU NEED TO GET THE ACTUAL WORDS IN THE TWEETS." 
] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# Import string, urlopen and BeautifulSoup modules\n",
"import string\n",
"from urllib.request import urlopen\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# Load the online page where the names are listed. The with-block closes the\n",
"# HTTP connection as soon as the page has been read.\n",
"with urlopen(\"http://time.com/collection/2017-time-100/\") as response:\n",
"    html = response.read()\n",
"\n",
"# Using BeautifulSoup, parse the html\n",
"soup = BeautifulSoup(html, \"html.parser\")\n",
"\n",
"# Locate the tags where the names are listed and loop through each of them to get the names\n",
"tDivs = ('211','233', '252', '278', '294')\n",
"time100s = []\n",
"for tDiv in tDivs:\n",
"    section = soup.find('div', attrs={'data-reactid': tDiv})\n",
"    if section is None:\n",
"        # find() returns None when no tag matches; say which id is missing\n",
"        # instead of failing on None.find_all\n",
"        raise ValueError('No div with data-reactid=%r found; the page layout may have changed' % tDiv)\n",
"    time100s.extend(aTag.text for aTag in section.find_all('a', href=True))\n",
"\n",
"# Split each entry into its separate name parts (first, last, or any combination of names).\n",
"# Surrounding punctuation is stripped ('Mallory,' -> 'Mallory') because the tweet tokenizer\n",
"# splits trailing punctuation off words, so a part that kept its comma could never equal a token.\n",
"inlist = []\n",
"for time100 in time100s:\n",
"    for name in time100.split():\n",
"        name = name.strip(string.punctuation)\n",
"        if name:\n",
"            inlist.append(name)"
] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['Samantha Bee',\n", " 'Chance the Rapper',\n", " 'Constance Wu',\n", " 'Gavin Grimm',\n", " 'Kirsten Green',\n", " 'Bob Ferguson',\n", " 'Ivanka Trump',\n", " 'Demis Hassabis',\n", " 'Barbara Lynch',\n", " 'Hamdi Ulukaya',\n", " 'Jared Kushner',\n", " 'Celina Turchi',\n", " 'Jordan Peele',\n", " 'Glenda Gray',\n", " 'Yuriko Koike',\n", " 'Conor McGregor',\n", " 'Riz Ahmed',\n", " 'Guus Velders',\n", " 'Tamika Mallory, Bob Bland, Carmen Perez and Linda Sarsour',\n", " 'Natalie Batalha, Guillem Anglada-Escudé and Michaël Gillon',\n", " 'Emma Stone',\n", " 'Colson Whitehead',\n", " 'Ed Sheeran',\n", " 'Alicia Keys',\n", " 'Ryan Reynolds',\n", " 'Donald Glover',\n", " 'Leslie Jones',\n", " 'Ben Platt',\n", " 'Ava DuVernay',\n", " 'Barry Jenkins',\n", " 'Margot Robbie',\n", " 'Sarah Paulson',\n", " 'James Corden',\n", " 'John 
Legend',\n", " 'Alessandro Michele',\n", " 'Kerry James Marshall',\n", " 'Demi Lovato',\n", " 'Theresa May',\n", " 'Narendra Modi',\n", " 'Chuck Schumer',\n", " 'Donald Trump',\n", " 'Elizabeth Warren',\n", " 'Julian Assange',\n", " 'James Comey',\n", " 'Kim Jong Un',\n", " 'Reince Priebus',\n", " 'Xi Jinping',\n", " 'Rodrigo Duterte',\n", " 'Stephen Bannon',\n", " 'Theo Epstein',\n", " 'Tom Perez',\n", " 'Vladimir Putin',\n", " 'Wang Qishan',\n", " 'Recep Tayyip Erdogan',\n", " \"Sandra Day O'Connor\",\n", " 'Pope Francis',\n", " 'General James Mattis',\n", " 'King Maha Vajiralongkorn',\n", " 'Juan Manuel Santos',\n", " 'Major General Qasem Soleimani',\n", " 'Melinda Gates',\n", " 'Janet Yellen',\n", " 'LeBron James',\n", " 'Daniel Ek',\n", " 'Bernard J. Tyson',\n", " 'Evan Spiegel',\n", " 'George Church',\n", " 'Jean Liu',\n", " 'Tom Brady',\n", " 'James Allison',\n", " 'Rebekah Mercer',\n", " 'Jason Blum',\n", " 'Jeff Bezos',\n", " 'Vijay Shekhar Sharma',\n", " 'Margrethe Vestager',\n", " 'Simone Biles',\n", " 'Ashley Graham',\n", " 'Cindy Sherman',\n", " 'John Lewis',\n", " 'Margaret Atwood',\n", " 'Colin Kaepernick',\n", " 'Jeanette Vizguerra',\n", " 'Neymar',\n", " 'RuPaul',\n", " 'Raf Simons',\n", " 'Biram Dah Abeid',\n", " 'Leila de Lima',\n", " 'David Adjaye',\n", " 'Gretchen Carlson',\n", " 'Fatou Bensouda',\n", " 'Thelma Aldana',\n", " 'Fan Bingbing',\n", " 'Viola Davis',\n", " 'Raed Saleh',\n", " 'Cindy Arlette Contreras Bautista']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "time100s" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [
"# Import os, re and string; numpy stays imported for any later cell that expects np\n",
"import os\n",
"import re\n",
"import string\n",
"import numpy as np\n",
"\n",
"# Tweet columns searched for mentions, in the order their matches are written\n",
"TEXT_COLUMNS = ('text', 'retweeted_status.text', 'quoted_status.text')\n",
"\n",
"# Name parts kept from a matching tweet. Surrounding punctuation is stripped\n",
"# ('Mallory,' -> 'Mallory') because the tokenizer splits it off words; a set makes\n",
"# the membership test cheap.\n",
"name_terms = {part.strip(string.punctuation) for part in inlist}\n",
"name_terms.discard('')\n",
"\n",
"\n",
"def mention_pattern(name):\n",
"    \"\"\"Compile a case-insensitive whole-word pattern for one part of a name.\n",
"\n",
"    Returns None when nothing is left of name once surrounding punctuation is stripped.\n",
"    \"\"\"\n",
"    name = name.strip(string.punctuation)\n",
"    if not name:\n",
"        return None\n",
"    # The raw string keeps \\b a word boundary (in a plain string '\\b' is a backspace\n",
"    # character, which never occurs in a tweet); re.escape neutralises any regex\n",
"    # metacharacters left inside the name.\n",
"    return re.compile(r'\\b' + re.escape(name) + r'\\b', flags=re.IGNORECASE)\n",
"\n",
"\n",
"def names_in(tweet_text):\n",
"    \"\"\"Return the Time 100 name parts found in tweet_text, joined by single spaces.\"\"\"\n",
"    # Only the names we are after, not the stop words for now.\n",
"    return ' '.join(term for term in preprocess(tweet_text) if term in name_terms)\n",
"\n",
"\n",
"# The per-person files are written into text/; create the folder if it is missing\n",
"os.makedirs('text', exist_ok=True)\n",
"\n",
"# Loop through each name\n",
"for time100 in time100s:\n",
"    # One pattern per part of the name (first name, last name, ...), built once per person\n",
"    patterns = [mention_pattern(name) for name in time100.split() if name not in stop]\n",
"    patterns = [pattern for pattern in patterns if pattern is not None]\n",
"\n",
"    # Save or open a text file named after the person.\n",
"    # The names mentioned in every matching tweet are saved in this file.\n",
"    with open('text/'+time100.replace(',', '').replace('.', '').replace(\"'\", '').replace(' ', '_')+'.txt',\n",
"              'w', encoding='latin1') as tt:\n",
"\n",
"        # Now loop through the tweets in the dataframe\n",
"        for index, row in df_tweets.iterrows():\n",
"            for pattern in patterns:\n",
"                for column in TEXT_COLUMNS:\n",
"                    tweet_text = row[column]\n",
"                    # Missing text is read from the CSV as float NaN, which cannot be searched\n",
"                    if not isinstance(tweet_text, str) or not tweet_text:\n",
"                        continue\n",
"                    # Quoted tweets that contain a t.co link are not used\n",
"                    if column == 'quoted_status.text' and 'https://t.co/' in tweet_text:\n",
"                        continue\n",
"                    if pattern.search(tweet_text):\n",
"                        tt.write(names_in(tweet_text)+\" \")"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": 
"python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }