{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "'''\n", "\n", "This script will scrape tweets from the 'following' list of twitter accounts that was output of 00_Scrape_Following.ipynb.\n", "A csv is created for tweets from every account and saved as username_date.csv \n", "\n", "\n", "References:\n", "div by 1000 - https://stackoverflow.com/questions/37494983/python-fromtimestamp-oserror\n", "\n", "'''\n", "\n", "#Import required libraries\n", "import time\n", "from selenium import webdriver\n", "from selenium.webdriver.common.keys import Keys\n", "from datetime import datetime\n", "from pandas import DataFrame as df\n", "from datetime import datetime\n", "import pandas as pd\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_driver():\n", " '''\n", " \n", " This function will create a headless browser driver object that we'll use to scrape data automatically\n", " \n", " '''\n", " \n", " #Initialize options\n", " options = webdriver.ChromeOptions()\n", " #Pass in headless argument to options\n", " options.add_argument('--headless')\n", " #Initialize driver\n", " driver = webdriver.Chrome('chromedriver.exe',chrome_options=options)\n", " return driver" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def scrape_user_tweets(browser,twitter_username,no_of_pagedowns):\n", " '''\n", " \n", " This function will \n", " - open the twitter account page\n", " - scroll down\n", " - scrape tweets and save in the form username_date.csv\n", " \n", " '''\n", "\n", " browser.get(\"https://twitter.com/\" + twitter_username)\n", " time.sleep(1)\n", " \n", " print('Scraping '+twitter_username)\n", "\n", " \n", " elem = browser.find_element_by_tag_name(\"body\")\n", " no_of_pagedowns = no_of_pagedowns\n", " while no_of_pagedowns:\n", " elem.send_keys(Keys.PAGE_DOWN)\n", " time.sleep(0.2)\n", " no_of_pagedowns-=1 \n", "\n", " twitter_elm = browser.find_elements_by_class_name(\"tweet\")\n", "\n", " #We'll iterate over tweets found & keep appending tweet info in this list\n", " tweet_details=[]\n", "\n", " for post in twitter_elm:\n", " #print(post)\n", " username = post.find_element_by_class_name(\"username\")\n", " #print(username.text)\n", " tweet_username=username.text\n", "\n", " tweet = post.find_element_by_class_name(\"tweet-text\")\n", " #print(tweet.text)\n", " tweet_text=tweet.text\n", "\n", " tweet_hashtag_mentions=[hashtag.text for hashtag in tweet.find_elements_by_class_name(\"twitter-hashtag\")]\n", "\n", " rt_count=post.find_elements_by_class_name(\"ProfileTweet-actionCount\")\n", "\n", " tweet_comment_count=rt_count[1].get_attribute(\"data-tweet-stat-count\")\n", " tweet_retweet_count=rt_count[2].get_attribute(\"data-tweet-stat-count\")\n", " tweet_fav_count=rt_count[3].get_attribute(\"data-tweet-stat-count\")\n", "\n", "\n", " time_stp=int(post.find_elements_by_class_name(\"_timestamp\")[0]\n", " .get_attribute(\"data-time-ms\"))\n", " utc_time = datetime.fromtimestamp(time_stp/1000)\n", "\n", " tweet_date_posted=utc_time.strftime(\"%Y-%m-%d %H:%M:%S.%f+00:00 (UTC)\")\n", "\n", " pic=post.find_elements_by_class_name('js-adaptive-photo')\n", "\n", " tweet_img_url=pic[0].get_attribute('data-image-url') if pic else False\n", " #print(img_url)\n", "\n", " tweet_details.append({\n", " 'tweet_username':username.text\n", " ,'tweet_text': tweet_text\n", " ,'tweet_hashtag_mentions':tweet_hashtag_mentions\n", " ,'tweet_comment_count':tweet_comment_count\n", " ,'tweet_retweet_count':tweet_retweet_count\n", " ,'tweet_fav_count':tweet_fav_count\n", " ,'tweet_img_url':tweet_img_url\n", " })\n", " \n", " #Create dataframe from tweets scraped for the account. Store them in csv \n", " scraped_tweets=pd.DataFrame(tweet_details)\n", " scraped_tweets.to_csv(twitter_username+\"_\"+datetime.today().strftime('%Y%m%d')+\".csv\",index=False )\n", " print('Scraped '+twitter_username)\n", " \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Initiate driver object\n", "browser = get_driver()\n", "\n", "#Read in the list of funny accounts and scrape one by one\n", "good_accounts=pd.read_csv('Following list.csv')\n", "\n", "for index,row in good_accounts.iterrows():\n", " #print(row['Username'])\n", " scrape_user_tweets(browser,row['Username'],10)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.6" } }, "nbformat": 4, "nbformat_minor": 2 }