{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Scrape the Twitter accounts that my favourite funny Twitter account\n",
    "(@AbbieEvansXO) herself follows.\n",
    "\n",
    "This is my hack to get a mostly correct list of funny accounts. (Otherwise,\n",
    "I'd have had to find them individually and then create a list - maybe later.)\n",
    "The tweets of the accounts in this 'following' list are scraped by a later script.\n",
    "'''\n",
    "\n",
    "# Import required libraries\n",
    "import os\n",
    "import time\n",
    "from getpass import getpass\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.keys import Keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is going to be the base funny Twitter account.\n",
    "# We'll first scrape the other funny Twitter accounts she follows.\n",
    "twitter_username = \"AbbieEvansXO\"\n",
    "\n",
    "# Credentials are read from the environment (falling back to an interactive\n",
    "# prompt) so that they are never saved inside the notebook.\n",
    "twitter_email = os.environ.get(\"TWITTER_EMAIL\") or input(\"Twitter e-mail: \")\n",
    "twitter_password = os.environ.get(\"TWITTER_PASSWORD\") or getpass(\"Twitter password: \")\n",
    "\n",
    "driver = webdriver.Chrome('chromedriver.exe')\n",
    "\n",
    "time.sleep(1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Log in and go to the 'following' page for @AbbieEvansXO\n",
    "driver.get(\"https://www.twitter.com/login\")\n",
    "\n",
    "email_field = driver.find_element(By.CSS_SELECTOR, \".js-initial-focus\")\n",
    "email_field.clear()\n",
    "email_field.send_keys(twitter_email)\n",
    "\n",
    "password_field = driver.find_element(By.CSS_SELECTOR, \".js-password-field\")\n",
    "password_field.clear()\n",
    "password_field.send_keys(twitter_password)\n",
    "password_field.send_keys(Keys.RETURN)\n",
    "time.sleep(2)\n",
    "\n",
    "driver.get(\"https://twitter.com/\" + twitter_username + \"/following\")\n",
    "\n",
    "# The page is loaded dynamically, so scroll to the bottom nine times,\n",
    "# pausing after each scroll so that more accounts can load.\n",
    "for _ in range(9):\n",
    "    driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
    "    time.sleep(2)\n",
    "\n",
    "# Grab the rendered page, then close the browser: nothing below needs it.\n",
    "pagesrc = driver.page_source\n",
    "driver.quit()\n",
    "\n",
    "# Parse the page using BeautifulSoup\n",
    "soup = BeautifulSoup(pagesrc, \"lxml\")\n",
    "\n",
    "username = [tag.text for tag in soup.find_all(\"b\", class_=\"u-linkComplex-target\")]\n",
    "userprofile = [\n",
    "    tag.text.strip()\n",
    "    for tag in soup.find_all(\"a\", class_=\"fullname ProfileNameTruncated-link u-textInheritColor js-nav\")\n",
    "]\n",
    "\n",
    "# Since I follow her myself, I'll remove my account. It shows up first in following.\n",
    "# NOTE(review): two handles but only one profile name are dropped - presumably the\n",
    "# page yields one extra leading handle; verify that the two columns still line up.\n",
    "username.pop(0)\n",
    "username.pop(0)\n",
    "\n",
    "userprofile.pop(0)\n",
    "\n",
    "# Create the data frame and sort it case-insensitively by username.\n",
    "# zip() stops at the shorter list, so any unmatched trailing entries are dropped.\n",
    "following = pd.DataFrame(list(zip(userprofile, username)), columns=[\"Profile\", \"Username\"])\n",
    "following[\"username_upper\"] = following[\"Username\"].str.upper()\n",
    "following = following.sort_values([\"username_upper\"])\n",
    "del following[\"username_upper\"]\n",
    "following.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the output data frame to a CSV file. These accounts will be scraped in the next script.\n",
    "following.to_csv(\"Following list.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}