{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Final Project - James Quacinella\n", "\n", "Fo the final project, I will look at the follower network of one of the think tank Twitter account and perform clustering to find groups of associated accounts. Looking at the clusters, I hope to identify what joins them by performing some NLP tasks on the account's profile contents.\n", "\n", "## Step 1 - Crawl Twitter for Followers\n", "\n", "The next section of code does not run in the notebook, but is a copy of the crawler code created for this project. It will take a single account, get the first level followers, and then grab the 'second-level' followers. Those second level follower are only added if they were nodes in the first level (so we focus on the main account, not other accounts tangentially related)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#import graphlab as gl\n", "import pickle\n", "import twitter\n", "import logging\n", "import time\n", "from collections import defaultdict\n", "\n", "\n", "### Setup a console and file logger\n", "\n", "logger = logging.getLogger('crawler')\n", "logger.setLevel(logging.DEBUG)\n", "fh = logging.FileHandler('crawler.log')\n", "fh.setLevel(logging.INFO)\n", "ch = logging.StreamHandler()\n", "ch.setLevel(logging.INFO)\n", "formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')\n", "ch.setFormatter(formatter)\n", "fh.setFormatter(formatter)\n", "logger.addHandler(ch)\n", "logger.addHandler(fh)\n", "\n", "### Setup signals to make sure API calls only take 60s at most\n", "\n", "from functools import wraps\n", "import errno\n", "import os\n", "import signal\n", "\n", "class TimeoutError(Exception):\n", " pass\n", "\n", "def timeout(seconds=60, error_message=os.strerror(errno.ETIME)):\n", " def decorator(func):\n", " def _handle_timeout(signum, frame):\n", " raise TimeoutError(error_message)\n", "\n", " def wrapper(*args, **kwargs):\n", " signal.signal(signal.SIGALRM, _handle_timeout)\n", " signal.alarm(seconds)\n", " try:\n", " result = func(*args, **kwargs)\n", " finally:\n", " signal.alarm(0)\n", " return result\n", "\n", " return wraps(func)(wrapper)\n", "\n", " return decorator\n", "\n", "@timeout()\n", "def getFollowers(api, follower):\n", " ''' Function that will get a user's list of followers from an api object. \n", " NOTE: the decorator ensures that this only runs for 60s at most. '''\n", " # return api.GetFollowerIDs(follower)\n", " return api.GetFriendIDs(follower)\n", "\n", "\n", "### Twitter API\n", "\n", "# Lets create our list of api OAuth parameters\n", "API_TOKENS = [\n", " {\"consumer_key\": 'yp4wi4FASXbsRKa6JxYqzhUlH',\n", " \"consumer_secret\": 'Wkh1d5ygAOp4Bp65syFzHRN4xQsS8O4FvU3zHWosX8NXCqMpcl',\n", " \"access_token_key\": '16562593-F6lRFe7iyoQEahezhPmaI64oInHZD0LNpcIbbq7Wy',\n", " \"access_token_secret\": 'weregYL8n6DI7yZy9pkizIJ78rH2GY02Do9jvpTe7rCey',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": 'NsNYFG9LtZV2XMyigPaCKVyVz',\n", " \"consumer_secret\": '4J1vlowybipqXnSrKgLBvmzPmwqx71uHN32noljTgDLS2xQNfI',\n", " \"access_token_key\": '16562593-NCuQWVnpzcnB55w7VLdoCkdobdUQBRDJKjIPXAksP',\n", " \"access_token_secret\": 'nX9OksrYQxj0jBXYJTkUjlX5mZh4rZljfVRXtSM3Tjc8c',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": 'ZcAMGe2MUcnTO9ATCIo563SHN',\n", " \"consumer_secret\": 'dJAB7mBfoYyx27Yccbmzz98GtNigAA67Ish9Y1NjN2wNznciM1',\n", " \"access_token_key\": '16562593-AmaoKVLEYL3o8rVUS3b6u4PUbVPTI6BPsyaqCdwxY',\n", " \"access_token_secret\": '8pjYJCFWTErJlb2WSkLwsYNoptVazQQs95JAvIU8JApUA',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": 'avZpjObqQN9vue2Y4gu9zIF9X',\n", " \"consumer_secret\": 'Ka6WCj3fyon5yGgf5YJIIl8nVcLcUh5YT99N58qy8qv4kfaMbc',\n", " \"access_token_key\": '16562593-VNuGD09Cr29ZlzNCWnV5MOujU7PsexSwfTgfKQNqC',\n", " \"access_token_secret\": '9P3hB3qDb9zPDFCUhWU16N4CMXPwHacl6HJbCc0EuGj7s',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": 'sQ9H5NKteroNZSWvIrkSWvXR0',\n", " \"consumer_secret\": 'lC0ttZKdIZhhJAE1I5RxMxdjpSiADQCVUnHS7LbtfVmI2pz2F2',\n", " \"access_token_key\": '16562593-4LOk7QkXWD0boF01BmZ6NP2oPtHmDZ1OVJ883aANG',\n", " \"access_token_secret\": 'JJ85qMqzVowN1KdQ6w4YlhJB9YF9eWbw6SGbxQoU6gvne',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": 'DHppZ2LG3iYj8vEx7ibRRLN35',\n", " \"consumer_secret\": 'wdTQeyp7ZNDN7ne40IriRw7Ah1J8cAi2OIlw4MVtgpq5MMKjYE',\n", " \"access_token_key\": '16562593-WN8zvEWAxVfJPrneMwUjDoVQw0geuLckOOJqFimsC',\n", " \"access_token_secret\": 'ZgVi2onPB3RPGtRmPBs6QXymIMgXwJHUOQycesp64S0Hp',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": 'lIgtfdkC2WmN7XAcicrGygQBp',\n", " \"consumer_secret\": '2D9WIJN2MIPwFpMeIGcP6vWjQC8vvy7G5ZlHMSH1F1CsgWGKfz',\n", " \"access_token_key\": '16562593-7lhPpeZNNAGoQQJnqcnTtBiGq1O52XMZ4CMeVqXiY',\n", " \"access_token_secret\": 'WKRBQsr36MMB2EpCcZLr89ik0MSJfPoBORCKu9E1hw96I',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": '1XFu2urZzoMoC5sadXAjA7IoQ',\n", " \"consumer_secret\": 'FrJOlHfNLp3M7ejJWiO5k74E9ai6L5EzQJ45HmlsUINbh8qUUi',\n", " \"access_token_key\": '16562593-Texko6g7VyCwhNUfxBDoJKJl4058hpvQkqAYWRKpi',\n", " \"access_token_secret\": 'ISZCTvN6bYJVaJ3Z2iidQObTzE2pxkINBLi0WWe9Ab2Zv',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": 'r8Bvdm6I8QrRPuVzP4VtRYpqd',\n", " \"consumer_secret\": 'CzA8u8M8nDiDCCrSzCsXpR3SyTGCaLppDWbdTxSg78ZKgtKkhh',\n", " \"access_token_key\": '16562593-I3l0ZSmfZbMxIQ2NbiiM2eDMA4KNzFmFBeUkWxunR',\n", " \"access_token_secret\": '9HkILP4kSMF0hgvsB126jpoUzsRXETYMlSM0YSKb2yMJH',\n", " \"requests_timeout\": 60},\n", "\n", " {\"consumer_key\": 'NmMjfP1Zt3n2VDZ15X7SDGM6G',\n", " \"consumer_secret\": 'j9JBx7HUbMpcDnFteiIAAgHSoA8idlqQ20A1xbvnMrqMrOHQ1n',\n", " \"access_token_key\": '16562593-zUNyMUdO9JnSIstmTrqdyHHmX2lpv9NqkQxGC8faP',\n", " \"access_token_secret\": 'DEeHvLjTXlxNGmqDntXOK0cJCX08cnpg0btoRXWATW3X2',\n", " \"requests_timeout\": 60}\n", "]\n", "\n", "# Now create a list of twitter API objects\n", "apis = []\n", "for token in API_TOKENS:\n", " apis.append( twitter.Api(consumer_key=token['consumer_key'],\n", " consumer_secret=token['consumer_secret'],\n", " access_token_key=token['access_token_key'],\n", " access_token_secret=token['access_token_secret'],\n", " requests_timeout=60))\n", "\n", "\n", "# The account id / screen name we want followers from\n", "account_screen_name = 'fairmediawatch'\n", "account_id = '54679731'\n", "\n", "# Keep track of nodes connected to account, and all edges we need in the graph\n", "nodes = set()\n", "edges = defaultdict(set)\n", "\n", "\n", "# Try to load first level followers from pickle;\n", "# otherwise, generate them from a single API call and save via pickle\n", "try:\n", " logger.info(\"Loading followers for %s\" % account_screen_name)\n", " f = open(\"following1\", \"rb\")\n", " following = pickle.load(f)\n", "except Exception as e:\n", " logger.info(\"Failed. Generating followers for %s\" % account_screen_name)\n", " following = api.GetFriendIDs(screen_name=account_screen_name)\n", " pickle.dump(following, open(\"following1\", \"wb\"))\n", "\n", "# Try to load the nodes and first level edges from pickle;\n", "# otherwise generate them from the 'following' list and save\n", "try:\n", " logger.info(\"Loading nodes and edges for depth = 1, for %s\" % account_screen_name)\n", " n = open(\"nodes.follow1.set\", \"rb\")\n", " e = open(\"edges.follow1.dict\", \"rb\")\n", " nodes = pickle.load(n)\n", " edges = pickle.load(e)\n", "except Exception as e:\n", " logger.info(\"Failed. Generating nodes and edges for depth = 1, for %s\" % account_screen_name)\n", " for follower in following:\n", " nodes.add(follower)\n", " edges[account_id].add(follower)\n", " pickle.dump(nodes, open(\"nodes.follow1.set\", \"wb\"))\n", " pickle.dump(edges, open(\"edges.follow1.dict\", \"wb\"))\n", "\n", "\n", "\n", "### Crawling for Depth2\n", "\n", "\n", "# Index the api list, and start from the first api object\n", "api_idx = 0\n", "api = apis[api_idx]\n", "\n", "# Some accounts give us issues (either too many followers or no permissions)\n", "blacklist= [74323323, 43532023, 19608297, 25757924, 240369959, 173634807, 17008482, 142143804]\n", "api_updated = False\n", "\n", "# It is nice to start from a point in the list, instead of from the beginning\n", "starting_point = 142143804\n", "if starting_point:\n", " starting_point_idx = following.index(starting_point)\n", " following_iter = range(starting_point_idx, len(following))\n", "else:\n", " following_iter = range(len(following))\n", "\n", "# Try loading second layer of followers from pickle, otherwise start from scratch\n", "try:\n", " f = open(\"edges.follow2.dict\", \"rb\")\n", " edges = pickle.load(f)\n", " logger.info(\"Loaded edges.follow2 into memory!\")\n", "except Exception as e:\n", " logger.info(\"Starting from SCRATCH: did not load edges.follow2 into memory!\")\n", " pass\n", "\n", "# For each follower of the main account ...\n", "for follower_idx in following_iter:\n", " follower = following[follower_idx]\n", " success = False\n", " \n", " # ... check if they are on the blacklist; if so, skip\n", " if follower in blacklist:\n", " logger.info(\"Skipping due to blacklist\")\n", " continue\n", "\n", " # Otherwise, attempt to get list of their followers\n", " followers_depth2_list = []\n", " while not success:\n", " try:\n", " logger.info(\"Getting followers for follower %s\" % follower)\n", " followers_depth2_list = getFollowers(api, follower)\n", " success = True\n", " except TimeoutError as e:\n", " # If api call takes too long, move on\n", " logger.info(\"Timeout after 60s for follower %d\" % follower)\n", " success = True # technically not a success but setting flag so next loop moves on\n", " continue\n", " except Exception as e:\n", " # IF we get here, then we hit API limits\n", " logger.info(\"API Exception %s; api-idx = %d\" % (str(e), api_idx))\n", " \n", " # Are we at the begining of api list? \n", " # IF so, dump edges so far via pickle and sleep\n", " if api_updated and api_idx % len(API_TOKENS) == 0 and api_idx >= len(API_TOKENS):\n", " logger.info(\"Save edges to pickle file for follower = %s\" % follower)\n", " pickle.dump(edges, open(\"edges.follow2.dict\", \"wb\"))\n", " logger.info(\"Sleeping ...\")\n", " time.sleep(60)\n", " api_updated = False\n", " # Otherwise, move on to the next api object and try again\n", " else:\n", " api_idx += 1\n", " api = apis[api_idx % len(API_TOKENS)]\n", " api_updated = True\n", " \n", " \n", " # After getting the followers, find the intersection of those followers\n", " # with those of the first-level followers and add to edge dict\n", " if followers_depth2_list:\n", " logger.info(\"Adding followers to the graph\")\n", " edges[follower].update(nodes.intersection(followers_depth2_list))\n", "\n", "\n", "# Write out final list of edges via pickle\n", "logger.info(\"Save edges to pickle file for follower = %s\" % follower)\n", "pickle.dump(edges, open(\"edges.follow2.dict\", \"wb\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Instead of running the above, lets just load everything via pickle:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pickle\n", "n = open(\"nodes.follow1.set\", \"rb\")\n", "nodes = pickle.load(n)\n", "\n", "e = open(\"edges.follow2.dict\", \"rb\")\n", "edges = pickle.load(e)\n", "\n", "f = open(\"following1\", \"rb\")\n", "following = pickle.load(f)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Step 2 - Generate Graph from Crawl\n", "\n", "First, we generate CSV files so we can load data into GraphLab Create." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Hide some silly output\n", "import logging\n", "logging.getLogger(\"requests\").setLevel(logging.WARNING)\n", "logging.getLogger(\"urllib3\").setLevel(logging.WARNING)\n", "\n", "# Import everything we need\n", "import graphlab as gl\n", "\n", "# Generate CSVs from the previous crawl\n", "# TODO\n", "f = open('vertices.csv', 'w')\n", "f.write('id\\n')\n", "for node in nodes:\n", " f.write(str(node) + \"\\n\")\n", "f.close()\n", "\n", "f = open('edges.csv', 'w')\n", "f.write('src,dst,relation\\n')\n", "for node, followers in edges.iteritems():\n", " for follower in followers:\n", " f.write('%s,%s,%s\\n' % (follower, node, 'follows'))\n", "f.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, let us use these CSV files and load them into a graph object called g:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[INFO] This non-commercial license of GraphLab Create is assigned to james.quacinella@gmail.comand will expire on January 01, 2038. For commercial licensing options, visit https://dato.com/buy/.\n", "\n", "[INFO] Start server at: ipc:///tmp/graphlab_server-18863 - Server binary: /usr/local/lib/python2.7/dist-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1437714775.log\n", "[INFO] GraphLab Server Version: 1.5.1\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Finished parsing file /home/james/Development/Masters/IndependentStudy/Final/vertices.csv\n", "PROGRESS: Parsing completed. Parsed 100 lines in 0.024206 secs.\n", "------------------------------------------------------\n", "Inferred types from first line of file as \n", "column_type_hints=[int]\n", "If parsing fails due to incorrect types, you can correct\n", "the inferred type list above and pass it to read_csv in\n", "the column_type_hints argument\n", "------------------------------------------------------\n", "PROGRESS: Finished parsing file /home/james/Development/Masters/IndependentStudy/Final/vertices.csv\n", "PROGRESS: Parsing completed. Parsed 1108 lines in 0.018389 secs.\n", "PROGRESS: Finished parsing file /home/james/Development/Masters/IndependentStudy/Final/edges.csv\n", "PROGRESS: Parsing completed. Parsed 100 lines in 0.114743 secs.\n", "------------------------------------------------------\n", "Inferred types from first line of file as \n", "column_type_hints=[int,int,str]\n", "If parsing fails due to incorrect types, you can correct\n", "the inferred type list above and pass it to read_csv in\n", "the column_type_hints argument\n", "------------------------------------------------------\n", "PROGRESS: Finished parsing file /home/james/Development/Masters/IndependentStudy/Final/edges.csv\n", "PROGRESS: Parsing completed. Parsed 105006 lines in 0.076969 secs.\n" ] } ], "source": [ "# Load Data\n", "gvertices = gl.SFrame.read_csv('vertices.csv')\n", "gedges = gl.SFrame.read_csv('edges.csv')\n", "\n", "# Create graph\n", "g = gl.SGraph()\n", "g = g.add_vertices(vertices=gvertices, vid_field='id')\n", "g = g.add_edges(edges=gedges, src_field='src', dst_field='dst')\n", "g = g.add_edges(edges=gedges, src_field='dst', dst_field='src')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lets try to visualize the graph!" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Canvas is accessible via web browser at the URL: http://localhost:48677/index.html\n", "Opening Canvas in default web browser.\n" ] } ], "source": [ "# Visualize graph?\n", "gl.canvas.set_target('browser')\n", "g.show(vlabel=\"id\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looks like its too large of a graph to display. \n", "\n", "## Central / Important Nodes\n", "\n", "Lets use pagerank to find important nodes in the network:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Counting out degree\n", "PROGRESS: Done counting out degree\n", "PROGRESS: +-----------+-----------------------+\n", "PROGRESS: | Iteration | L1 change in pagerank |\n", "PROGRESS: +-----------+-----------------------+\n", "PROGRESS: | 1 | 617.534 |\n", "PROGRESS: | 2 | 135.25 |\n", "PROGRESS: | 3 | 30.9247 |\n", "PROGRESS: | 4 | 8.64859 |\n", "PROGRESS: | 5 | 2.52531 |\n", "PROGRESS: | 6 | 0.885368 |\n", "PROGRESS: | 7 | 0.323184 |\n", "PROGRESS: | 8 | 0.126578 |\n", "PROGRESS: | 9 | 0.0503135 |\n", "PROGRESS: | 10 | 0.0203128 |\n", "PROGRESS: | 11 | 0.00825336 |\n", "PROGRESS: +-----------+-----------------------+\n" ] }, { "data": { "text/html": [ "
__id | \n", "pagerank | \n", "delta | \n", "
---|---|---|
54679731 | \n", "7.15893054698 | \n", "2.08163053328e-05 | \n", "
59159771 | \n", "5.73589508502 | \n", "5.1434297017e-06 | \n", "
169182727 | \n", "5.68248985863 | \n", "2.4887386834e-05 | \n", "
16935292 | \n", "4.98957011223 | \n", "3.37281975513e-05 | \n", "
1947301 | \n", "4.39339614539 | \n", "1.60673868965e-06 | \n", "
23839835 | \n", "4.36113011846 | \n", "1.93642112549e-05 | \n", "
16076032 | \n", "4.34163894719 | \n", "7.78788532063e-06 | \n", "
10117892 | \n", "3.96520683672 | \n", "4.10425955666e-06 | \n", "
16955991 | \n", "3.84512060914 | \n", "1.25983857791e-05 | \n", "
478203018 | \n", "3.40106898918 | \n", "2.55084466252e-05 | \n", "