{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Mining the Social Web\n", "\n", "## Mining GitHub\n", "\n", "This Jupyter Notebook provides an interactive way to follow along with and explore the examples from the video series. The intent behind this notebook is to reinforce the concepts in a fun, convenient, and effective way." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Programmatically obtaining a personal API access token for accessing GitHub's API" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import requests\n", "import json\n", "\n", "username = '' # Your GitHub username\n", "password = '' # Your GitHub password\n", "\n", "# Note that credentials will be transmitted over a secure SSL connection\n", "url = 'https://api.github.com/authorizations'\n", "note = 'Mining the Social Web - Mining Github'\n", "post_data = {'scopes':['repo'],'note': note }\n", "\n", "response = requests.post(\n", " url,\n", " auth = (username, password),\n", " data = json.dumps(post_data),\n", " ) \n", "\n", "print(\"API response:\", response.text)\n", "print()\n", "print(\"Your OAuth token is\", response.json()['token'])\n", "\n", "# Go to https://github.com/settings/tokens to revoke this token" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Making direct HTTP requests to GitHub's API" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import json\n", "import requests\n", "\n", "# An unauthenticated request that doesn't contain an ?access_token=xxx query string\n", "url = \"https://api.github.com/repos/ptwobrussell/Mining-the-Social-Web/stargazers\"\n", "response = requests.get(url)\n", "\n", "# Display one stargazer\n", "print(json.dumps(response.json()[0], indent=1))\n", "print()\n", "\n", "# Display headers\n", "for (k,v) in response.headers.items():\n", " print(k, \"=>\", v)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using PyGithub to query for stargazers of a particular repository" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from github import Github # pip install pygithub\n", "\n", "# XXX: Specify your own access token here\n", "\n", "ACCESS_TOKEN = ''\n", "\n", "# Specify a username and repository of interest for that user.\n", "\n", "USER = 'ptwobrussell'\n", "REPO = 'Mining-the-Social-Web'\n", "#REPO = 'Mining-the-Social-Web-2nd-Edition'\n", "\n", "client = Github(ACCESS_TOKEN, per_page=100)\n", "user = client.get_user(USER)\n", "repo = user.get_repo(REPO)\n", "\n", "# Get a list of people who have bookmarked the repo.\n", "# Since you'll get a lazy iterator back, you have to traverse\n", "# it if you want to get the total number of stargazers.\n", "\n", "stargazers = [ s for s in repo.get_stargazers() ]\n", "print(\"Number of stargazers\", len(stargazers))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Constructing a trivial property graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import networkx as nx # pip install networkx\n", "\n", "# Create a directed graph\n", "\n", "g = nx.DiGraph()\n", "\n", "# Add an edge to the directed graph from X to Y\n", "\n", "g.add_edge('X', 'Y')\n", "\n", "# Print some statistics about the graph\n", "\n", "print(nx.info(g))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { 
"collapsed": true }, "outputs": [], "source": [ "# Get the nodes and edges from the graph\n", "\n", "print(\"Nodes:\", g.nodes())\n", "print(\"Edges:\", g.edges())\n", "print()\n", "\n", "# Get node properties\n", "\n", "print(\"X props:\", g.node['X'])\n", "print(\"Y props:\", g.node['Y'])\n", "print()\n", "\n", "# Get edge properties\n", "\n", "print(\"X=>Y props:\", g['X']['Y'])\n", "print()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Update a node property\n", "\n", "g.node['X'].update({'prop1' : 'value1'})\n", "print(\"X props:\", g.node['X'])\n", "print()\n", "\n", "# Update an edge property\n", "\n", "g['X']['Y'].update({'label' : 'label1'})\n", "print(\"X=>Y props:\", g['X']['Y'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Constructing an ego graph of a repository and its stargazers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Expand the initial graph with (interest) edges pointing each direction for \n", "# additional people interested. Take care to ensure that user and repo nodes \n", "# do not collide by appending their type.\n", "\n", "g = nx.DiGraph()\n", "g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)\n", "\n", "for sg in stargazers:\n", " g.add_node(sg.login + '(user)', type='user')\n", " g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Introducing some handy graph operations\n", "\n", "Poke around in the current graph to get a better feel for how NetworkX works." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(nx.info(g))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(g.node['Mining-the-Social-Web(repo)'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(g.node['ptwobrussell(user)'])\n", "\n", "print(g['ptwobrussell(user)']['Mining-the-Social-Web(repo)'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(g['ptwobrussell(user)'])\n", "print(g['Mining-the-Social-Web(repo)'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(g.in_edges(['ptwobrussell(user)']))\n", "print(g.out_edges(['ptwobrussell(user)']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(g.in_edges(['Mining-the-Social-Web(repo)']))\n", "print(g.out_edges(['Mining-the-Social-Web(repo)']))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculating degree, betweenness, and closeness centrality measures on the Krackhardt kite graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from operator import itemgetter\n", "from IPython.display import HTML\n", "from IPython.core.display import display\n", "\n", "display(HTML(''))\n", "\n", "# The classic Krackhardt kite graph\n", "kkg = nx.generators.small.krackhardt_kite_graph()\n", "\n", "print(\"Degree Centrality\")\n", "print(sorted(nx.degree_centrality(kkg).items(), \n", " key=itemgetter(1), reverse=True))\n", "print()\n", "\n", "print(\"Betweenness 
Centrality\")\n", "print(sorted(nx.betweenness_centrality(kkg).items(), \n", " key=itemgetter(1), reverse=True))\n", "print()\n", "\n", "print(\"Closeness Centrality\")\n", "print(sorted(nx.closeness_centrality(kkg).items(), \n", " key=itemgetter(1), reverse=True))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Adding additional interest edges to the graph through the inclusion of \"follows\" edges" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Add (social) edges from the stargazers' followers. This can take a while \n", "# because of all of the potential API calls to GitHub. The approximate number\n", "# of requests for followers for each iteration of this loop can be calculated as\n", "# math.ceil(sg.get_followers() / 100.0) per the API returning up to 100 items\n", "# at a time.\n", "\n", "import sys\n", "\n", "for i, sg in enumerate(stargazers):\n", " \n", " # Add \"follows\" edges between stargazers in the graph if any relationships exist\n", " try:\n", " for follower in sg.get_followers():\n", " if follower.login + '(user)' in g:\n", " g.add_edge(follower.login + '(user)', sg.login + '(user)', \n", " type='follows')\n", " except Exception as e: #ssl.SSLError\n", " print(\"Encountered an error fetching followers for\", sg.login, \\\n", " \"Skipping.\", file=sys.stderr)\n", " print(e, file=sys.stderr)\n", "\n", " print(\"Processed\", i+1, \" stargazers. Num nodes/edges in graph\", \\\n", " g.number_of_nodes(), \"/\", g.number_of_edges())\n", " print(\"Rate limit remaining\", client.rate_limiting)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exploring the updated graph's \"follows\" edges" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from operator import itemgetter\n", "from collections import Counter\n", "\n", "# Let's see how many social edges we added since last time.\n", "print(nx.info(g))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# The number of \"follows\" edges is the difference\n", "print(len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# The repository owner is possibly one of the more popular users in this graph.\n", "print(len([e \n", " for e in g.edges_iter(data=True) \n", " if e[2]['type'] == 'follows' and e[1] == 'ptwobrussell(user)']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Let's examine the number of adjacent edges to each node\n", "print(sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Consider the ratio of incoming and outgoing edges for a couple of users with \n", "# high node degrees...\n", "\n", "print(len(g.out_edges('angusshire(user)')))\n", "print(len(g.in_edges('angusshire(user)')))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# A user who is followed by many but does not follow back.\n", "\n", "print(len(g.out_edges('ptwobrussell(user)')))\n", "print(len(g.in_edges('ptwobrussell(user)')))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, 
"outputs": [], "source": [ "c = Counter([e[1] for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])\n", "popular_users = [ (u, f) for (u, f) in c.most_common() if f > 1 ]\n", "print(\"Number of popular users\", len(popular_users))\n", "print(\"Top 10 popular users:\", popular_users[:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Snapshotting (pickling) the graph's state to disk" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Save your work by serializing out (pickling) the graph\n", "nx.write_gpickle(g, \"data/github.gpickle.1\")\n", "\n", "# How to restore the graph...\n", "# import networkx as nx\n", "# g = nx.read_gpickle(\"data/github.gpickle.1\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Applying centrality measures to the interest graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from operator import itemgetter\n", "\n", "# Create a copy of the graph so that we can iteratively mutate the copy\n", "# as needed for experimentation\n", "\n", "h = g.copy()\n", "\n", "# Remove the seed of the interest graph, which is a supernode, in order\n", "# to get a better idea of the network dynamics\n", "\n", "h.remove_node('Mining-the-Social-Web(repo)')\n", "\n", "# XXX: Remove any other nodes that appear to be supernodes.\n", "# Filter any other nodes that you can by threshold\n", "# criteria or heuristics from inspection.\n", "\n", "# Display the centrality measures for the top 10 nodes\n", "\n", "\n", "dc = sorted(nx.degree_centrality(h).items(), \n", " key=itemgetter(1), reverse=True)\n", "\n", "print(\"Degree Centrality\")\n", "print(dc[:10])\n", "print()\n", "\n", "bc = sorted(nx.betweenness_centrality(h).items(), \n", " key=itemgetter(1), reverse=True)\n", "\n", "print(\"Betweenness Centrality\")\n", "print(bc[:10])\n", "print()\n", "\n", "print(\"Closeness Centrality\")\n", "cc = sorted(nx.closeness_centrality(h).items(), \n", " key=itemgetter(1), reverse=True)\n", "print(cc[:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Adding starred repositories to the graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Let's add each stargazer's additional starred repos and add edges \n", "# to find additional interests.\n", "\n", "MAX_REPOS = 500\n", "\n", "for i, sg in enumerate(stargazers):\n", " print(sg.login)\n", " try:\n", " for starred in sg.get_starred()[:MAX_REPOS]: # Slice to avoid supernodes\n", " g.add_node(starred.name + '(repo)', type='repo', lang=starred.language, \\\n", " owner=starred.owner.login)\n", " g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')\n", " except Exception as e: #ssl.SSLError:\n", " print(\"Encountered an error fetching starred repos for\", sg.login, \"Skipping.\")\n", "\n", " print(\"Processed\", i+1, \"stargazers' starred repos\")\n", " print(\"Num nodes/edges in graph\", g.number_of_nodes(), \"/\", g.number_of_edges())\n", " print(\"Rate limit\", client.rate_limiting)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**NOTE: Given that the above example is potentially a very time-consuming example to run, be sure to snapshot your work**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Save your work by serializing out another snapshot of the graph\n", 
"nx.write_gpickle(g, \"data/github.gpickle.2\")\n", "\n", "#import networkx as nx\n", "# g = nx.read_gpickle(\"data/github.gpickle.2\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exploring the graph after updates with additional starred repositories" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Poke around: how to get users/repos\n", "from operator import itemgetter\n", "\n", "print(nx.info(g))\n", "print()\n", "\n", "# Get a list of repositories from the graph.\n", "\n", "repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']\n", "\n", "# Most popular repos\n", "\n", "print(\"Popular repositories\")\n", "print(sorted([(n,d) \n", " for (n,d) in g.in_degree_iter() \n", " if g.node[n]['type'] == 'repo'], \\\n", " key=itemgetter(1), reverse=True)[:10])\n", "print()\n", "\n", "# Projects gazed at by a user\n", "\n", "print(\"Respositories that ptwobrussell has bookmarked\")\n", "print([(n,g.node[n]['lang']) \n", " for n in g['ptwobrussell(user)'] \n", " if g['ptwobrussell(user)'][n]['type'] == 'gazes'])\n", "print()\n", "\n", "# Programming languages for each user\n", "\n", "print(\"Programming languages ptwobrussell is interested in\")\n", "print(list(set([g.node[n]['lang'] \n", " for n in g['ptwobrussell(user)'] \n", " if g['ptwobrussell(user)'][n]['type'] == 'gazes'])))\n", "print()\n", "\n", "# Find supernodes in the graph by approximating with a high number of \n", "# outgoing edges\n", "\n", "print(\"Supernode candidates\")\n", "print(sorted([(n, len(g.out_edges(n))) \n", " for n in g.nodes_iter() \n", " if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500], \\\n", " key=itemgetter(1), reverse=True))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Updating the graph to include nodes for programming languages" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Iterate over all of the repos, and add edges for programming languages \n", "# for each person in the graph. 
We'll also add edges back to repos so that \n", "# we have a good point to \"pivot\" upon.\n", "\n", "repos = [n \n", " for n in g.nodes_iter() \n", " if g.node[n]['type'] == 'repo']\n", "\n", "for repo in repos:\n", " lang = (g.node[repo]['lang'] or \"\") + \"(lang)\"\n", " \n", " stargazers = [u \n", " for (u, r, d) in g.in_edges_iter(repo, data=True) \n", " if d['type'] == 'gazes'\n", " ]\n", " \n", " for sg in stargazers:\n", " g.add_node(lang, type='lang')\n", " g.add_edge(sg, lang, type='programs')\n", " g.add_edge(lang, repo, type='implements')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sample queries for the final graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Some stats\n", "\n", "print(nx.info(g))\n", "print()\n", "\n", "# What languages exist in the graph?\n", "\n", "print([n \n", " for n in g.nodes_iter() \n", " if g.node[n]['type'] == 'lang'])\n", "print()\n", "\n", "# What languages do users program with?\n", "print([n \n", " for n in g['ptwobrussell(user)'] \n", " if g['ptwobrussell(user)'][n]['type'] == 'programs'])\n", "\n", "print()\n", "\n", "# What is the most popular programming language?\n", "print(\"Most popular languages\")\n", "print(sorted([(n, g.in_degree(n))\n", " for n in g.nodes_iter() \n", " if g.node[n]['type'] == 'lang'], key=itemgetter(1), reverse=True)[:10])\n", "print()\n", "\n", "# How many users program in a particular language?\n", "python_programmers = [u \n", " for (u, l) in g.in_edges_iter('Python(lang)') \n", " if g.node[u]['type'] == 'user']\n", "print(\"Number of Python programmers:\", len(python_programmers))\n", "print()\n", "\n", "javascript_programmers = [u for \n", " (u, l) in g.in_edges_iter('JavaScript(lang)') \n", " if g.node[u]['type'] == 'user']\n", "print(\"Number of JavaScript programmers:\", len(javascript_programmers))\n", "print()\n", "\n", "# What users program in both Python and JavaScript?\n", "print(\"Number of programmers who use JavaScript and Python\")\n", "print(len(set(python_programmers).intersection(set(javascript_programmers))))\n", "\n", "# Programmers who use JavaScript but not Python\n", "print(\"Number of programmers who use JavaScript but not Python\")\n", "print(len(set(javascript_programmers).difference(set(python_programmers))))\n", "\n", "# XXX: Can you determine who is the most polyglot programmer?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**NOTE: Optionally, snapshot the final graph**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Save your work by serializing out another snapshot of the graph\n", "nx.write_gpickle(g, \"data/github.gpickle.3\")\n", "\n", "#import networkx as nx\n", "# g = nx.read_gpickle(\"data/github.gpickle.3\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Graph visualization of the social network for the original interest graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(\"Stats on the full graph\")\n", "print(nx.info(g))\n", "print()\n", "\n", "# Create a subgraph from a collection of nodes. 
In this case, the\n", "# collection is all of the users in the original interest graph\n", "\n", "mtsw_users = [n for n in g if g.node[n]['type'] == 'user']\n", "h = g.subgraph(mtsw_users)\n", "\n", "print(\"Stats on the extracted subgraph\")\n", "print(nx.info(h))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "import json\n", "from IPython.display import IFrame\n", "from IPython.core.display import display\n", "from networkx.readwrite import json_graph\n", "\n", "# Visualize the social network of all people from the original interest graph.\n", "d = json_graph.node_link_data(h)\n", "json.dump(d, open('force.json', 'w'))\n", "\n", "\n", "# IPython Notebook can serve files and display them into\n", "# inline frames. Prepend the path with the 'files' prefix.\n", "\n", "# A D3 template for displaying the graph data.\n", "viz_file = 'force.html'\n", "\n", "# Display the D3 visualization.\n", "\n", "display(IFrame(viz_file, '100%', '500px'))" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Using Matplotlib and NetworkX to create graph visualizations" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "%matplotlib inline\n", "\n", "\n", "fig = plt.figure(figsize=(15,15))\n", "ax = fig.add_subplot(111)\n", "labels = dict([(n, n.split('(user)')[0]) for n in h.nodes_iter()])\n", "\n", "nx.draw(h, pos=nx.spring_layout(h), \n", " arrows=False, ax=ax, node_size=50,\n", " edge_color='#aaaaaa',\n", " alpha=0.8, labels=labels, font_size=8)" ] } ], "metadata": { "kernelspec": { "display_name": "mtsw", "language": "python", "name": "mtsw" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 1 }