{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%run \"libraries.ipynb\"\n", "\n", "from networkx.algorithms import bipartite \n", "\n", "sns.set(style=\"whitegrid\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pages = codecs.open(\"data/pagenames.txt\",\"r\", \"utf-8-sig\").readlines()\n", "pages = map(lambda x: x.strip(), pages)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## computing page-editor bipartite graph" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "pages_editors_graph = nx.Graph()\n", "\n", "\n", "def get_editors_set(page):\n", " editors = []\n", "\n", " revisions = json.load(codecs.open(\"data/revisions/%s.json\" % (page), \"r\", \"utf-8-sig\"))\n", " \n", " revisions = pd.DataFrame(revisions)\n", " \n", " editors = revisions[revisions[\"userid\"] != 0][\"user\"].tolist()\n", " # editors = map(lambda e: u\"%s\" % (e), editors)\n", " editors = set(editors)\n", " \n", " # add the number of revisions per user\n", " revisions_count = revisions.groupby(\"user\").groups\n", " revisions_range = revisions.groupby(\"user\").agg({ \"timestamp\": [ np.min, np.max ] })\n", " # print editors\n", "\n", " # delete weird users\n", " # !!SOMEONE SHOULD CHECK THIS AGAIN: There is nan with page \"Four-dimensional space\"\n", " editors = [ e for e in editors if e in revisions_count ]\n", " \n", " # store extra information about the edge\n", " def info(x):\n", " info = {\"revisions\": len(revisions_count[x]),\n", " \"first revision\": revisions_range.ix[x][\"timestamp\"][\"amin\"],\n", " \"last revision\": revisions_range.ix[x][\"timestamp\"][\"amax\"]\n", " }\n", " return info\n", "\n", " editors = map(lambda x: (x, info(x)), editors)\n", " \n", " return editors\n", "\n", "for p in pages:\n", " title = \"p:%s\" % (p)\n", " # print p\n", " e = get_editors_set(p)\n", " \n", " node_info = {\"revisions\": int(sum([ x[1][\"revisions\"] for x in e ])),\n", " \"first revision\": str(np.min([ pd.to_datetime(x[1][\"first revision\"]) for x in e ])),\n", " \"last revision\": str(np.max([ pd.to_datetime(x[1][\"last revision\"]) for x in e ]))\n", " }\n", " \n", " pages_editors_graph.add_node(title, type=\"page\", attr_dict=node_info)\n", "\n", " # add number of revisions to users\n", " for editor in e:\n", " editor_label = \"u:%s\" % (editor[0])\n", " info = editor[1]\n", " pages_editors_graph.add_node(editor_label, type=\"user\")\n", " pages_editors_graph.add_edge(editor_label, title, attr_dict=info)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# add extra information about editors\n", "editors = [ n for n in pages_editors_graph.nodes(data=True) if n[1][\"type\"] == \"user\" ]\n", "\n", "for editor in editors:\n", " user = editor[0]\n", " pages_editors_graph.node[user][\"revisions\"] = int(np.sum([ pages_editors_graph[user][edge][\"revisions\"] for edge in pages_editors_graph[user] ]))\n", " pages_editors_graph.node[user][\"first revision\"] = str(np.min([ pd.to_datetime(pages_editors_graph[user][edge][\"first revision\"]) for edge in pages_editors_graph[user] ]))\n", " pages_editors_graph.node[user][\"last revision\"] = str(np.max([ pd.to_datetime(pages_editors_graph[user][edge][\"last revision\"]) for edge in pages_editors_graph[user] ]))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "page nodes: 303\n", "user nodes: 15857\n" ] } ], "source": [ "pages_nodes = [ x[0] for x in pages_editors_graph.nodes(data=True) if x[1][\"type\"] == \"page\" ]\n", "users_nodes = [ x[0] for x in pages_editors_graph.nodes(data=True) if x[1][\"type\"] == \"user\" ]\n", "\n", "print \"page nodes: %s\" % (len(pages_nodes))\n", "print \"user nodes: %s\" % (len(users_nodes))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### counting" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "nodes: 16160\n", "edges: 39923\n" ] } ], "source": [ "print \"nodes: %s\" % (len(pages_editors_graph.nodes()))\n", "print \"edges: %s\" % (len(pages_editors_graph.edges()))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "nx.write_gexf(pages_editors_graph, \"data/pages-editors.gexf\", encoding='utf-8')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def reduce_bipartite(G, select, weight=\"weight\"):\n", " selected = [ x[0] for x in G.nodes(data=True) if x[1][\"type\"] == select ]\n", " results = bipartite.projected_graph(G, selected)\n", "\n", " for u in results.nodes():\n", " for v in results[u].keys():\n", " w = len(set(G[u]) & set(G[v]))\n", " results[u][v][weight] = w\n", "\n", " return results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## computing page-page graph" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "page_graph = reduce_bipartite(pages_editors_graph, \"page\", \"coeditors\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### counting" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "nodes: 303\n", "edges: 44688\n" ] } ], "source": [ "print \"nodes: %s\" % (len(page_graph.nodes()))\n", "print \"edges: %s\" % (len(page_graph.edges()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### saving" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "nx.write_gexf(page_graph, \"data/pages-linked-by-coeditors.gexf\", encoding='utf-8')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## computing editor-editor graph" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# editors_graph = reduce_bipartite(pages_editors_graph, \"user\", \"pages\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### counting" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "nodes: 15857\n", "edges: 7489562\n" ] } ], "source": [ "# print \"nodes: %s\" % (len(editors_graph.nodes()))\n", "# print \"edges: %s\" % (len(editors_graph.edges()))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# nx.write_gexf(editors_graph, \"data/wikipedia-geometry/editors-linked-by-pages.gexf\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## network statistics" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
2D computer graphics
2D geometric model
3D computer graphics
3D projection
3-sphere
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: []\n", "Index: [2D computer graphics, 2D geometric model, 3D computer graphics, 3D projection, 3-sphere]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "network_df = pd.DataFrame(index=pages)\n", "network_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### centrality" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
centralityclosenessbetweennesscurrent flow closenesscurrent flow betweennesseigenvector
2D computer graphics 0.970199 0.971061 0.000603 3.284879 0.002896 0.035669
2D geometric model 0.986755 0.986928 0.001794 2.618806 0.002197 0.019031
3D computer graphics 0.983444 0.983713 0.000119 3.974827 0.004239 0.058397
3D projection 0.993377 0.993421 0.000015 3.882411 0.004432 0.045345
3-sphere 0.990066 0.990164 0.000000 4.029130 0.004879 0.050949
\n", "
" ], "text/plain": [ " centrality closeness betweenness \\\n", "2D computer graphics 0.970199 0.971061 0.000603 \n", "2D geometric model 0.986755 0.986928 0.001794 \n", "3D computer graphics 0.983444 0.983713 0.000119 \n", "3D projection 0.993377 0.993421 0.000015 \n", "3-sphere 0.990066 0.990164 0.000000 \n", "\n", " current flow closeness current flow betweenness \\\n", "2D computer graphics 3.284879 0.002896 \n", "2D geometric model 2.618806 0.002197 \n", "3D computer graphics 3.974827 0.004239 \n", "3D projection 3.882411 0.004432 \n", "3-sphere 4.029130 0.004879 \n", "\n", " eigenvector \n", "2D computer graphics 0.035669 \n", "2D geometric model 0.019031 \n", "3D computer graphics 0.058397 \n", "3D projection 0.045345 \n", "3-sphere 0.050949 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "centrality = nx.degree_centrality(page_graph)\n", "closeness = nx.closeness_centrality(page_graph)\n", "betweenness = nx.betweenness_centrality(page_graph, weight=\"coeditors\")\n", "current_flow_closeness = nx.current_flow_closeness_centrality(page_graph, weight=\"coeditors\")\n", "current_flow_betweenness = nx.current_flow_betweenness_centrality(page_graph, weight=\"coeditors\")\n", "#eigenvector = nx.eigenvector_centrality(page_graph)\n", "eigenvector = nx.eigenvector_centrality_numpy(page_graph, weight=\"coeditors\")\n", "\n", "for index in network_df.index:\n", " t = \"p:%s\" % (index)\n", " \n", " network_df.ix[index,\"centrality\"] = centrality[t]\n", " network_df.ix[index,\"closeness\"] = closeness[t]\n", " network_df.ix[index,\"betweenness\"] = betweenness[t]\n", " network_df.ix[index,\"current flow closeness\"] = current_flow_closeness[t]\n", " network_df.ix[index,\"current flow betweenness\"] = current_flow_betweenness[t]\n", " network_df.ix[index,\"eigenvector\"] = eigenvector[t]\n", "\n", "network_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### exclusive editors" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "2D computer graphics 103\n", "2D geometric model 27\n", "3D computer graphics 150\n", "3D projection 77\n", "3-sphere 102\n", "Name: exclusive editors, dtype: int64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_exclusive_editors(title):\n", " nb = pages_editors_graph[\"p:%s\" % (title)]\n", " # print nb.keys()\n", " result = [n for n in nb.keys() if len(pages_editors_graph[n]) > 1 ]\n", " return len(result)\n", "\n", "# print network_df.index[0:10].map(get_exclusive_editors)\n", "\n", "network_df[\"exclusive editors\"] = map(get_exclusive_editors, network_df.index)\n", "\n", "network_df[\"exclusive editors\"].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### storing the statistics" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [], "source": [ "network_df.to_csv(\"data/pages-linked-by-coeditors.stats.csv\", encoding=\"UTF-8\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# final report" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.8" } }, "nbformat": 4, "nbformat_minor": 0 }