{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [
"# IPython Graphs \n",
"\n",
"## This notebook uses: Neo4j 2.1.6, Titan 0.9.0-M1(Berkeley DB) w/TP3 Gremlin Server 3.0.0-M7, ipython-cypher 0.2.1, projx 0.3.6(dev), Pandas 0.15.2, NetworkX 1.9, and gizmo 0.1.7(dev)\n",
"\n",
"### This notebook is NOT meant to be a benchmark!!! It's just an example that plays around with some code that I've been working on using some libraries and DBs that I like."
] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [
"%matplotlib inline\n",
"%load_ext cypher"
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [
"import asyncio\n",
"import networkx as nx\n",
"import pandas as pd\n",
"import projx as px\n",
"import matplotlib.pyplot as plt\n",
"from datetime import datetime\n",
"from gizmo import AsyncGremlinClient"
] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [
"plt.rcParams['figure.figsize'] = 12, 7\n",
"\n",
"# REST endpoint of the local Neo4j server; both ETL configs below use it.\n",
"NEO4J_URI = \"http://localhost:7474/db/data\""
] }, { "cell_type": "markdown", "metadata": {}, "source": [
"#### Load a bipartite network stored as an edgelist into Neo4j with projx using a Cypher MERGE.\n",
"\n",
"Fire up the Neo4j server first:\n",
"\n",
"```bash\n",
"./neo4j-community-2.1.6/bin/neo4j console\n",
"```"
] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [
"# The arXiv cond-mat data set: http://konect.uni-koblenz.de/networks/opsahl-collaboration\n",
"# ETL spec passed to px.execute_etl(): extractor -> transformers -> loader.\n",
"arXiv_condmat_etl = {\n",
"    \"extractor\": {\n",
"        \"edgelist\": {\n",
"            \"filename\": \"data/opsahl-collaboration/out.opsahl-collaboration\",\n",
"            \"delim\": \" \",\n",
"            \"pattern\": [\n",
"                {\"node\": {\"alias\": \"n\"}},\n",
"                {\"edge\": {}},\n",
"                {\"node\": {\"alias\": \"m\"}}\n",
"            ]\n",
"        }\n",
"    },\n",
"    \"transformers\": [\n",
"        {\n",
"            \"edge\": {\n",
"                \"pattern\": [\n",
"                    {\"node\": {\"alias\": \"n\", \"label\": \"Author\"}},\n",
"                    {\"edge\": {\"label\": \"IN\"}},\n",
"                    {\"node\": {\"alias\": \"m\", \"label\": \"Paper\"}}\n",
"                ]\n",
"            }\n",
"        }\n",
"    ],\n",
"    \"loader\": {\n",
"        \"edgelist2neo4j\": {\n",
"            \"uri\": NEO4J_URI,\n",
"            \"stmt_per_req\": 500,\n",
"            \"req_per_tx\": 25,\n",
"            \"indicies\": [\n",
"                {\"label\": \"Author\", \"attr\": \"UniqueId\"},\n",
"                {\"label\": \"Paper\", \"attr\": \"UniqueId\"}\n",
"            ]\n",
"        }\n",
"    }\n",
"}"
] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Statements per request: 500\n", "Requests per transactions: 25\n", "Created index: CREATE INDEX ON :Author(UniqueId);\n", "Created index: CREATE INDEX ON :Paper(UniqueId);\n", "Load complete: merged 58500 edges in 0:00:28.459678\n" ] } ], "source": [
"px.execute_etl(arXiv_condmat_etl)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [
"**Not bad!** ~2000 merged edges/sec"
] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 rows affected.\n", "1 rows affected.\n", "1 rows affected.\n" ] } ], "source": [
"# This uses ipython-cypher by @versae.\n",
"num_rels = %cypher match (auth:Author)-[rels:IN]->(pap:Paper) return count(rels)\n",
"num_auths = %cypher match (auth:Author) return count(auth)\n",
"num_papers = %cypher match (pap:Paper) return count(pap)"
] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------+\n", "| count(rels) |\n", "+-------------+\n", "| 58595 |\n", "+-------------+ +-------------+\n", "| count(auth) |\n", "+-------------+\n", "| 16726 |\n", "+-------------+ +------------+\n", "| count(pap) |\n", "+------------+\n", "| 22015 |\n", "+------------+\n" ] } ], "source": [
"print(num_rels, num_auths, num_papers)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [
"#### Load the graph into memory for manipulation."
] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [
"# ETL spec that pulls the Author-[:IN]->Paper pairs back out of Neo4j and\n",
"# hands them to the neo4j2nx loader; each node gets a \"name\" and a \"type\".\n",
"neo4j2nx_etl = {\n",
"    \"extractor\": {\n",
"        \"neo4j\": {\n",
"            \"query\": \"match (n:Author)-[:IN]->(m:Paper) return n, m\",\n",
"            \"uri\": NEO4J_URI\n",
"        }\n",
"    },\n",
"    \"transformers\": [\n",
"        {\n",
"            \"node\": {\n",
"                \"pattern\": [{\"node\": {\"alias\": \"n\", \"unique\": \"UniqueId\"}}],\n",
"                \"set\": [\n",
"                    {\"key\": \"name\", \"value_lookup\": \"n.UniqueId\"},\n",
"                    {\"key\": \"type\", \"value\": \"Author\"}\n",
"                ]\n",
"            },\n",
"        },\n",
"        {\n",
"            \"node\": {\n",
"                \"pattern\": [{\"node\": {\"alias\": \"m\", \"unique\": \"UniqueId\"}}],\n",
"                \"set\": [\n",
"                    {\"key\": \"name\", \"value_lookup\": \"m.UniqueId\"},\n",
"                    {\"key\": \"type\", \"value\": \"Paper\"}\n",
"                ]\n",
"            },\n",
"        },\n",
"        {\n",
"            \"edge\": {\n",
"                \"pattern\": [\n",
"                    {\"node\": {\"alias\": \"n\", \"unique\": \"UniqueId\"}},\n",
"                    {\"edge\": {}},\n",
"                    {\"node\": {\"alias\": \"m\", \"unique\": \"UniqueId\"}}\n",
"                ]\n",
"            }\n",
"        }\n",
"    ],\n",
"    \"loader\": {\n",
"        \"neo4j2nx\": {}\n",
"    }\n",
"}"
] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [
"graph = px.execute_etl(neo4j2nx_etl)"
] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [
"# Count in-memory nodes by the \"type\" attribute set in neo4j2nx_etl.\n",
"# Separate names keep the Cypher results (num_auths, num_papers) intact.\n",
"nx_num_papers = sum(1 for _, attrs in graph.nodes(data=True) if attrs[\"type\"] == \"Paper\")\n",
"nx_num_authors = sum(1 for _, attrs in graph.nodes(data=True) if attrs[\"type\"] == \"Author\")"
] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "58595 16726 22015\n" ] } ], "source": [
"# Everything seems to be in order: same edge/author/paper counts as the Cypher queries.\n",
"print(len(graph.edges()), nx_num_authors, nx_num_papers)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [
"#### Project a one mode social network using Newman's method to calculate edge weight."
] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [
"projection = px.Projection(graph)"
] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [
"# projx query: match Author-Paper-Author paths, connect the two authors\n",
"# (\"method newman\" computes the edge weight), then delete the Paper nodes.\n",
"subgraph = projection.execute(\"\"\"\n",
" match (a1:Author)-(p:Paper)-(a2:Author)\n",
" project (a1)-(a2) method newman Paper\n",
" delete p\"\"\")"
] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "16264 47594\n" ] } ], "source": [
"# 16726 authors went in, 16264 are left: the ~500 missing ones only wrote solo.\n",
"print(len(subgraph.nodes()), len(subgraph.edges()))"
] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "count 47594.000000\n", "mean 0.571679\n", "std 0.809551\n", "min 0.058824\n", "25% 0.174242\n", "50% 0.333333\n", "75% 0.500000\n", "max 22.333333\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Summary statistics of the \"weight\" attribute over all projected edges.\n",
"eweight = pd.Series([attrs[\"weight\"] for _, _, attrs in subgraph.edges(data=True)])\n",
"eweight.describe()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [
"#### Plot degree distribution of projected network."
] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [
"def prob_dist(itrbl):\n",
"    \"\"\"Tally how often each distinct value occurs in ``itrbl``.\n",
"\n",
"    Returns a ``pd.Series`` indexed by value and holding raw occurrence\n",
"    counts; nothing is normalised, despite the function's name.\n",
"    \"\"\"\n",
"    tally = {}\n",
"    for value in itrbl:\n",
"        tally[value] = tally.get(value, 0) + 1\n",
"    return pd.Series(tally)"
] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": [
"iVBORw0KGgoAAAANSUhEUgAAAs4AAAGvCAYAAABLrFNOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n",
"OS8AWBQ7BwIAwBz0olQDmIVLZ5KTJ5JcLdV4Mbl8ZqFTAoAlplQDVtigznljZ/Dt0hn1zQCss0lz\n", "p+AMTJ3ADkAfCc5Ar+x18zg3WiKimwcACzdp7lTjDEzZxs6gBd7Vbh45npzaSSI4A7DUdNWAA9g4\n", "BADYT6kG7KPUYDL+9wOgr9Q4w5TZOGRyFgcC0EdqnIHeGQZlYRmAlSI4w3VsHAIAXE+pBhxAqQEA\n", "rJ6VqHFO8u4ku6213YVOBgCAlVNVW0m2kjy29MHZG2fgRvwXAACmYSXeOAvOwGG0twNgWnTVAFac\n", "nQgB6Ac7BwIAQAfeOAM9pz0gAP2gxhnoPYsDAZgGiwMBAKCDSXOnGmcAAOhAcAYAgA4EZ2AtVNV2\n", "1eaFwae2Fz0fAJaPGmdg5dlEBYDEBigAHdhEBYDJKdUAAIAOvHEG1oBNVACYnBpnYC3YRAUAG6AA\n", "zIiwDbBaBGeAGdCJA2D19LqrRlX9cJJ/l2QzydOttQ/O8n4A06MTBwDXmmlXjdbal1tr/zbJzyex\n", "4QAAAEtr7OBcVR+qqm9V1XP7jt9fVV+uqq9W1dtHjv9kkk8leXLy6QLMy6Uzg/KM8xl8Tr44OAbA\n", "uhq7xrmqXpfke0k+0lp7zfDYrUm+kuSNSV5I8rkkD7XWvjRy3Sdaaw8cMJ4aZ6CXLA4EWC1zr3Fu\n", "rX2mqu7ed/i1Sb7WWvv6cFJPJnmgqn4wyU8n+XtJ/vCokwRYhGFQFpYBSDK9xYGvSPL8yPdvJLmn\n", "tfbpJJ++2cVVdXrk625rbXdK8wIAYE1V1VaSrWmNN63gPFFPu9ba6SnNAwAAkiTDl7G7V79X1WOT\n", "jDetrhovJLlr5PtdGbx1BgCAlTCt4Pz5JK+qqrur6qVJ3pzkqSmNDdB7VbVdtXlh8CntNwFW0Nil\n", "GlX1RJLXJ9msqueTvKu19uGqelsGi2huTfLB0Y4aHcY8HbXNwJLa22Xw7NVdBk9UlV0GAXpiWrXO\n", "ttwGmFDV5oXk7L17uwyeT3LqmdYu3rfIeQFwrUlz50x3DgQAgFUxra4aAGvs0pnk5IkkV0s1Xkwu\n", "22UQYMX0olQjybujxhlYYnYZBOivkRrnxyYp1ehFcFbjDADArKlxBgCAORCcAQCgA8EZYI5slAKw\n", "vHrRVcMGKMA6sFEKwGLYAAVgydgoBWCxLA4EAIA56EWpBsB6sFEKwDJTqgEwRzZKAVicSXNnL4Jz\n", "7BwIkESwBpgFOwcCrJi9rhvnRks5dN0AmJJJc6caZ4De2NgZtKq72nUjx5NTO0kEZ4Ae0FUDAAA6\n", "8MYZoDd03QDoMzXOAD1icSDA7OiqAQAAN6CrBgAAjMGW2wBrpKq2qzYvDD61vej5AKwTb5wBloQ+\n", "zwCT0ccZYG3o8wywSEo1AACgA2+cAZaGPs8Ai6TGGWCJ6PMMcHT6OAMAwA3o4wwAAGPQxxkAAOZA\n", "cAYAgA4EZwAA6EBwBgCADgRngBVRVdtVmxcGn9pe9HwAVo2uGgArYBCU7/x4cm50c5QH9XkG2KOr\n", "BgAZbIpy7njycAafc8evbpTiTTTAdPRiy+2qOh0boABM3d6b6LNX30SfqCpvooG1MrIBymTjKNUA\n", "WH6HlWoM3jqfvXfwFjpJzic59UxrF+9b1FwBFkWpBgAZvEG+/GBy6pnBR30zwLR54wywwiwaBNgz\n", "ae4UnAFW3CA8DxYKJpfOCM3AuhKcAQCgAzXOAAAwB4IzAAB0IDgDAEAHgjMAAHQgOAOsKVtxA4xH\n", "Vw2ANaS/M7COJs2dt01zMgAsi42d5Ozxva24czw5tZNEcAY4hFINAADooBdvnKvqdJLd1trugqcC\n", 
"sCYunUlOnkgyWqpxZqFTApiRqtpKsjXxOGqcAdbTYVtx26IbWFW23AZgaiwaBFaZxYEATJFFgwCH\n", "sTgQAAA68MYZgBEWDQIcRo0zANewOBBYVRYHAgBAB5PmTjXOAADQgeAMwE1V1XbV5oXBp7YXPR+A\n", "RVCqAcAN6e0MrAo1zgDMVNXmheTsvXu9nc8n+ZU/To5dHHy3gBBYDjZAAWDOnkty648mZ4flfidP\n", "VJU30MDKE5wBuIn9vZ0/+P3k3C12FwTWjeAMwA211p6uqgeH4ThJNpP82CLnBLAIapwBGMthiwUH\n", "f7ZxCtBfFgcCMHf7dxcc/FPnDaDfBGcAFu7gzhunnmnt4n2LnBfAqF531aiqB5K8KcmdST7YWntm\n", "lvcDAIBZmcsb56r6gSSPt9becsDPvHEGWHI2SQGWwdxLNarqQxm8Rf52a+01I8fvT/K+JLcm+UBr\n", "7b0jP3s8ye+31v7kgPEEZ4AVsL/uWWgG+mYRwfl1Sb6X5CNXg3NV3ZrkK0nemOSFJJ9L8lCSLyf5\n", "z0kutNb+zyz+AgAA0MXca5xba5+pqrv3HX5tkq+11r4+nNSTSR7IIEi/IcmdVfUPW2u/c9SJArBc\n", "Du684Y00sLymtTjwFUmeH/n+jST3tNZ+NclvTukeACyJvZrns8Oa57f++OBfOWePDb7bphtYPtMK\n", "zhOtMKyq0yNfd1truxPNBoAF29gZhOar7enefyx5NLbpBuapqraSbE1rvGkF5xeS3DXy/a4M3jp3\n", "0lo7PaV5AABAkmT4Mnb36veqemyS8aYVnD+f5FXD2udvJnlzBosDAVhLl84kJ08kGZZqfOFKcjJJ\n", "rpZqvJhcPrOgyQEcyVG6ajyR5PVJNpN8O8m7WmsfrqqfyF47ug+21n6j43gtybujRANgpVgcCPTF\n", "SMnGY7bcBgCAm5g0d94yzckAAMCqEpwBmLuq2q7avDD41Pai5wPQxbQWB05k2I5OjTPAGri+x7Oe\n", "zsBsTastnRpnAOaqavNCcvbevZ7O55Oceqa1i/ctcl7A6lPjDAAAc9CLUg0A1sn+Hs96OgPLQakG\n", "AHO3v8ez+mZgHibNnb0IzrEBCgAAM2IDFAAAGIPFgQAAMAeCMwAAdCA4AwBAB71oR2fnQAAAZsXO\n", "gQCsDO3pgHlYiXZ0gjPA+hqE5js/npwb3RDlQeEZmLZJc2cvSjUAWGcbO8nZ48nDVw8cT07tJBGc\n", "gV6xOBCAXquq7arNC4NPbS96PsD6UqoBwELdqFRDGQcwTStRqqGrBsD6GgbkB4flGUkujywOVMYB\n", "TG5aXTV6EZxba6cXPQcAFmcYlIVhYCaGL2d3q+qxScbpRXAGgINdOpOcPJFktFTjzEKnBKwtNc4A\n", "9Joez8C06OMMAAAdTJo7taMDAIAOBGcAAOhAcAYAgA560VVDH2cAAGZlWn2cLQ4EYGnosAFMQlcN\n", "AFbKYeH4+u2333oleen/TW65KEQDXQjOAKyM68PxyReTyw8OtuXevJCcvXew/fbTSX4hyePZf95i\n", "Zg4sg0lzZy9qnAFgYGMnOXt8EI6TJMeTUzu5bjvu380gNN/sPIDpEZwBWBKj229/c9GTAdaQUg0A\n", "euNGpRp7P9/YSa5sJrf+4+TcsYPOAziIGmcAVkrXzhk6bADjEpwBAKCDSXNnL3YOrKrTw8bUAHBk\n", "VbVdtXlh8KntRc8H6Ieq2hpuuDfZON44A7AKblYfDaAdHQAk6d7KDuBoelGqAQAAfeeNMwArYrTP\n", "czIs1Tiz0CkBK0WNMwBL79r+zi9JcstFLeqA/dQ4A7DW9hYFnh150/zdAzZNSYRpYBKCMwBL7vBF\n", 
"gQeE6hNVpdMGcCQWBwKwwjZ2Bu3pHs7g88jxZON/6PMMHIU3zgAsua6LAp9Ocj7J2c0k93r7DIzL\n", "4kAAlt5hdczXbory/iSPZq+k43ySU8+0dvG+RcwZmD+LAwFYe8OgfM2b45FOG19K/n2SW16ZZHMh\n", "EwRWguAMwMo5pNPGe5KT74w+z8ARCc4ArKADO21sJZceHHbcSHJZazpgLL0IzlV1Oslua213wVMB\n", "YIUdVNIBrL6q2kqyNfE4FgcCsGquXRSYDMsydNCANTdp7hScAVhJdgwE9hOcAQCgg0lzp50DAQCg\n", "A8EZAAA6EJwBAKADwRkAADoQnAEAoAPBGYC1VlXbVZsXBp/aXvR8gP7Sjg6AtWWjFFgvk+bOXmy5\n", "DQCLsbGTnD2ePHz1wPHk1E5syw0cQKkGABxACQewn1INANbWYaUagz8r4YBVY8ttAJjAIDxv7Ay+\n", "XTrTWnu6avNCcvbevRKO80lOPdPaxfsWNU9gcrbcBoAxjZZhJMkgEF86k2zsDI59f/P6q65sDq55\n", "2bNVL39WCQesH2+cAVgrh5RnvCe58517x956ZbB+/tyxa78/cmzw9vnxjFyrhAOWhK4aADCWAztp\n", "nNp37FjyK3+cnLo4+PrSzeR9P5Y8lUFo1oUD1tFMSzWq6oeq6gNV9bFZ3gcApu/YxdYu3jco47jl\n", "4qJnAyzeTN84t9b+NMlbBGcA+uPSmeTkiSSjpRpnk5Pv3HfszPXXPHI8+bWRsfafB6yysWucq+pD\n", "Sd6U5NuttdeMHL8/yfuS3JrkA62194787GOttZ87ZDw1zgDM1cGdNK4/dvA1VzaTl2TwFvr684D+\n", "mns7uqp6XZLvJfnI1eBcVbcm+UqSNyZ5IcnnkjzUWvvS8OeCMwAACzX3dnSttc8k+c6+w69N8rXW\n", "2tdba3+d5MkkD1TVRlW9P8mPVtXbjzpJAABYtGnVOL8iyfMj37+R5J7W2qUkj07pHgAAsDDTCs4T\n", "NYOuqtMjX3dba7sTzQYAjuhmtc7A8qiqrSRb0xpvWsH5hSR3jXy/K4O3zp201k5PaR4AcGR7m6Oc\n", "vdpd40RV2eAEltTwZezu1e9V9dgk400rOH8+yauq6u4k30zy5iQPTWlsAJiTAzdHscEJkOQIiwOr\n", "6okkn03y6qp6vqp+qbX2N0nelsEvli8m+ejVjhodxzw9fJUOAABTVVVb+0qDjzbOuO3opk07OgD6\n", "Yq9U49zoRigHlmro6wzLZ+59nKdNcAagT7osDtwL2I8cT84neXz4k8ODNrB4gjMAzFnV5oXk7L3J\n", "U0l+Kns10eeTnHqmtYv3LW52wGEmzZ3TWhw4kWHNiTZ0AABM3bTa0nnjDABjUqoBy0mpBgAsgMWB\n", "sHwEZwAA6GDS3Dl2H2cAAFhHFgcCALDSLA4EAIAxKNUAgB6oqu2qlz1btfkXVS9/drB4EFgl3jgD\n", "wIQGIfn2TyS3HxtpTXclufyALhvQHyuxAQoALLeNneTVx5JHs7eLYI4lp3aSCM6wInoRnC0OBABg\n", "ViwOBICeUKoBy8EGKADQA4PwfMevJ8demXz/z5LvvkNohn4RnAGgZ/a2406SS7vJxlaXrbmvDd9/\n", "9Z3kpZdt5Q3TIzgDQI8Mwu+dH0/OHU+eS/J7SR5Jcj4jZRwvJpcfHA3D15Z7PHzT84Hx6eMMAL2y\n", "sTMIzQ8n+dMk54b/fDyDYw9n8POrb6RHr/snwxrpLucD86arBgAAK01XDQDoIaUa0F9qnAGgZywO\n", "hH4SnAEAoAOLAwEAYA4EZwC4garartq8MPjU9qLnAyyOUg0AOMS1C/0Si/RguU2aO3vRjg4A+mlj\n", 
"Jzk77MmcJDmenNpJIjjDGupFcNbHGQCAWdHHGQBmTKkGrBbt6ABghvb1ZNZPGZaY4AwAAB3o4wwA\n", "AHMgOAMAQAeCMwAAdCA4A8AcDXYifNmzVZt/UXXHVwefv///qjYuV7382XntTmhHRBifxYEAMCeD\n", "gHr7J5Lbjw02Vfm94U9emuTx4Z9PXkkuPzDL7h3a7LGu7BwIAEtjYyd59bHk0SRPJfmR4fFHM7I7\n", "4bHZ705oR0Q4il4EZzsHAgAwK3YOBIAlo1QDFssGKACwRAah9Y5fT469Mvmr7wyO3vKDyUsqaV9N\n", "vvuOeQRYOyKyjgRnAADowM6BAAAwB4IzAAB0IDgDAEAHgjMAAHQgOAMAQAeCMwAAdCA4A8CMVNV2\n", "1eaFwae2J7z+HfvHOmj8Se8JHE4fZwCYgUl357v2+ucy2GXwXEbGek9y5zv3jX/QMTsCwtCkufO2\n", "aU4GALhqYyc5e3ywtXaS5HhyaidJxxA7ev3PZBCarxnr1AHjH3RsjHsCN9KL4FxVp5PsttZ2FzwV\n", "AABWTFVtJdmaeBylGgAwfUo1oH8mzZ2CMwDMyCD8buwMvl06M26A3Xf9brKxNTrWQeNPek9YZYIz\n", "AAB0MGnu1I4OAAA6EJwBAKADwRkAADoQnAEAoAPBGQAAOhCcAQCgA8EZAAA6EJwBAKADwRkAADoQ\n", "nAEAoAPBGQAAOhCcAQCgA8EZAAA6EJwBAKADwRkAADq4bZaDV9UdSX47yZUku621/znL+wEAwKzM\n", "+o3zTyf5g9bav07yUzO+FwAAzMzYwbmqPlRV36qq5/Ydv7+qvlxVX62qtw8PvyLJ88M//+2EcwWA\n", "tVZV21WbFwaf2r7Rz2507s3GHBx72bNVm39R9fJnD7r+RufsjfmyZwc/u/EcYFlUa228C6pel+R7\n", "ST7SWnvN8NitSb6S5I1JXkjyuSQPJfmnSb7TWvtUVT3RWnvogPFaa60m+2sAwGobBM87P56cOz44\n", "cvLF5PKDrbWnr//ZW68MqjHPHdt/bocx35Pc/q7k9mPJ48MzT15JLj9w9frBdbd/4qBzBn++8+PJ\n", "I8eT8xn5+YFzgHmaNHeOXePcWvtMVd297/Brk3yttfb14aSeTPJAknNJfquq3pTkqaNOEgDY2EnO\n", "Hk8evnrgeHJqJ8nT1//s/ceSR3PwuTcd81Ty6v3XH7v2+o2dG5yTwZhPZRCabzYHWB7TWhw4WpKR\n", "JN9Ick9r7S+T/PLNLq6q0yNfd1tru1OaFwAAa6qqtpJsTWu8aQXn8eo99l/c2ukpzQMAVtSlM8nJ\n", "E0lGyyrOHPyzL1xJTibJsevPvemYZ5MvvCv5tWN75528cu31l84kX/jxw885eWJQqvFrI/c6bA4w\n", "O8OXsbtXv1fVY5OMN3aN8/Cmdyf55EiN8z9Lcrq1dv/w+39K8v3W2ns7jKXGGQA6GNQWbwzLIS6d\n", "Ga0X3v+zwT8PPvdmYw6O3fHrybFXJt//s+S77zi4Pvrgc/bGvLKZvCTJLRdvNAeYl0lz57SC820Z\n", "LA58Q5JvJvmjJA+11r7UYayW5N1RogEAwAyMlGw8NtfgXFVPJHl9ks0k307yrtbah6vqJ5K8L8mt\n", "ST7YWvuNjuN54wwAwMwt5I3zNAnOAADMw6S5c9Y7BwIAwEqYVleNiQzb0alxBgBg6qbVlk6pBgAA\n", "a0GpBgAAzIHgDAAAHQjOAADQgcWBAACsNIsDAQBgDBYHAgDAHAjOAADQgeAMAAAdWBwIAMBKszgQ\n", "AADGYHEgAADMgeAMAAAdCM4AANCB4AwAAB3oqgEAwErTVQMAAMagqwYAAMyB4AwAAB0IzgAA0IHg\n", 
"DAAAHQjOAADQgeAMAAAd6OMMAMBK08cZAADGoI8zAADMgeAMAAAdCM4AANCB4AwAAB0IzgAA0IHg\n", "DAAAHQjOAADQgeAMAAAd2DkQAICVZudAAAAYg50DAQBgDgRnAADoQHAGAIAOBGcAAOhAcAYAgA4E\n", "ZwAA6EBwBgCADgRnAADoQHAGAIAOBGcAAOhAcAYAgA5uW/QEkqSqTifZba3tLngqAACsmKraSrI1\n", "8TittYknM9EEqlprrRY6CQAAVt6kuVOpBgAAdCA4AwBAB4IzAAB0IDgDAEAHgjMAAHQgOAMAQAeC\n", "MwAAdCA4AwBAB4IzAAB0IDgDAEAHgjMAAHQgOAMAQAeCMwAAdCA4AwBAB4IzAAB0IDgDAEAHgjMA\n", "AHQw0+BcVT9UVR+oqo/N8j4AADBrMw3OrbU/ba29ZZb3gFmqqq1FzwEO4tmkrzybrLJOwbmqPlRV\n", "36qq5/Ydv7+qvlxVX62qt89mirBQW4ueABxia9ETgENsLXoCMCtd3zh/OMn9oweq6tYkvzU8/iNJ\n", "Hqqqf1RVv1hV/6Wq/sF0p9p/i/j/smdxz0nHPOr141zX5dxpnbMKFvX3nPZ9pzHeUcaY9rPZ5TzP\n", "5vLddxG/O8e9xu/O8fj3+mTXr+Lvzk7BubX2mSTf2Xf4tUm+1lr7emvtr5M8meSB1tp/b639h9ba\n", "N6tqo6ren+RH1+SN9NaK3HPSMY96/TjXdTl3Wuesgq0Vue80xjvKGONc0/Xcm503zj2X2dYK3XfS\n", "MY9y/bjXdDl/Wuesgq0VueekYx71+nGu63ruzc4b555jq9ZatxOr7k7yydbaa4bffzbJdmvtkeH3\n", "X0hyT2vtV8eaQFW3CQAAwIRaa3XUa2+b5L4TXLs3yASTBwCAeZmkq8YLSe4a+X5Xkm9MNh0AAOin\n", "SYLz55O8qqrurqqXJnlzkqemMy0AAOiXru3onkjy2SSvrqrnq+qXWmt/k+RtSZ5O8sUkH22tfWl2\n", "UwUAgMXpvDgQAADW2Ux3DpyE7brpm6q6o6rOV9XvVtW/WPR8YJTfmfRVVT0w/L35ZFXdu+j5wKiq\n", "+uGq+m9V9QdV9a9uen7f3zhX1cdaaz+36HlAVf1ikkuttU9V1ZOttZ9f9JxgP78z6auq+oEkj7fW\n", "3rLoucB+VXVLkidba//8RufN/I2z7brpszGfz1ckeX7457+d60RZS35/0ldHfDbfmcGOwzBT4z6f\n", "VfWTST6VwWZ+NzSPUg3bddNnnZ/PDNotXm3B2NsyJ1bKOM8nzNM4/26vqnpvkv/dWvuT+U+VNTTW\n", "787W2idbaz+R5OGbDTzzf/nbrps+G+f5TPK/kvxMVf12tF5kDsZ5Pv3OZJ7G/N35tiRvSPKzVfVv\n", "5jtT1tGYvztfX1X/tap+J8kf3mzsSXYOnMTof/JOBm/y7hk9obV2Kcmj85wUDB34fLbW/jLJLy9m\n", "SvB3Dns+/c5k0Q57Nn81yW8uZkrwdw57Pj+d5NNdB1nUf27u94pE1p3nkz7zfNJXnk36bCrP56KC\n", "s+266TPPJ33m+aSvPJv02VSez0UFZ9t102eeT/rM80lfeTbps6k8n/NoR2e7bnrL80mfeT7pK88m\n", "fTbL57P3G6AAAEAf6EULAAAdCM4AANCB4AwAAB0IzgAA0IHgDAAAHQjOAADQgeAMAAAdCM4AANCB\n", "4AwAAB38f9CGVdgyBN6eAAAAAElFTkSuQmCC\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "deg_dist = prob_dist(nx.degree(subgraph).values())\n", 
"plt.scatter(deg_dist.index, deg_dist)\n",
"plt.xscale(\"log\")\n",
"plt.yscale(\"log\")"
] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# Dump the projected edge list as tab-separated `source<TAB>target` lines.\n",
"# The Groovy script in load_edges() opens a file with this same name.\n",
"with open('output.txt', 'w') as f:\n",
"    for s, t in subgraph.edges():\n",
"        f.write(str(s) + '\\t' + str(t) + '\\n')"
] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [
"#### Load the social network into Titan using the gizmo driver.\n",
"\n",
"**Note:** This is just a quick demo to try out the driver, I've made no attempt to do any sort of batch loading/server tuning.\n",
"\n",
"You can get Titan 0.9.0-M1 [here](https://groups.google.com/forum/#!msg/aureliusgraphs/_onvDrvBEwk/lHCNMqefzacJ). It comes packaged with the TP3 Gremlin Server, unpack and...\n",
"\n",
"Fire up the Gremlin Server:\n",
"\n",
"```bash\n",
"./bin/gremlin-server.sh \n",
"```"
] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [], "source": [
"def _submit_and_report(gc, script):\n",
"    \"\"\"Run ``script`` as a task on ``gc`` and block until the task finishes.\n",
"\n",
"    The consumer prints the status code of the response it is given.\n",
"    \"\"\"\n",
"    task = gc.task(gc.submit, script,\n",
"                   consumer=lambda x: print(\"Commited tx with response code: {}\".format(x.status_code)))\n",
"    gc.run_until_complete(task)\n",
"\n",
"\n",
"def build_schema(gc):\n",
"    \"\"\"Create the schema this demo needs and commit it.\n",
"\n",
"    The submitted script defines an Integer ``uniqueId`` property key, a\n",
"    unique composite vertex index ``byId`` on that key, and the ``collabs``\n",
"    edge label, then commits the management transaction.\n",
"\n",
"    gc -- client used to submit the script (an AsyncGremlinClient in this notebook).\n",
"    \"\"\"\n",
"    script = \"\"\"\n",
" mgmt = g.openManagement();\n",
" uniqueId = mgmt.makePropertyKey('uniqueId').dataType(Integer.class).make();\n",
" mgmt.buildIndex('byId', Vertex.class).addKey(uniqueId).unique().buildCompositeIndex();\n",
" collabs=mgmt.makeEdgeLabel('collabs').make();\n",
" mgmt.commit();\"\"\"\n",
"    _submit_and_report(gc, script)\n",
"\n",
"\n",
"def load_edges(gc):\n",
"    \"\"\"Load the edges listed in ``output.txt`` and print how long it took.\n",
"\n",
"    The submitted script reads ``output.txt`` line by line, splits each line\n",
"    on a tab, gets or creates one vertex per ``uniqueId`` and adds a\n",
"    ``collabs`` edge between the pair; a single commit follows the loop.\n",
"\n",
"    NOTE(review): the file is opened by the script itself, so the path is\n",
"    presumably resolved where the script runs -- confirm the working directory.\n",
"\n",
"    gc -- client used to submit the script (an AsyncGremlinClient in this notebook).\n",
"    \"\"\"\n",
"    start = datetime.now()\n",
"    script = \"\"\" \n",
" getOrCreate = { id ->\n",
" def n = g.V().has('uniqueId', id)\n",
" if (n.hasNext()) {n.next()} else {g.addVertex(\"uniqueId\", id)}\n",
" }\n",
"\n",
" new File('output.txt').eachLine { \n",
" (fromVertex, toVertex) = it.split('\\t').collect(getOrCreate)\n",
" fromVertex.addEdge('collabs', toVertex)\n",
" }\n",
"\n",
" g.tx().commit()\"\"\"\n",
"    _submit_and_report(gc, script)\n",
"    print(\"Loaded in {}\".format(datetime.now() - start))\n",
"\n",
"\n",
"@asyncio.coroutine\n",
"def _print_query_result(gc, query):\n",
"    \"\"\"Coroutine: submit ``query`` with ``collect=False`` and ``print`` as the consumer.\"\"\"\n",
"    yield from gc.submit(query, collect=False, consumer=print)\n",
"\n",
"\n",
"@asyncio.coroutine\n",
"def count_nodes(gc):\n",
"    \"\"\"Coroutine: print the result of ``g.V().count()``.\"\"\"\n",
"    yield from _print_query_result(gc, \"g.V().count()\")\n",
"\n",
"\n",
"@asyncio.coroutine\n",
"def count_edges(gc):\n",
"    \"\"\"Coroutine: print the result of ``g.E().count()``.\"\"\"\n",
"    yield from _print_query_result(gc, \"g.E().count()\")"
] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [
"gc = AsyncGremlinClient()"
] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Commited tx with response code: 200\n" ] } ], "source": [
"build_schema(gc)"
] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Commited tx with response code: 200\n", "Loaded in 0:00:16.713325\n" ] } ], "source": [
"load_edges(gc)"
] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[16264]\n" ] } ], "source": [
"gc.run_until_complete(count_nodes(gc))"
] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[47594]\n" ] } ], "source": [
"gc.run_until_complete(count_edges(gc))"
] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [
"#### (I'm sure you can do that much faster with some config tricks, I'm not exactly a Gremlin expert either...)"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 },
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.0" } }, "nbformat": 4, "nbformat_minor": 0 }