{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IPython Graphs \n",
"\n",
"## This notebook uses: Neo4j 2.1.6, Titan 0.9.0-M1(Berkeley DB) w/TP3 Gremlin Server 3.0.0-M7, ipython-cypher 0.2.1, projx 0.3.6(dev), Pandas 0.15.2, NetworkX 1.9, and gizmo 0.1.7(dev)\n",
"\n",
"### This notebook is NOT meant to be a benchmark!!! It's just an example that plays around with some code that I've been working on using some libraries and DBs that I like."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"%load_ext cypher"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import asyncio\n",
"import networkx as nx\n",
"import pandas as pd\n",
"import projx as px\n",
"import matplotlib.pyplot as plt\n",
"from datetime import datetime\n",
"from gizmo import AsyncGremlinClient"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.rcParams['figure.figsize'] = 12, 7"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load a bipartite network stored as an edgelist into Neo4j with projx using a Cypher MERGE.\n",
"\n",
"Fire up the Neo4j server first:\n",
"\n",
"```bash\n",
"./neo4j-community-2.1.6/bin/neo4j console\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The arXiv cond-mat data set: http://konect.uni-koblenz.de/networks/opsahl-collaboration\n",
"arXiv_condmat_etl = {\n",
" \"extractor\": {\n",
" \"edgelist\": {\n",
" \"filename\": \"data/opsahl-collaboration/out.opsahl-collaboration\",\n",
" \"delim\": \" \",\n",
" \"pattern\": [\n",
" {\"node\": {\"alias\": \"n\"}},\n",
" {\"edge\": {}},\n",
" {\"node\": {\"alias\": \"m\"}}\n",
" ]\n",
" }\n",
" },\n",
" \"transformers\": [\n",
" {\n",
" \"edge\": {\n",
" \"pattern\": [\n",
" {\"node\": {\"alias\": \"n\", \"label\": \"Author\"}},\n",
" {\"edge\": {\"label\": \"IN\"}},\n",
" {\"node\": {\"alias\": \"m\", \"label\": \"Paper\"}}\n",
" ]\n",
" }\n",
" }\n",
" ],\n",
" \"loader\": {\n",
" \"edgelist2neo4j\": {\n",
" \"uri\": \"http://localhost:7474/db/data\",\n",
" \"stmt_per_req\": 500,\n",
" \"req_per_tx\": 25,\n",
" \"indicies\": [\n",
" {\"label\": \"Author\", \"attr\": \"UniqueId\"},\n",
" {\"label\": \"Paper\", \"attr\": \"UniqueId\"}\n",
" ]\n",
" }\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Statements per request: 500\n",
"Requests per transactions: 25\n",
"Created index: CREATE INDEX ON :Author(UniqueId);\n",
"Created index: CREATE INDEX ON :Paper(UniqueId);\n",
"Load complete: merged 58500 edges in 0:00:28.459678\n"
]
}
],
"source": [
"px.execute_etl(arXiv_condmat_etl)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Not bad!** ~2000 merged edges/sec"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n",
"1 rows affected.\n",
"1 rows affected.\n"
]
}
],
"source": [
"# This uses ipython-cypher by @versae.\n",
"num_rels = %cypher match (auth:Author)-[rels:IN]->(pap:Paper) return count(rels)\n",
"num_auths = %cypher match (auth:Author) return count(auth)\n",
"num_papers = %cypher match (pap:Paper) return count(pap)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+\n",
"| count(rels) |\n",
"+-------------+\n",
"| 58595 |\n",
"+-------------+ +-------------+\n",
"| count(auth) |\n",
"+-------------+\n",
"| 16726 |\n",
"+-------------+ +------------+\n",
"| count(pap) |\n",
"+------------+\n",
"| 22015 |\n",
"+------------+\n"
]
}
],
"source": [
"print(num_rels, num_auths, num_papers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load the graph into memory for manipulation."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"neo4j2nx_etl = {\n",
" \"extractor\": {\n",
" \"neo4j\": {\n",
" \"query\": \"match (n:Author)-[:IN]->(m:Paper) return n, m\",\n",
" \"uri\": \"http://localhost:7474/db/data\"\n",
" }\n",
" }, \n",
" \"transformers\": [\n",
" {\n",
" \"node\": {\n",
" \"pattern\": [{\"node\": {\"alias\": \"n\", \"unique\": \"UniqueId\"}}],\n",
" \"set\": [\n",
" {\"key\": \"name\", \"value_lookup\": \"n.UniqueId\"},\n",
" {\"key\": \"type\", \"value\": \"Author\"}\n",
" ]\n",
" },\n",
" },\n",
" {\n",
" \"node\": {\n",
" \"pattern\": [{\"node\": {\"alias\": \"m\", \"unique\": \"UniqueId\"}}],\n",
" \"set\": [\n",
" {\"key\": \"name\", \"value_lookup\": \"m.UniqueId\"},\n",
" {\"key\": \"type\", \"value\": \"Paper\"}\n",
" ]\n",
" },\n",
" },\n",
" {\n",
" \"edge\": {\n",
" \"pattern\": [\n",
" {\"node\": {\"alias\": \"n\", \"unique\": \"UniqueId\"}}, \n",
" {\"edge\": {}}, \n",
" {\"node\": {\"alias\": \"m\", \"unique\": \"UniqueId\"}}\n",
" ] \n",
" }\n",
" }\n",
" ], \n",
" \"loader\": {\n",
" \"neo4j2nx\": {}\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"graph = px.execute_etl(neo4j2nx_etl)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"num_papers = len([n for n, k in graph.nodes(data=True) if k[\"type\"] == \"Paper\"])\n",
"num_authors = len([n for n, k in graph.nodes(data=True) if k[\"type\"] == \"Author\"])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"58595 16726 22015\n"
]
}
],
"source": [
"# Everything seems to be in order.\n",
"print(len(graph.edges()), num_authors, num_papers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Project a one mode social network using Newman's method to calculate edge weight."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"p = px.Projection(graph)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"subgraph = p.execute(\"\"\"\n",
" match (a1:Author)-(p:Paper)-(a2:Author)\n",
" project (a1)-(a2) method newman Paper\n",
" delete p\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16264 47594\n"
]
}
],
"source": [
"print(len(subgraph.nodes()), len(subgraph.edges())) # ~500 authors that wrote a single paper solo."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"count 47594.000000\n",
"mean 0.571679\n",
"std 0.809551\n",
"min 0.058824\n",
"25% 0.174242\n",
"50% 0.333333\n",
"75% 0.500000\n",
"max 22.333333\n",
"dtype: float64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eweight = pd.Series([a[\"weight\"] for (s, t, a) in subgraph.edges(data=True)])\n",
"eweight.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Plot degree distribution of projected network."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def prob_dist(itrbl):\n",
" count = {}\n",
" for i in itrbl:\n",
" count.setdefault(i, 0)\n",
" count[i] += 1\n",
" return pd.Series(count)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": [
"iVBORw0KGgoAAAANSUhEUgAAAs4AAAGvCAYAAABLrFNOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n",
"AAALEgAACxIB0t1+/AAAHj5JREFUeJzt3X2IZed9H/DvT5K9XckV8QzkjxphBWqTpriEFKz+sY4H\n",
"bGkUTCKUlzpqE0RSq1UdZ9vuhJoaY63BJDVot+4mpE7iF9ZNKzmGGsuYohUlYwz+I7ZCsKhfsCEO\n",
"sgx2smt3MVGWJH76x72buTs7s3vu3Ldz7/184KK9Z855zrPOYfTN0e/5PdVaCwAAcGO3LHoCAACw\n",
"DARnAADoQHAGAIAOBGcAAOhAcAYAgA4EZwAA6EBwBgCADgRnAADoYObBuaruqKrPVdWbZn0vAACY\n",
"lXm8cf6PST46h/sAAMDMjB2cq+pDVfWtqnpu3/H7q+rLVfXVqnr78Ni9Sb6Y5M+nM10AAFiMaq2N\n",
"d0HV65J8L8lHWmuvGR67NclXkrwxyQtJPpfkoST/MskdSX4kyYtJHmzj3hAAAHrgtnEvaK19pqru\n",
"3nf4tUm+1lr7epJU1ZNJHmitvXP4/eEkfy40AwCwrMYOzod4RZLnR75/I8k9V7+01s4fdmFVCdMA\n",
"AMxFa60muXjsT5K7kzw38v1nkvzeyPdfSPKbHcdqR5lDHz9JTq/CPScd86jXj3Ndl3Oncc4i/m86\n",
"i8+i/h7Tvu80xjvKGNN+Nruc59lcvvsu4nfnuNfM43fnqjybi/q7rMqzOe51c/zd2Sb532JaXTVe\n",
"SHLXyPe7MnjrvG52V+Sek4551OvHua7LudM6ZxXsrsh9pzHeUcYY55qu597svHHuucx2V+i+k455\n",
"lOvHvabL+dM6ZxXsrsg9Jx3zqNePc13Xc2923jj3HNvYiwOTZFjj/Mm2tzjwtgwWB74hyTeT/FGS\n",
"h1prX+owVmuTvDKHGaqq062104ueB+zn2aSvPJv02aS58yjt6J5I8tkkr66q56vql1prf5PkbUme\n",
"zqD93Ee7hOaRMU9X1da4c4E52F30BOAQu4ueABxid9ETgP2qaquqTk88zlHeOE+TN84AAMzD3N84\n",
"AwDAOhKcAQCgg2n1cZ7IsOZkt7W2u+CpAACwYoZr6bYmHkeNMwAA60CNMwAAzIHgDAAAHahxBgBg\n",
"palxBgCAMahxBgCAORCcAQCgA8EZAAA6EJwBAKADXTUAAFhpumoAAMAYdNUAAIA5EJwBAKADwRkA\n",
"ADoQnAEAoANdNQAAWGm6agAAwBh01QAAgDkQnAEAoAPBGQAAOhCcAQCgA8EZAAA60I4OAICVph0d\n",
"AACMQTs6AACYA8EZAAA6EJwBAKADwRkAADoQnAEAoAPBGQAAOhCcAQCgA8EZVlhVbVdtXhh8anvR\n",
"8wGAZWbnQFhRg6B858eTs8cHR06eqKoHW2tPL3ZmADBfdg4Ebqhq80Jy9t7k4eGR80lOPdPaxfsW\n",
"OS8AWBQ7BwIAwBz0olQDmIVLZ5KTJ5JcLdV4Mbl8ZqFTAoAlplQDVtigznljZ/Dt0hn1zQCss0lz\n",
"p+AMTJ3ADkAfCc5Ar+x18zg3WiKimwcACzdp7lTjDEzZxs6gBd7Vbh45npzaSSI4A7DUdNWAA9g4\n",
"BADYT6kG7KPUYDL+9wOgr9Q4w5TZOGRyFgcC0EdqnIHeGQZlYRmAlSI4w3VsHAIAXE+pBhxAqQEA\n",
"rJ6VqHFO8u4ku6213YVOBgCAlVNVW0m2kjy29MHZG2fgRvwXAACmYSXeOAvOwGG0twNgWnTVAFac\n",
"nQgB6Ac7BwIAQAfeOAM9pz0gAP2gxhnoPYsDAZgGiwMBAKCDSXOnGmcAAOhAcAYAgA4EZ2AtVNV2\n",
"1eaFwae2Fz0fAJaPGmdg5dlEBYDEBigAHdhEBYDJKdUAAIAOvHEG1oBNVACYnBpnYC3YRAUAG6AA\n",
"zIiwDbBaBGeAGdCJA2D19LqrRlX9cJJ/l2QzydOttQ/O8n4A06MTBwDXmmlXjdbal1tr/zbJzyex\n",
"4QAAAEtr7OBcVR+qqm9V1XP7jt9fVV+uqq9W1dtHjv9kkk8leXLy6QLMy6Uzg/KM8xl8Tr44OAbA\n",
"uhq7xrmqXpfke0k+0lp7zfDYrUm+kuSNSV5I8rkkD7XWvjRy3Sdaaw8cMJ4aZ6CXLA4EWC1zr3Fu\n",
"rX2mqu7ed/i1Sb7WWvv6cFJPJnmgqn4wyU8n+XtJ/vCokwRYhGFQFpYBSDK9xYGvSPL8yPdvJLmn\n",
"tfbpJJ++2cVVdXrk625rbXdK8wIAYE1V1VaSrWmNN63gPFFPu9ba6SnNAwAAkiTDl7G7V79X1WOT\n",
"jDetrhovJLlr5PtdGbx1BgCAlTCt4Pz5JK+qqrur6qVJ3pzkqSmNDdB7VbVdtXlh8CntNwFW0Nil\n",
"GlX1RJLXJ9msqueTvKu19uGqelsGi2huTfLB0Y4aHcY8HbXNwJLa22Xw7NVdBk9UlV0GAXpiWrXO\n",
"ttwGmFDV5oXk7L17uwyeT3LqmdYu3rfIeQFwrUlz50x3DgQAgFUxra4aAGvs0pnk5IkkV0s1Xkwu\n",
"22UQYMX0olQjybujxhlYYnYZBOivkRrnxyYp1ehFcFbjDADArKlxBgCAORCcAQCgA8EZYI5slAKw\n",
"vHrRVcMGKMA6sFEKwGLYAAVgydgoBWCxLA4EAIA56EWpBsB6sFEKwDJTqgEwRzZKAVicSXNnL4Jz\n",
"7BwIkESwBpgFOwcCrJi9rhvnRks5dN0AmJJJc6caZ4De2NgZtKq72nUjx5NTO0kEZ4Ae0FUDAAA6\n",
"8MYZoDd03QDoMzXOAD1icSDA7OiqAQAAN6CrBgAAjMGW2wBrpKq2qzYvDD61vej5AKwTb5wBloQ+\n",
"zwCT0ccZYG3o8wywSEo1AACgA2+cAZaGPs8Ai6TGGWCJ6PMMcHT6OAMAwA3o4wwAAGPQxxkAAOZA\n",
"cAYAgA4EZwAA6EBwBgCADgRngBVRVdtVmxcGn9pe9HwAVo2uGgArYBCU7/x4cm50c5QH9XkG2KOr\n",
"BgAZbIpy7njycAafc8evbpTiTTTAdPRiy+2qOh0boABM3d6b6LNX30SfqCpvooG1MrIBymTjKNUA\n",
"WH6HlWoM3jqfvXfwFjpJzic59UxrF+9b1FwBFkWpBgAZvEG+/GBy6pnBR30zwLR54wywwiwaBNgz\n",
"ae4UnAFW3CA8DxYKJpfOCM3AuhKcAQCgAzXOAAAwB4IzAAB0IDgDAEAHgjMAAHQgOAOsKVtxA4xH\n",
"Vw2ANaS/M7COJs2dt01zMgAsi42d5Ozxva24czw5tZNEcAY4hFINAADooBdvnKvqdJLd1trugqcC\n",
"sCYunUlOnkgyWqpxZqFTApiRqtpKsjXxOGqcAdbTYVtx26IbWFW23AZgaiwaBFaZxYEATJFFgwCH\n",
"sTgQAAA68MYZgBEWDQIcRo0zANewOBBYVRYHAgBAB5PmTjXOAADQgeAMwE1V1XbV5oXBp7YXPR+A\n",
"RVCqAcAN6e0MrAo1zgDMVNXmheTsvXu9nc8n+ZU/To5dHHy3gBBYDjZAAWDOnkty648mZ4flfidP\n",
"VJU30MDKE5wBuIn9vZ0/+P3k3C12FwTWjeAMwA211p6uqgeH4ThJNpP82CLnBLAIapwBGMthiwUH\n",
"f7ZxCtBfFgcCMHf7dxcc/FPnDaDfBGcAFu7gzhunnmnt4n2LnBfAqF531aiqB5K8KcmdST7YWntm\n",
"lvcDAIBZmcsb56r6gSSPt9becsDPvHEGWHI2SQGWwdxLNarqQxm8Rf52a+01I8fvT/K+JLcm+UBr\n",
"7b0jP3s8ye+31v7kgPEEZ4AVsL/uWWgG+mYRwfl1Sb6X5CNXg3NV3ZrkK0nemOSFJJ9L8lCSLyf5\n",
"z0kutNb+zyz+AgAA0MXca5xba5+pqrv3HX5tkq+11r4+nNSTSR7IIEi/IcmdVfUPW2u/c9SJArBc\n",
"Du684Y00sLymtTjwFUmeH/n+jST3tNZ+NclvTukeACyJvZrns8Oa57f++OBfOWePDb7bphtYPtMK\n",
"zhOtMKyq0yNfd1truxPNBoAF29gZhOar7enefyx5NLbpBuapqraSbE1rvGkF5xeS3DXy/a4M3jp3\n",
"0lo7PaV5AABAkmT4Mnb36veqemyS8aYVnD+f5FXD2udvJnlzBosDAVhLl84kJ08kGZZqfOFKcjJJ\n",
"rpZqvJhcPrOgyQEcyVG6ajyR5PVJNpN8O8m7WmsfrqqfyF47ug+21n6j43gtybujRANgpVgcCPTF\n",
"SMnGY7bcBgCAm5g0d94yzckAAMCqEpwBmLuq2q7avDD41Pai5wPQxbQWB05k2I5OjTPAGri+x7Oe\n",
"zsBsTastnRpnAOaqavNCcvbevZ7O55Oceqa1i/ctcl7A6lPjDAAAc9CLUg0A1sn+Hs96OgPLQakG\n",
"AHO3v8ez+mZgHibNnb0IzrEBCgAAM2IDFAAAGIPFgQAAMAeCMwAAdCA4AwBAB71oR2fnQAAAZsXO\n",
"gQCsDO3pgHlYiXZ0gjPA+hqE5js/npwb3RDlQeEZmLZJc2cvSjUAWGcbO8nZ48nDVw8cT07tJBGc\n",
"gV6xOBCAXquq7arNC4NPbS96PsD6UqoBwELdqFRDGQcwTStRqqGrBsD6GgbkB4flGUkujywOVMYB\n",
"TG5aXTV6EZxba6cXPQcAFmcYlIVhYCaGL2d3q+qxScbpRXAGgINdOpOcPJFktFTjzEKnBKwtNc4A\n",
"9Joez8C06OMMAAAdTJo7taMDAIAOBGcAAOhAcAYAgA560VVDH2cAAGZlWn2cLQ4EYGnosAFMQlcN\n",
"AFbKYeH4+u2333oleen/TW65KEQDXQjOAKyM68PxyReTyw8OtuXevJCcvXew/fbTSX4hyePZf95i\n",
"Zg4sg0lzZy9qnAFgYGMnOXt8EI6TJMeTUzu5bjvu380gNN/sPIDpEZwBWBKj229/c9GTAdaQUg0A\n",
"euNGpRp7P9/YSa5sJrf+4+TcsYPOAziIGmcAVkrXzhk6bADjEpwBAKCDSXNnL3YOrKrTw8bUAHBk\n",
"VbVdtXlh8KntRc8H6Ieq2hpuuDfZON44A7AKblYfDaAdHQAk6d7KDuBoelGqAQAAfeeNMwArYrTP\n",
"czIs1Tiz0CkBK0WNMwBL79r+zi9JcstFLeqA/dQ4A7DW9hYFnh150/zdAzZNSYRpYBKCMwBL7vBF\n",
"gQeE6hNVpdMGcCQWBwKwwjZ2Bu3pHs7g88jxZON/6PMMHIU3zgAsua6LAp9Ocj7J2c0k93r7DIzL\n",
"4kAAlt5hdczXbory/iSPZq+k43ySU8+0dvG+RcwZmD+LAwFYe8OgfM2b45FOG19K/n2SW16ZZHMh\n",
"EwRWguAMwMo5pNPGe5KT74w+z8ARCc4ArKADO21sJZceHHbcSHJZazpgLL0IzlV1Oslua213wVMB\n",
"YIUdVNIBrL6q2kqyNfE4FgcCsGquXRSYDMsydNCANTdp7hScAVhJdgwE9hOcAQCgg0lzp50DAQCg\n",
"A8EZAAA6EJwBAKADwRkAADoQnAEAoAPBGYC1VlXbVZsXBp/aXvR8gP7Sjg6AtWWjFFgvk+bOXmy5\n",
"DQCLsbGTnD2ePHz1wPHk1E5syw0cQKkGABxACQewn1INANbWYaUagz8r4YBVY8ttAJjAIDxv7Ay+\n",
"XTrTWnu6avNCcvbevRKO80lOPdPaxfsWNU9gcrbcBoAxjZZhJMkgEF86k2zsDI59f/P6q65sDq55\n",
"2bNVL39WCQesH2+cAVgrh5RnvCe58517x956ZbB+/tyxa78/cmzw9vnxjFyrhAOWhK4aADCWAztp\n",
"nNp37FjyK3+cnLo4+PrSzeR9P5Y8lUFo1oUD1tFMSzWq6oeq6gNV9bFZ3gcApu/YxdYu3jco47jl\n",
"4qJnAyzeTN84t9b+NMlbBGcA+uPSmeTkiSSjpRpnk5Pv3HfszPXXPHI8+bWRsfafB6yysWucq+pD\n",
"Sd6U5NuttdeMHL8/yfuS3JrkA62194787GOttZ87ZDw1zgDM1cGdNK4/dvA1VzaTl2TwFvr684D+\n",
"mns7uqp6XZLvJfnI1eBcVbcm+UqSNyZ5IcnnkjzUWvvS8OeCMwAACzX3dnSttc8k+c6+w69N8rXW\n",
"2tdba3+d5MkkD1TVRlW9P8mPVtXbjzpJAABYtGnVOL8iyfMj37+R5J7W2qUkj07pHgAAsDDTCs4T\n",
"NYOuqtMjX3dba7sTzQYAjuhmtc7A8qiqrSRb0xpvWsH5hSR3jXy/K4O3zp201k5PaR4AcGR7m6Oc\n",
"vdpd40RV2eAEltTwZezu1e9V9dgk400rOH8+yauq6u4k30zy5iQPTWlsAJiTAzdHscEJkOQIiwOr\n",
"6okkn03y6qp6vqp+qbX2N0nelsEvli8m+ejVjhodxzw9fJUOAABTVVVb+0qDjzbOuO3opk07OgD6\n",
"Yq9U49zoRigHlmro6wzLZ+59nKdNcAagT7osDtwL2I8cT84neXz4k8ODNrB4gjMAzFnV5oXk7L3J\n",
"U0l+Kns10eeTnHqmtYv3LW52wGEmzZ3TWhw4kWHNiTZ0AABM3bTa0nnjDABjUqoBy0mpBgAsgMWB\n",
"sHwEZwAA6GDS3Dl2H2cAAFhHFgcCALDSLA4EAIAxKNUAgB6oqu2qlz1btfkXVS9/drB4EFgl3jgD\n",
"wIQGIfn2TyS3HxtpTXclufyALhvQHyuxAQoALLeNneTVx5JHs7eLYI4lp3aSCM6wInoRnC0OBABg\n",
"ViwOBICeUKoBy8EGKADQA4PwfMevJ8demXz/z5LvvkNohn4RnAGgZ/a2406SS7vJxlaXrbmvDd9/\n",
"9Z3kpZdt5Q3TIzgDQI8Mwu+dH0/OHU+eS/J7SR5Jcj4jZRwvJpcfHA3D15Z7PHzT84Hx6eMMAL2y\n",
"sTMIzQ8n+dMk54b/fDyDYw9n8POrb6RHr/snwxrpLucD86arBgAAK01XDQDoIaUa0F9qnAGgZywO\n",
"hH4SnAEAoAOLAwEAYA4EZwC4garartq8MPjU9qLnAyyOUg0AOMS1C/0Si/RguU2aO3vRjg4A+mlj\n",
"Jzk77MmcJDmenNpJIjjDGupFcNbHGQCAWdHHGQBmTKkGrBbt6ABghvb1ZNZPGZaY4AwAAB3o4wwA\n",
"AHMgOAMAQAeCMwAAdCA4A8AcDXYifNmzVZt/UXXHVwefv///qjYuV7382XntTmhHRBifxYEAMCeD\n",
"gHr7J5Lbjw02Vfm94U9emuTx4Z9PXkkuPzDL7h3a7LGu7BwIAEtjYyd59bHk0SRPJfmR4fFHM7I7\n",
"4bHZ705oR0Q4il4EZzsHAgAwK3YOBIAlo1QDFssGKACwRAah9Y5fT469Mvmr7wyO3vKDyUsqaV9N\n",
"vvuOeQRYOyKyjgRnAADowM6BAAAwB4IzAAB0IDgDAEAHgjMAAHQgOAMAQAeCMwAAdCA4A8CMVNV2\n",
"1eaFwae2J7z+HfvHOmj8Se8JHE4fZwCYgUl357v2+ucy2GXwXEbGek9y5zv3jX/QMTsCwtCkufO2\n",
"aU4GALhqYyc5e3ywtXaS5HhyaidJxxA7ev3PZBCarxnr1AHjH3RsjHsCN9KL4FxVp5PsttZ2FzwV\n",
"AABWTFVtJdmaeBylGgAwfUo1oH8mzZ2CMwDMyCD8buwMvl06M26A3Xf9brKxNTrWQeNPek9YZYIz\n",
"AAB0MGnu1I4OAAA6EJwBAKADwRkAADoQnAEAoAPBGQAAOhCcAQCgA8EZAAA6EJwBAKADwRkAADoQ\n",
"nAEAoAPBGQAAOhCcAQCgA8EZAAA6EJwBAKADwRkAADq4bZaDV9UdSX47yZUku621/znL+wEAwKzM\n",
"+o3zTyf5g9bav07yUzO+FwAAzMzYwbmqPlRV36qq5/Ydv7+qvlxVX62qtw8PvyLJ88M//+2EcwWA\n",
"tVZV21WbFwaf2r7Rz2507s3GHBx72bNVm39R9fJnD7r+RufsjfmyZwc/u/EcYFlUa228C6pel+R7\n",
"ST7SWnvN8NitSb6S5I1JXkjyuSQPJfmnSb7TWvtUVT3RWnvogPFaa60m+2sAwGobBM87P56cOz44\n",
"cvLF5PKDrbWnr//ZW68MqjHPHdt/bocx35Pc/q7k9mPJ48MzT15JLj9w9frBdbd/4qBzBn++8+PJ\n",
"I8eT8xn5+YFzgHmaNHeOXePcWvtMVd297/Brk3yttfb14aSeTPJAknNJfquq3pTkqaNOEgDY2EnO\n",
"Hk8evnrgeHJqJ8nT1//s/ceSR3PwuTcd81Ty6v3XH7v2+o2dG5yTwZhPZRCabzYHWB7TWhw4WpKR\n",
"JN9Ick9r7S+T/PLNLq6q0yNfd1tru1OaFwAAa6qqtpJsTWu8aQXn8eo99l/c2ukpzQMAVtSlM8nJ\n",
"E0lGyyrOHPyzL1xJTibJsevPvemYZ5MvvCv5tWN75528cu31l84kX/jxw885eWJQqvFrI/c6bA4w\n",
"O8OXsbtXv1fVY5OMN3aN8/Cmdyf55EiN8z9Lcrq1dv/w+39K8v3W2ns7jKXGGQA6GNQWbwzLIS6d\n",
"Ga0X3v+zwT8PPvdmYw6O3fHrybFXJt//s+S77zi4Pvrgc/bGvLKZvCTJLRdvNAeYl0lz57SC820Z\n",
"LA58Q5JvJvmjJA+11r7UYayW5N1RogEAwAyMlGw8NtfgXFVPJHl9ks0k307yrtbah6vqJ5K8L8mt\n",
"ST7YWvuNjuN54wwAwMwt5I3zNAnOAADMw6S5c9Y7BwIAwEqYVleNiQzb0alxBgBg6qbVlk6pBgAA\n",
"a0GpBgAAzIHgDAAAHQjOAADQgcWBAACsNIsDAQBgDBYHAgDAHAjOAADQgeAMAAAdWBwIAMBKszgQ\n",
"AADGYHEgAADMgeAMAAAdCM4AANCB4AwAAB3oqgEAwErTVQMAAMagqwYAAMyB4AwAAB0IzgAA0IHg\n",
"DAAAHQjOAADQgeAMAAAd6OMMAMBK08cZAADGoI8zAADMgeAMAAAdCM4AANCB4AwAAB0IzgAA0IHg\n",
"DAAAHQjOAADQgeAMAAAd2DkQAICVZudAAAAYg50DAQBgDgRnAADoQHAGAIAOBGcAAOhAcAYAgA4E\n",
"ZwAA6EBwBgCADgRnAADoQHAGAIAOBGcAAOhAcAYAgA5uW/QEkqSqTifZba3tLngqAACsmKraSrI1\n",
"8TittYknM9EEqlprrRY6CQAAVt6kuVOpBgAAdCA4AwBAB4IzAAB0IDgDAEAHgjMAAHQgOAMAQAeC\n",
"MwAAdCA4AwBAB4IzAAB0IDgDAEAHgjMAAHQgOAMAQAeCMwAAdCA4AwBAB4IzAAB0IDgDAEAHgjMA\n",
"AHQw0+BcVT9UVR+oqo/N8j4AADBrMw3OrbU/ba29ZZb3gFmqqq1FzwEO4tmkrzybrLJOwbmqPlRV\n",
"36qq5/Ydv7+qvlxVX62qt89mirBQW4ueABxia9ETgENsLXoCMCtd3zh/OMn9oweq6tYkvzU8/iNJ\n",
"Hqqqf1RVv1hV/6Wq/sF0p9p/i/j/smdxz0nHPOr141zX5dxpnbMKFvX3nPZ9pzHeUcaY9rPZ5TzP\n",
"5vLddxG/O8e9xu/O8fj3+mTXr+Lvzk7BubX2mSTf2Xf4tUm+1lr7emvtr5M8meSB1tp/b639h9ba\n",
"N6tqo6ren+RH1+SN9NaK3HPSMY96/TjXdTl3Wuesgq0Vue80xjvKGONc0/Xcm503zj2X2dYK3XfS\n",
"MY9y/bjXdDl/Wuesgq0VueekYx71+nGu63ruzc4b555jq9ZatxOr7k7yydbaa4bffzbJdmvtkeH3\n",
"X0hyT2vtV8eaQFW3CQAAwIRaa3XUa2+b5L4TXLs3yASTBwCAeZmkq8YLSe4a+X5Xkm9MNh0AAOin\n",
"SYLz55O8qqrurqqXJnlzkqemMy0AAOiXru3onkjy2SSvrqrnq+qXWmt/k+RtSZ5O8sUkH22tfWl2\n",
"UwUAgMXpvDgQAADW2Ux3DpyE7brpm6q6o6rOV9XvVtW/WPR8YJTfmfRVVT0w/L35ZFXdu+j5wKiq\n",
"+uGq+m9V9QdV9a9uen7f3zhX1cdaaz+36HlAVf1ikkuttU9V1ZOttZ9f9JxgP78z6auq+oEkj7fW\n",
"3rLoucB+VXVLkidba//8RufN/I2z7brpszGfz1ckeX7457+d60RZS35/0ldHfDbfmcGOwzBT4z6f\n",
"VfWTST6VwWZ+NzSPUg3bddNnnZ/PDNotXm3B2NsyJ1bKOM8nzNM4/26vqnpvkv/dWvuT+U+VNTTW\n",
"787W2idbaz+R5OGbDTzzf/nbrps+G+f5TPK/kvxMVf12tF5kDsZ5Pv3OZJ7G/N35tiRvSPKzVfVv\n",
"5jtT1tGYvztfX1X/tap+J8kf3mzsSXYOnMTof/JOBm/y7hk9obV2Kcmj85wUDB34fLbW/jLJLy9m\n",
"SvB3Dns+/c5k0Q57Nn81yW8uZkrwdw57Pj+d5NNdB1nUf27u94pE1p3nkz7zfNJXnk36bCrP56KC\n",
"s+266TPPJ33m+aSvPJv02VSez0UFZ9t102eeT/rM80lfeTbps6k8n/NoR2e7bnrL80mfeT7pK88m\n",
"fTbL57P3G6AAAEAf6EULAAAdCM4AANCB4AwAAB0IzgAA0IHgDAAAHQjOAADQgeAMAAAdCM4AANCB\n",
"4AwAAB38f9CGVdgyBN6eAAAAAElFTkSuQmCC\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"deg_dist = prob_dist(nx.degree(subgraph).values())\n",
"plt.scatter(deg_dist.index, deg_dist)\n",
"plt.xscale(\"log\")\n",
"plt.yscale(\"log\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Write to txtfile\n",
"with open('output.txt', 'w') as f:\n",
" for s, t in subgraph.edges():\n",
" f.write(str(s) + '\\t' + str(t) + '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"#### Load the social network into Titan using the gizmo driver.\n",
"\n",
"**Note:** This is just a quick demo to try out the driver, I've made no attempt to do any sort of batch loading/server tuning.\n",
"\n",
"You can get the Titan 0.0.9 [here](https://groups.google.com/forum/#!msg/aureliusgraphs/_onvDrvBEwk/lHCNMqefzacJ). It comes packaged with the TP3 Gremlin Server, unpack and...\n",
"\n",
"Fire up the Gremlin Server:\n",
"\n",
"```bash\n",
"./bin/gremlin-server.sh \n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def build_schema(gc):\n",
" script = \"\"\"\n",
" mgmt = g.openManagement();\n",
" uniqueId = mgmt.makePropertyKey('uniqueId').dataType(Integer.class).make();\n",
" mgmt.buildIndex('byId', Vertex.class).addKey(uniqueId).unique().buildCompositeIndex();\n",
" collabs=mgmt.makeEdgeLabel('collabs').make();\n",
" mgmt.commit();\"\"\"\n",
" task = gc.task(gc.submit, script,\n",
" consumer=lambda x: print(\"Commited tx with response code: {}\".format(x.status_code)))\n",
" gc.run_until_complete(task)\n",
" \n",
"\n",
"def load_edges(gc):\n",
" start = datetime.now()\n",
" script = \"\"\" \n",
" getOrCreate = { id ->\n",
" def n = g.V().has('uniqueId', id)\n",
" if (n.hasNext()) {n.next()} else {g.addVertex(\"uniqueId\", id)}\n",
" }\n",
"\n",
" new File('output.txt').eachLine { \n",
" (fromVertex, toVertex) = it.split('\\t').collect(getOrCreate)\n",
" fromVertex.addEdge('collabs', toVertex)\n",
" }\n",
"\n",
" g.tx().commit()\"\"\"\n",
" task = gc.task(gc.submit, script,\n",
" consumer=lambda x: print(\"Commited tx with response code: {}\".format(x.status_code)))\n",
" gc.run_until_complete(task)\n",
" print(\"Loaded in {}\".format(datetime.now() - start))\n",
"\n",
" \n",
"@asyncio.coroutine\n",
"def count_nodes(gc):\n",
" yield from gc.submit(\"g.V().count()\", collect=False, consumer=lambda x: print(x))\n",
"\n",
" \n",
"@asyncio.coroutine\n",
"def count_edges(gc):\n",
" yield from gc.submit(\"g.E().count()\", collect=False, consumer=lambda x: print(x))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"gc = AsyncGremlinClient()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Commited tx with response code: 200\n"
]
}
],
"source": [
"build_schema(gc)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Commited tx with response code: 200\n",
"Loaded in 0:00:16.713325\n"
]
}
],
"source": [
"load_edges(gc)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[16264]\n"
]
}
],
"source": [
"gc.run_until_complete(count_nodes(gc))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[47594]\n"
]
}
],
"source": [
"gc.run_until_complete(count_edges(gc))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"####(I'm sure you can do that much faster with some config tricks, I'm not exactly a Gremlin expert either...)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}