{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Predicting DWPC Query runtime ahead of time" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import json\n", "\n", "import matplotlib.pyplot\n", "import pandas\n", "import numpy\n", "import seaborn\n", "import mpld3\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Metapath metadata; the join_complexities of each entry are read later in this notebook.\n", "path = '../all-features/data/metapaths.json'\n", "# Decode as UTF-8 explicitly so the result does not depend on the platform's default encoding.\n", "with open(path, encoding='utf-8') as fp:\n", "    metapaths = json.load(fp)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hetnetcompound_iddisease_idmetapathPCwDWPCseconds
0hetio-ind_perm-5DB00014DOID:0060073CpDpCpD00.40.01.016
1hetio-indDB00014DOID:1612CpDpCpD00.40.01.067
\n", "
" ], "text/plain": [ " hetnet compound_id disease_id metapath PC w DWPC seconds\n", "0 hetio-ind_perm-5 DB00014 DOID:0060073 CpDpCpD 0 0.4 0.0 1.016\n", "1 hetio-ind DB00014 DOID:1612 CpDpCpD 0 0.4 0.0 1.067" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dwpc_df = pandas.read_table('../all-features/data/dwpc.tsv.bz2')\n", "dwpc_df.head(2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "22933125" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of queries\n", "len(dwpc_df)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "1215" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "time_df = dwpc_df.groupby('metapath').seconds.mean().reset_index()\n", "len(time_df)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# NOTE(review): `cols` is not referenced anywhere else in this cell; kept in case another cell relies on it.\n", "cols = ['sequential_complexity', 'optimal_join_complexity', 'midpoint_join_complexity']\n", "\n", "# One row per entry of `metapaths`: its abbreviation plus four values picked out of its join_complexities list.\n", "complexity_columns = ['metapath', 'midpoint_complexity', 'optimal_complexity', 'forward_complexity', 'backward_complexity']\n", "rows = [[\n", "    item['abbreviation'],\n", "    item['join_complexities'][item['midpoint_index']],\n", "    item['join_complexities'][item['optimal_join_index']],\n", "    item['join_complexities'][-1],  # last entry, reported as forward_complexity\n", "    item['join_complexities'][0],  # first entry, reported as backward_complexity\n", "] for item in metapaths]\n", "metapath_complexity_df = pandas.DataFrame(rows, columns=complexity_columns)\n", "# Join on the metapath abbreviation explicitly rather than on whichever column names happen to be shared.\n", "complexity_df = time_df.merge(metapath_complexity_df, on='metapath')\n", "complexity_df['log10_seconds_per_query'] = numpy.log10(complexity_df['seconds'])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
metapathsecondsmidpoint_complexityoptimal_complexityforward_complexitybackward_complexitylog10_seconds_per_query
0CbG<rG<rGaD0.1669663.101502.8590922.8590923.913263-0.777372
1CbG<rG<rGdD0.0961232.903282.6400562.6400563.694227-1.017172
\n", "
" ], "text/plain": [ " metapath seconds midpoint_complexity optimal_complexity \\\n", "0 CbG\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean query runtime (log10 seconds) versus forward_complexity; each point is one row of complexity_df.\n", "fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))\n", "# x and y are passed by keyword because seaborn >= 0.12 rejects them as positional arguments.\n", "seaborn.regplot(x='forward_complexity', y='log10_seconds_per_query', data=complexity_df,\n", "    lowess=True, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'}, ci=None, ax=ax)\n", "# The scatter points are taken from the first collection on the axes.\n", "points = ax.collections[0]\n", "labels = complexity_df.metapath.tolist()\n", "# Attach the metapath abbreviation to each point as an mpld3 tooltip label.\n", "tooltip = mpld3.plugins.PointLabelTooltip(points, labels)\n", "mpld3.plugins.connect(fig, tooltip)\n", "mpld3.display(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## optimal join complexity" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean query runtime (log10 seconds) versus optimal_complexity; each point is one row of complexity_df.\n", "fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))\n", "# x and y are passed by keyword because seaborn >= 0.12 rejects them as positional arguments.\n", "seaborn.regplot(x='optimal_complexity', y='log10_seconds_per_query', data=complexity_df,\n", "    lowess=True, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'}, ci=None, ax=ax)\n", "# The scatter points are taken from the first collection on the axes.\n", "points = ax.collections[0]\n", "labels = complexity_df.metapath.tolist()\n", "# Attach the metapath abbreviation to each point as an mpld3 tooltip label.\n", "tooltip = mpld3.plugins.PointLabelTooltip(points, labels)\n", "mpld3.plugins.connect(fig, tooltip)\n", "mpld3.display(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## midpoint_join_complexity" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
\n", "" ], "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean query runtime (log10 seconds) versus midpoint_complexity; each point is one row of complexity_df.\n", "fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))\n", "# x and y are passed by keyword because seaborn >= 0.12 rejects them as positional arguments.\n", "seaborn.regplot(x='midpoint_complexity', y='log10_seconds_per_query', data=complexity_df,\n", "    lowess=True, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'}, ci=None, ax=ax)\n", "# The scatter points are taken from the first collection on the axes.\n", "points = ax.collections[0]\n", "labels = complexity_df.metapath.tolist()\n", "# Attach the metapath abbreviation to each point as an mpld3 tooltip label.\n", "tooltip = mpld3.plugins.PointLabelTooltip(points, labels)\n", "mpld3.plugins.connect(fig, tooltip)\n", "mpld3.display(fig)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }