{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collapsed source/target edge contributions to epilepsy predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import itertools\n",
    "import collections\n",
    "\n",
    "import pandas\n",
    "\n",
    "from utilities import tidy_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read Project Rephetio DrugBank Info\n",
    "url = 'https://github.com/dhimmel/drugbank/raw/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank-slim.tsv'\n",
    "drugbank_df = (pandas.read_table(url)\n",
    "    .rename(columns={'drugbank_id': 'compound_id', 'name': 'compound_name'})\n",
    "    [['compound_id', 'compound_name', 'atc_codes', 'categories']]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>compound_name</th>\n",
       "      <th>disease_pctl</th>\n",
       "      <th>phcodb</th>\n",
       "      <th>compound_id</th>\n",
       "      <th>atc_codes</th>\n",
       "      <th>categories</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Topiramate</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>DM</td>\n",
       "      <td>DB00273</td>\n",
       "      <td>N03AX11</td>\n",
       "      <td>Anticonvulsants|Anti-Obesity Agents|Neuroprote...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Ethotoin</td>\n",
       "      <td>0.9993</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DB00754</td>\n",
       "      <td>N03AB01</td>\n",
       "      <td>Anticonvulsants</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  compound_name  disease_pctl phcodb compound_id atc_codes  \\\n",
       "0    Topiramate        1.0000     DM     DB00273   N03AX11   \n",
       "1      Ethotoin        0.9993    NaN     DB00754   N03AB01   \n",
       "\n",
       "                                          categories  \n",
       "0  Anticonvulsants|Anti-Obesity Agents|Neuroprote...  \n",
       "1                                    Anticonvulsants  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read top epilepsy predictions\n",
    "top_compounds_df = (pandas.read_table('./data/windows.tsv')\n",
    "    .rename(columns={'name': 'compound_name'})\n",
    "    [['compound_name', 'disease_pctl', 'phcodb']]\n",
    "    .merge(drugbank_df)\n",
    ")\n",
    "top_compounds_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nodes</th>\n",
       "      <th>percent_of_prediction</th>\n",
       "      <th>percent_of_DWPC</th>\n",
       "      <th>source_edge</th>\n",
       "      <th>target_edge</th>\n",
       "      <th>metapath</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Topiramate—migraine—epilepsy syndrome</td>\n",
       "      <td>0.1780</td>\n",
       "      <td>1.000</td>\n",
       "      <td>Topiramate—treats—migraine</td>\n",
       "      <td>epilepsy syndrome—resembles—migraine</td>\n",
       "      <td>CtDrD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Topiramate—GRIK5—epilepsy syndrome</td>\n",
       "      <td>0.0385</td>\n",
       "      <td>0.249</td>\n",
       "      <td>Topiramate—binds—GRIK5</td>\n",
       "      <td>epilepsy syndrome—associates—GRIK5</td>\n",
       "      <td>CbGaD</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   nodes  percent_of_prediction  \\\n",
       "0  Topiramate—migraine—epilepsy syndrome                 0.1780   \n",
       "1     Topiramate—GRIK5—epilepsy syndrome                 0.0385   \n",
       "\n",
       "   percent_of_DWPC                 source_edge  \\\n",
       "0            1.000  Topiramate—treats—migraine   \n",
       "1            0.249      Topiramate—binds—GRIK5   \n",
       "\n",
       "                            target_edge metapath  \n",
       "0  epilepsy syndrome—resembles—migraine    CtDrD  \n",
       "1    epilepsy syndrome—associates—GRIK5    CbGaD  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path_dfs = list()\n",
    "for compound_id in top_compounds_df.compound_id:\n",
    "    path = '../../het.io-rep-data/prediction-info/{}/DOID_1826/paths.tsv'.format(compound_id)\n",
    "    path_dfs.append(pandas.read_table(path))\n",
    "path_df = pandas.concat(path_dfs)\n",
    "path_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def summarize(df):\n",
    "    \"\"\"\n",
    "    Summarize one group of paths.\n",
    "\n",
    "    Returns a float Series with two entries:\n",
    "    paths -- the number of rows in `df`\n",
    "    contribution -- the sum of the `percent_of_prediction` column\n",
    "    \"\"\"\n",
    "    # Build the Series in one step with an explicit dtype: a bare pandas.Series()\n",
    "    # has a version-dependent default dtype (float64 in older pandas, object in\n",
    "    # newer releases), which changes the dtype of the grouped result.\n",
    "    return pandas.Series({\n",
    "        'paths': len(df),\n",
    "        'contribution': sum(df.percent_of_prediction),\n",
    "    }, dtype=float)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Metapath contributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>metapath</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CbGbCtD</td>\n",
       "      <td>6358.0</td>\n",
       "      <td>20.623795</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>CrCtD</td>\n",
       "      <td>160.0</td>\n",
       "      <td>18.265600</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   metapath   paths  contribution\n",
       "1   CbGbCtD  6358.0     20.623795\n",
       "10    CrCtD   160.0     18.265600"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metapath_df = (path_df\n",
    "    .groupby('metapath')\n",
    "    .apply(summarize).reset_index()\n",
    "    .sort_values('contribution', ascending=False)\n",
    ")\n",
    "metapath_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metapath_df.to_csv('data/metapath-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
    "len(metapath_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Source edge contributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_edge</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1437</th>\n",
       "      <td>Compound—includes—Decreased Central Nervous Sy...</td>\n",
       "      <td>238.0</td>\n",
       "      <td>6.341200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1429</th>\n",
       "      <td>Compound—includes—Benzodiazepines</td>\n",
       "      <td>52.0</td>\n",
       "      <td>3.844600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>Compound—binds—GABRA1</td>\n",
       "      <td>12385.0</td>\n",
       "      <td>2.819223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1519</th>\n",
       "      <td>Compound—resembles—Diazepam</td>\n",
       "      <td>402.0</td>\n",
       "      <td>2.708075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1438</th>\n",
       "      <td>Compound—includes—General Anesthesia</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2.456000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            source_edge    paths  contribution\n",
       "1437  Compound—includes—Decreased Central Nervous Sy...    238.0      6.341200\n",
       "1429                  Compound—includes—Benzodiazepines     52.0      3.844600\n",
       "104                               Compound—binds—GABRA1  12385.0      2.819223\n",
       "1519                        Compound—resembles—Diazepam    402.0      2.708075\n",
       "1438               Compound—includes—General Anesthesia      6.0      2.456000"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "source_df = (path_df\n",
    "    .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—', 1)[1]))\n",
    "    .groupby('source_edge')\n",
    "    .apply(summarize).reset_index()\n",
    "    .sort_values('contribution', ascending=False)\n",
    ")\n",
    "source_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1667"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "source_df.to_csv('data/source-edge-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
    "len(source_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_edge</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Compound—binds</td>\n",
       "      <td>266192.0</td>\n",
       "      <td>43.778627</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Compound—resembles</td>\n",
       "      <td>8507.0</td>\n",
       "      <td>30.306896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Compound—includes</td>\n",
       "      <td>322.0</td>\n",
       "      <td>15.050900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Compound—palliates</td>\n",
       "      <td>212.0</td>\n",
       "      <td>5.319900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Compound—causes</td>\n",
       "      <td>117724.0</td>\n",
       "      <td>4.418177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Compound—treats</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.130000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          source_edge     paths  contribution\n",
       "0      Compound—binds  266192.0     43.778627\n",
       "4  Compound—resembles    8507.0     30.306896\n",
       "2   Compound—includes     322.0     15.050900\n",
       "3  Compound—palliates     212.0      5.319900\n",
       "1     Compound—causes  117724.0      4.418177\n",
       "5     Compound—treats       5.0      1.130000"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Source metaedge contributions\n",
    "(path_df\n",
    "    .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—')[1]))\n",
    "    .groupby('source_edge')\n",
    "    .apply(summarize).reset_index()\n",
    "    .sort_values('contribution', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Target edge contributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target_edge</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>355</th>\n",
       "      <td>epilepsy syndrome—treats—Diazepam</td>\n",
       "      <td>6843.0</td>\n",
       "      <td>8.123404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>354</th>\n",
       "      <td>epilepsy syndrome—treats—Clonazepam</td>\n",
       "      <td>6488.0</td>\n",
       "      <td>6.273890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>362</th>\n",
       "      <td>epilepsy syndrome—treats—Midazolam</td>\n",
       "      <td>4832.0</td>\n",
       "      <td>6.116992</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>353</th>\n",
       "      <td>epilepsy syndrome—treats—Clobazam</td>\n",
       "      <td>4159.0</td>\n",
       "      <td>5.670810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351</th>\n",
       "      <td>epilepsy syndrome—treats—Amobarbital</td>\n",
       "      <td>2002.0</td>\n",
       "      <td>4.840363</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              target_edge   paths  contribution\n",
       "355     epilepsy syndrome—treats—Diazepam  6843.0      8.123404\n",
       "354   epilepsy syndrome—treats—Clonazepam  6488.0      6.273890\n",
       "362    epilepsy syndrome—treats—Midazolam  4832.0      6.116992\n",
       "353     epilepsy syndrome—treats—Clobazam  4159.0      5.670810\n",
       "351  epilepsy syndrome—treats—Amobarbital  2002.0      4.840363"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_df = (path_df\n",
    "    .groupby('target_edge')\n",
    "    .apply(summarize).reset_index()\n",
    "    .sort_values('contribution', ascending=False)\n",
    ")\n",
    "target_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "375"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_df.to_csv('data/target-edge-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
    "len(target_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target_edge</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>epilepsy syndrome—treats</td>\n",
       "      <td>127343.0</td>\n",
       "      <td>75.582070</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>epilepsy syndrome—associates</td>\n",
       "      <td>255092.0</td>\n",
       "      <td>21.689669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>epilepsy syndrome—localizes</td>\n",
       "      <td>10522.0</td>\n",
       "      <td>1.602761</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>epilepsy syndrome—resembles</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.130000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    target_edge     paths  contribution\n",
       "3      epilepsy syndrome—treats  127343.0     75.582070\n",
       "0  epilepsy syndrome—associates  255092.0     21.689669\n",
       "1   epilepsy syndrome—localizes   10522.0      1.602761\n",
       "2   epilepsy syndrome—resembles       5.0      1.130000"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Target metaedge contributions\n",
    "(path_df\n",
    "    .assign(target_edge = path_df.target_edge.map(lambda x: 'epilepsy syndrome—' + x.split('—')[-2]))\n",
    "    .groupby('target_edge')\n",
    "    .apply(summarize).reset_index()\n",
    "    .sort_values('contribution', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Anatomy (intermediate node) contributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>anatomy</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>telencephalon</td>\n",
       "      <td>928.0</td>\n",
       "      <td>0.153357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>forebrain</td>\n",
       "      <td>834.0</td>\n",
       "      <td>0.147358</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          anatomy  paths  contribution\n",
       "21  telencephalon  928.0      0.153357\n",
       "7       forebrain  834.0      0.147358"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "anatomy_df = path_df.query(\"metapath == 'CbGeAlD'\").copy()\n",
    "anatomy_df['anatomy'] = anatomy_df.nodes.map(lambda x: x.split('—')[2])\n",
    "anatomy_df = (anatomy_df\n",
    "    .groupby('anatomy')\n",
    "    .apply(summarize).reset_index()\n",
    "    .sort_values('contribution', ascending=False)\n",
    ")\n",
    "anatomy_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "anatomy_df.to_csv('data/anatomy-node-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
    "len(anatomy_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classes of predicted compounds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_counts(df):\n",
    "    \"\"\"\n",
    "    Describe one group of compounds.\n",
    "\n",
    "    Returns a Series with two entries:\n",
    "    count -- the number of rows in `df`\n",
    "    compounds -- the sorted `compound_name` values joined by ', '\n",
    "    \"\"\"\n",
    "    names = sorted(df.compound_name)\n",
    "    return pandas.Series({'count': len(df), 'compounds': ', '.join(names)})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Third-level ATC Codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "91 compounds have at least 1 ATC code\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>atc_code</th>\n",
       "      <th>atc_name</th>\n",
       "      <th>count</th>\n",
       "      <th>compounds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>N03A</td>\n",
       "      <td>antiepileptics</td>\n",
       "      <td>25</td>\n",
       "      <td>Carbamazepine, Clonazepam, Ethosuximide, Ethot...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>N05C</td>\n",
       "      <td>hypnotics and sedatives</td>\n",
       "      <td>21</td>\n",
       "      <td>Amobarbital, Aprobarbital, Cinolazepam, Estazo...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   atc_code                 atc_name  count  \\\n",
       "10     N03A           antiepileptics     25   \n",
       "13     N05C  hypnotics and sedatives     21   \n",
       "\n",
       "                                            compounds  \n",
       "10  Carbamazepine, Clonazepam, Ethosuximide, Ethot...  \n",
       "13  Amobarbital, Aprobarbital, Cinolazepam, Estazo...  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Anatomical Therapeutic Chemical Classification System\n",
    "\n",
    "# Read ATC Code to name mapping http://biology.stackexchange.com/a/55023/28907\n",
    "url = 'https://github.com/OHDSI/Vocabulary-v4.5/raw/661804cf3c17add61b02e2e83e477f48acb011d5/21-ATC/atc_code.txt'\n",
    "atc_df = (pandas.read_table(url)\n",
    "    .rename(columns={'Code': 'atc_code', 'Description': 'atc_name'})\n",
    ")\n",
    "df = tidy_split(top_compounds_df, 'atc_codes')\n",
    "print('{} compounds have at least 1 ATC code'.format(df.compound_name.nunique()))\n",
    "\n",
    "# The third level of the code indicates the therapeutic/pharmacological subgroup and consists of one letter.\n",
    "df['atc_code'] = df.pop('atc_codes').str.slice(0, 4)\n",
    "df = df.drop_duplicates()\n",
    "df = df.merge(atc_df, how='left')\n",
    "df.atc_name = df.atc_name.str.lower()\n",
    "df = df.groupby(['atc_code', 'atc_name']).apply(get_counts).reset_index().sort_values('count', ascending=False)\n",
    "df.to_csv('data/compounds-third-level-atc-codes.tsv', sep='\\t', index=False, float_format='%.5g')\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DrugBank Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "84 compounds have at least 1 category\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>count</th>\n",
       "      <th>compounds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Anticonvulsants</td>\n",
       "      <td>28</td>\n",
       "      <td>Acetazolamide, Carbamazepine, Clobazam, Clonaz...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>Hypnotics and Sedatives</td>\n",
       "      <td>24</td>\n",
       "      <td>Alprazolam, Butabarbital, Butethal, Chlordiaze...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   category  count  \\\n",
       "16          Anticonvulsants     28   \n",
       "46  Hypnotics and Sedatives     24   \n",
       "\n",
       "                                            compounds  \n",
       "16  Acetazolamide, Carbamazepine, Clobazam, Clonaz...  \n",
       "46  Alprazolam, Butabarbital, Butethal, Chlordiaze...  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = tidy_split(top_compounds_df, 'categories')\n",
    "print('{} compounds have at least 1 category'.format(df.compound_name.nunique()))\n",
    "df = df.rename(columns={'categories': 'category'})\n",
    "df = df.groupby(['category']).apply(get_counts).reset_index().sort_values('count', ascending=False)\n",
    "df.to_csv('data/compounds-categories.tsv', sep='\\t', index=False, float_format='%.5g')\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DrugCentral Pharmacologic Classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>compound_id</th>\n",
       "      <th>class_id</th>\n",
       "      <th>class_name</th>\n",
       "      <th>class_source</th>\n",
       "      <th>class_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DB00126</td>\n",
       "      <td>CHEBI:21241</td>\n",
       "      <td>vitamin C</td>\n",
       "      <td>CHEBI</td>\n",
       "      <td>Application</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DB00676</td>\n",
       "      <td>CHEBI:22153</td>\n",
       "      <td>acaricide</td>\n",
       "      <td>CHEBI</td>\n",
       "      <td>Application</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  compound_id     class_id class_name class_source   class_type\n",
       "0     DB00126  CHEBI:21241  vitamin C        CHEBI  Application\n",
       "1     DB00676  CHEBI:22153  acaricide        CHEBI  Application"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/classes.tsv'\n",
    "class_rel_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/drug-to-class.tsv'\n",
    "class_df = (pandas.read_table(class_url)\n",
    "    .merge(pandas.read_table(class_rel_url))\n",
    "    .rename(columns={'drugbank_id': 'compound_id'})\n",
    "    [['compound_id', 'class_id', 'class_name', 'class_source', 'class_type']]\n",
    ")\n",
    "\n",
    "class_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "92 compounds have at least 1 pharmacologic class\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>class_id</th>\n",
       "      <th>class_name</th>\n",
       "      <th>class_source</th>\n",
       "      <th>class_type</th>\n",
       "      <th>count</th>\n",
       "      <th>compounds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>89</th>\n",
       "      <td>D002491</td>\n",
       "      <td>Central Nervous System Agents</td>\n",
       "      <td>MeSH</td>\n",
       "      <td>Pharmacological Action</td>\n",
       "      <td>76</td>\n",
       "      <td>Acamprosate, Acetazolamide, Adinazolam, Alpraz...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90</th>\n",
       "      <td>D002492</td>\n",
       "      <td>Central Nervous System Depressants</td>\n",
       "      <td>MeSH</td>\n",
       "      <td>Pharmacological Action</td>\n",
       "      <td>46</td>\n",
       "      <td>Alprazolam, Amobarbital, Bromazepam, Butabarbi...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   class_id                          class_name class_source  \\\n",
       "89  D002491       Central Nervous System Agents         MeSH   \n",
       "90  D002492  Central Nervous System Depressants         MeSH   \n",
       "\n",
       "                class_type  count  \\\n",
       "89  Pharmacological Action     76   \n",
       "90  Pharmacological Action     46   \n",
       "\n",
       "                                            compounds  \n",
       "89  Acamprosate, Acetazolamide, Adinazolam, Alpraz...  \n",
       "90  Alprazolam, Amobarbital, Bromazepam, Butabarbi...  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = top_compounds_df.merge(class_df)\n",
    "print('{} compounds have at least 1 pharmacologic class'.format(df.compound_name.nunique()))\n",
    "df = df.groupby(['class_id', 'class_name', 'class_source', 'class_type']).apply(get_counts).reset_index().sort_values('count', ascending=False)\n",
    "df.to_csv('data/compounds-pharmacologic-classes.tsv', sep='\\t', index=False, float_format='%.5g')\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Contribution by gene groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load the human Entrez Gene table from a commit-pinned URL\n",
    "url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'\n",
    "gene_df = pandas.read_table(url)\n",
    "\n",
    "# Lookup table: gene symbol -> gene description\n",
    "symbol_to_name = {\n",
    "    symbol: description\n",
    "    for symbol, description in zip(gene_df['Symbol'], gene_df['description'])\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Split on comma not in parenthesis. http://stackoverflow.com/a/26634150/4651668\n",
    "gene_split = re.compile(r',\\s*(?![^()]*\\))')\n",
    "\n",
    "def summarize(df):\n",
    "    \"\"\"\n",
    "    Collapse the rows of one gene group into a single summary row.\n",
    "\n",
    "    Returns a Series holding the summed `paths`, the summed `contribution`,\n",
    "    and the group's `gene_symbol` values joined by ', ' as `gene_symbols`.\n",
    "    \"\"\"\n",
    "    # Build the Series in one step with an explicit index: a bare pandas.Series()\n",
    "    # has a default dtype that depends on the pandas version and warns on pandas 1.x.\n",
    "    return pandas.Series(\n",
    "        [df.paths.sum(), df.contribution.sum(), ', '.join(df.gene_symbol)],\n",
    "        index=['paths', 'contribution', 'gene_symbols'],\n",
    "    )\n",
    "\n",
    "def contributions_by_gene(df, edge_column):\n",
    "    \"\"\"\n",
    "    Aggregate edge contributions by gene group.\n",
    "\n",
    "    The gene symbol is the text after the last '—' in `df[edge_column]`. Symbols are\n",
    "    translated to names through `symbol_to_name`, and genes whose names share the text\n",
    "    before the first comma outside parentheses are grouped together. A symbol without\n",
    "    a name in `symbol_to_name` forms a group named after the symbol itself.\n",
    "\n",
    "    `df` is modified in place (the gene_symbol, gene_name and gene_main_name columns\n",
    "    are added), so pass a copy to keep the caller's frame unchanged. Returns one row\n",
    "    per group, sorted by descending `contribution`.\n",
    "    \"\"\"\n",
    "    df['gene_symbol'] = df[edge_column].map(lambda x: x.split('—')[-1])\n",
    "    # Unknown symbols map to NaN, which the regex split below cannot handle;\n",
    "    # fall back to the symbol so those rows are kept instead of raising TypeError.\n",
    "    df['gene_name'] = df['gene_symbol'].map(symbol_to_name).fillna(df['gene_symbol'])\n",
    "    df['gene_main_name'] = df.gene_name.map(lambda x: gene_split.split(x, maxsplit=1)[0])\n",
    "    return df.groupby('gene_main_name').apply(summarize).reset_index().sort_values('contribution', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene_main_name</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "      <th>gene_symbols</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>74</th>\n",
       "      <td>gamma-aminobutyric acid (GABA) A receptor</td>\n",
       "      <td>91967.0</td>\n",
       "      <td>15.329433</td>\n",
       "      <td>GABRA1, GABRG2, GABRB2, GABRA5, GABRB3, GABRD,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>66</th>\n",
       "      <td>cytochrome P450</td>\n",
       "      <td>34323.0</td>\n",
       "      <td>5.585686</td>\n",
       "      <td>CYP2C19, CYP3A4, CYP2E1, CYP2B6, CYP2C8, CYP2C...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               gene_main_name    paths  contribution  \\\n",
       "74  gamma-aminobutyric acid (GABA) A receptor  91967.0     15.329433   \n",
       "66                            cytochrome P450  34323.0      5.585686   \n",
       "\n",
       "                                         gene_symbols  \n",
       "74  GABRA1, GABRG2, GABRB2, GABRA5, GABRB3, GABRD,...  \n",
       "66  CYP2C19, CYP3A4, CYP2E1, CYP2B6, CYP2C8, CYP2C...  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Source edges that start with Compound—binds—, summarized per gene group.\n",
    "# The selection is copied because contributions_by_gene adds columns to the frame it receives.\n",
    "source_bind_df = contributions_by_gene(\n",
    "    source_df.loc[source_df['source_edge'].str.startswith('Compound—binds—')].copy(),\n",
    "    'source_edge',\n",
    ")\n",
    "source_bind_df.to_csv(\n",
    "    'data/source-edge-binds-contributions.tsv',\n",
    "    sep='\\t', index=False, float_format='%.5g',\n",
    ")\n",
    "source_bind_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene_main_name</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "      <th>gene_symbols</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>gamma-aminobutyric acid (GABA) A receptor</td>\n",
       "      <td>23347.0</td>\n",
       "      <td>6.834028</td>\n",
       "      <td>GABRG2, GABRA1, GABRA5, GABRB3, GABRB2, GABRD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>glutamate receptor</td>\n",
       "      <td>20142.0</td>\n",
       "      <td>2.284863</td>\n",
       "      <td>GRIA2, GRIK2, GRIN2A, GRIN2B, GRM5, GRIA1, GRI...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                gene_main_name    paths  contribution  \\\n",
       "119  gamma-aminobutyric acid (GABA) A receptor  23347.0      6.834028   \n",
       "128                         glutamate receptor  20142.0      2.284863   \n",
       "\n",
       "                                          gene_symbols  \n",
       "119      GABRG2, GABRA1, GABRA5, GABRB3, GABRB2, GABRD  \n",
       "128  GRIA2, GRIK2, GRIN2A, GRIN2B, GRM5, GRIA1, GRI...  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Target edges containing —associates—, summarized per gene group.\n",
    "# The selection is copied because contributions_by_gene adds columns to the frame it receives.\n",
    "target_bind_df = contributions_by_gene(\n",
    "    target_df.loc[target_df['target_edge'].str.contains('—associates—')].copy(),\n",
    "    'target_edge',\n",
    ")\n",
    "target_bind_df.to_csv(\n",
    "    'data/target-edge-associates-contributions.tsv',\n",
    "    sep='\\t', index=False, float_format='%.5g',\n",
    ")\n",
    "target_bind_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Contribution by Side Effect (source edges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_edge</th>\n",
       "      <th>paths</th>\n",
       "      <th>contribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>390</th>\n",
       "      <td>Compound—causes—Ataxia</td>\n",
       "      <td>1312.0</td>\n",
       "      <td>0.069241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1057</th>\n",
       "      <td>Compound—causes—Nystagmus</td>\n",
       "      <td>611.0</td>\n",
       "      <td>0.048500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>579</th>\n",
       "      <td>Compound—causes—Diplopia</td>\n",
       "      <td>948.0</td>\n",
       "      <td>0.044986</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1278</th>\n",
       "      <td>Compound—causes—Somnolence</td>\n",
       "      <td>1577.0</td>\n",
       "      <td>0.043543</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1416</th>\n",
       "      <td>Compound—causes—Vomiting</td>\n",
       "      <td>1777.0</td>\n",
       "      <td>0.042753</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     source_edge   paths  contribution\n",
       "390       Compound—causes—Ataxia  1312.0      0.069241\n",
       "1057   Compound—causes—Nystagmus   611.0      0.048500\n",
       "579     Compound—causes—Diplopia   948.0      0.044986\n",
       "1278  Compound—causes—Somnolence  1577.0      0.043543\n",
       "1416    Compound—causes—Vomiting  1777.0      0.042753"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows of source_df whose source_edge contains Compound—causes—\n",
    "side_effect_df = source_df.loc[\n",
    "    source_df['source_edge'].str.contains('Compound—causes—')\n",
    "]\n",
    "side_effect_df.head()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}