{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Collapsed source/target edge contributions to epilepsy predictions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import re\n",
"import itertools\n",
"import collections\n",
"\n",
"import pandas\n",
"\n",
"from utilities import tidy_split"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Read Project Rephetio DrugBank Info\n",
"url = 'https://github.com/dhimmel/drugbank/raw/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank-slim.tsv'\n",
"drugbank_df = (pandas.read_table(url)\n",
" .rename(columns={'drugbank_id': 'compound_id', 'name': 'compound_name'})\n",
" [['compound_id', 'compound_name', 'atc_codes', 'categories']]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" compound_name | \n",
" disease_pctl | \n",
" phcodb | \n",
" compound_id | \n",
" atc_codes | \n",
" categories | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Topiramate | \n",
" 1.0000 | \n",
" DM | \n",
" DB00273 | \n",
" N03AX11 | \n",
" Anticonvulsants|Anti-Obesity Agents|Neuroprote... | \n",
"
\n",
" \n",
" 1 | \n",
" Ethotoin | \n",
" 0.9993 | \n",
" NaN | \n",
" DB00754 | \n",
" N03AB01 | \n",
" Anticonvulsants | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" compound_name disease_pctl phcodb compound_id atc_codes \\\n",
"0 Topiramate 1.0000 DM DB00273 N03AX11 \n",
"1 Ethotoin 0.9993 NaN DB00754 N03AB01 \n",
"\n",
" categories \n",
"0 Anticonvulsants|Anti-Obesity Agents|Neuroprote... \n",
"1 Anticonvulsants "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read top epilepsy predictions\n",
"top_compounds_df = (pandas.read_table('./data/windows.tsv')\n",
" .rename(columns={'name': 'compound_name'})\n",
" [['compound_name', 'disease_pctl', 'phcodb']]\n",
" .merge(drugbank_df)\n",
")\n",
"top_compounds_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" nodes | \n",
" percent_of_prediction | \n",
" percent_of_DWPC | \n",
" source_edge | \n",
" target_edge | \n",
" metapath | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Topiramate—migraine—epilepsy syndrome | \n",
" 0.1780 | \n",
" 1.000 | \n",
" Topiramate—treats—migraine | \n",
" epilepsy syndrome—resembles—migraine | \n",
" CtDrD | \n",
"
\n",
" \n",
" 1 | \n",
" Topiramate—GRIK5—epilepsy syndrome | \n",
" 0.0385 | \n",
" 0.249 | \n",
" Topiramate—binds—GRIK5 | \n",
" epilepsy syndrome—associates—GRIK5 | \n",
" CbGaD | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" nodes percent_of_prediction \\\n",
"0 Topiramate—migraine—epilepsy syndrome 0.1780 \n",
"1 Topiramate—GRIK5—epilepsy syndrome 0.0385 \n",
"\n",
" percent_of_DWPC source_edge \\\n",
"0 1.000 Topiramate—treats—migraine \n",
"1 0.249 Topiramate—binds—GRIK5 \n",
"\n",
" target_edge metapath \n",
"0 epilepsy syndrome—resembles—migraine CtDrD \n",
"1 epilepsy syndrome—associates—GRIK5 CbGaD "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path_dfs = list()\n",
"for compound_id in top_compounds_df.compound_id:\n",
" path = '../../het.io-rep-data/prediction-info/{}/DOID_1826/paths.tsv'.format(compound_id)\n",
" path_dfs.append(pandas.read_table(path))\n",
"path_df = pandas.concat(path_dfs)\n",
"path_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def summarize(df):\n",
" s = pandas.Series()\n",
" s['paths'] = len(df)\n",
" s['contribution'] = sum(df.percent_of_prediction)\n",
" return s"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Metapath contributions"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" metapath | \n",
" paths | \n",
" contribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" CbGbCtD | \n",
" 6358.0 | \n",
" 20.623795 | \n",
"
\n",
" \n",
" 10 | \n",
" CrCtD | \n",
" 160.0 | \n",
" 18.265600 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" metapath paths contribution\n",
"1 CbGbCtD 6358.0 20.623795\n",
"10 CrCtD 160.0 18.265600"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metapath_df = (path_df\n",
" .groupby('metapath')\n",
" .apply(summarize).reset_index()\n",
" .sort_values('contribution', ascending=False)\n",
")\n",
"metapath_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"12"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metapath_df.to_csv('data/metapath-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"len(metapath_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Source edge contributions"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" source_edge | \n",
" paths | \n",
" contribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 1437 | \n",
" Compound—includes—Decreased Central Nervous Sy... | \n",
" 238.0 | \n",
" 6.341200 | \n",
"
\n",
" \n",
" 1429 | \n",
" Compound—includes—Benzodiazepines | \n",
" 52.0 | \n",
" 3.844600 | \n",
"
\n",
" \n",
" 104 | \n",
" Compound—binds—GABRA1 | \n",
" 12385.0 | \n",
" 2.819223 | \n",
"
\n",
" \n",
" 1519 | \n",
" Compound—resembles—Diazepam | \n",
" 402.0 | \n",
" 2.708075 | \n",
"
\n",
" \n",
" 1438 | \n",
" Compound—includes—General Anesthesia | \n",
" 6.0 | \n",
" 2.456000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" source_edge paths contribution\n",
"1437 Compound—includes—Decreased Central Nervous Sy... 238.0 6.341200\n",
"1429 Compound—includes—Benzodiazepines 52.0 3.844600\n",
"104 Compound—binds—GABRA1 12385.0 2.819223\n",
"1519 Compound—resembles—Diazepam 402.0 2.708075\n",
"1438 Compound—includes—General Anesthesia 6.0 2.456000"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source_df = (path_df\n",
" .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—', 1)[1]))\n",
" .groupby('source_edge')\n",
" .apply(summarize).reset_index()\n",
" .sort_values('contribution', ascending=False)\n",
")\n",
"source_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1667"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source_df.to_csv('data/source-edge-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"len(source_df)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" source_edge | \n",
" paths | \n",
" contribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Compound—binds | \n",
" 266192.0 | \n",
" 43.778627 | \n",
"
\n",
" \n",
" 4 | \n",
" Compound—resembles | \n",
" 8507.0 | \n",
" 30.306896 | \n",
"
\n",
" \n",
" 2 | \n",
" Compound—includes | \n",
" 322.0 | \n",
" 15.050900 | \n",
"
\n",
" \n",
" 3 | \n",
" Compound—palliates | \n",
" 212.0 | \n",
" 5.319900 | \n",
"
\n",
" \n",
" 1 | \n",
" Compound—causes | \n",
" 117724.0 | \n",
" 4.418177 | \n",
"
\n",
" \n",
" 5 | \n",
" Compound—treats | \n",
" 5.0 | \n",
" 1.130000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" source_edge paths contribution\n",
"0 Compound—binds 266192.0 43.778627\n",
"4 Compound—resembles 8507.0 30.306896\n",
"2 Compound—includes 322.0 15.050900\n",
"3 Compound—palliates 212.0 5.319900\n",
"1 Compound—causes 117724.0 4.418177\n",
"5 Compound—treats 5.0 1.130000"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Source metaedge contributions\n",
"(path_df\n",
" .assign(source_edge = path_df.source_edge.map(lambda x: 'Compound—' + x.split('—')[1]))\n",
" .groupby('source_edge')\n",
" .apply(summarize).reset_index()\n",
" .sort_values('contribution', ascending=False)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Target edge contributions"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" target_edge | \n",
" paths | \n",
" contribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 355 | \n",
" epilepsy syndrome—treats—Diazepam | \n",
" 6843.0 | \n",
" 8.123404 | \n",
"
\n",
" \n",
" 354 | \n",
" epilepsy syndrome—treats—Clonazepam | \n",
" 6488.0 | \n",
" 6.273890 | \n",
"
\n",
" \n",
" 362 | \n",
" epilepsy syndrome—treats—Midazolam | \n",
" 4832.0 | \n",
" 6.116992 | \n",
"
\n",
" \n",
" 353 | \n",
" epilepsy syndrome—treats—Clobazam | \n",
" 4159.0 | \n",
" 5.670810 | \n",
"
\n",
" \n",
" 351 | \n",
" epilepsy syndrome—treats—Amobarbital | \n",
" 2002.0 | \n",
" 4.840363 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" target_edge paths contribution\n",
"355 epilepsy syndrome—treats—Diazepam 6843.0 8.123404\n",
"354 epilepsy syndrome—treats—Clonazepam 6488.0 6.273890\n",
"362 epilepsy syndrome—treats—Midazolam 4832.0 6.116992\n",
"353 epilepsy syndrome—treats—Clobazam 4159.0 5.670810\n",
"351 epilepsy syndrome—treats—Amobarbital 2002.0 4.840363"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_df = (path_df\n",
" .groupby('target_edge')\n",
" .apply(summarize).reset_index()\n",
" .sort_values('contribution', ascending=False)\n",
")\n",
"target_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"375"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_df.to_csv('data/target-edge-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"len(target_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" target_edge | \n",
" paths | \n",
" contribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" epilepsy syndrome—treats | \n",
" 127343.0 | \n",
" 75.582070 | \n",
"
\n",
" \n",
" 0 | \n",
" epilepsy syndrome—associates | \n",
" 255092.0 | \n",
" 21.689669 | \n",
"
\n",
" \n",
" 1 | \n",
" epilepsy syndrome—localizes | \n",
" 10522.0 | \n",
" 1.602761 | \n",
"
\n",
" \n",
" 2 | \n",
" epilepsy syndrome—resembles | \n",
" 5.0 | \n",
" 1.130000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" target_edge paths contribution\n",
"3 epilepsy syndrome—treats 127343.0 75.582070\n",
"0 epilepsy syndrome—associates 255092.0 21.689669\n",
"1 epilepsy syndrome—localizes 10522.0 1.602761\n",
"2 epilepsy syndrome—resembles 5.0 1.130000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Source metaedge contributions\n",
"(path_df\n",
" .assign(target_edge = path_df.target_edge.map(lambda x: 'epilepsy syndrome—' + x.split('—')[-2]))\n",
" .groupby('target_edge')\n",
" .apply(summarize).reset_index()\n",
" .sort_values('contribution', ascending=False)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Anatomy (intermediate node) contributions"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" anatomy | \n",
" paths | \n",
" contribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 21 | \n",
" telencephalon | \n",
" 928.0 | \n",
" 0.153357 | \n",
"
\n",
" \n",
" 7 | \n",
" forebrain | \n",
" 834.0 | \n",
" 0.147358 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" anatomy paths contribution\n",
"21 telencephalon 928.0 0.153357\n",
"7 forebrain 834.0 0.147358"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"anatomy_df = path_df.query(\"metapath == 'CbGeAlD'\").copy()\n",
"anatomy_df['anatomy'] = anatomy_df.nodes.map(lambda x: x.split('—')[2])\n",
"anatomy_df = (anatomy_df\n",
" .groupby('anatomy')\n",
" .apply(summarize).reset_index()\n",
" .sort_values('contribution', ascending=False)\n",
")\n",
"anatomy_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"24"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"anatomy_df.to_csv('data/anatomy-node-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"len(anatomy_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Classes of predicted compounds"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_counts(df):\n",
" s = pandas.Series()\n",
" s['count'] = len(df)\n",
" s['compounds'] = ', '.join(sorted(df.compound_name))\n",
" return s"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Third-level ATC Codes"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"91 compounds have at least 1 ATC code\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" atc_code | \n",
" atc_name | \n",
" count | \n",
" compounds | \n",
"
\n",
" \n",
" \n",
" \n",
" 10 | \n",
" N03A | \n",
" antiepileptics | \n",
" 25 | \n",
" Carbamazepine, Clonazepam, Ethosuximide, Ethot... | \n",
"
\n",
" \n",
" 13 | \n",
" N05C | \n",
" hypnotics and sedatives | \n",
" 21 | \n",
" Amobarbital, Aprobarbital, Cinolazepam, Estazo... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" atc_code atc_name count \\\n",
"10 N03A antiepileptics 25 \n",
"13 N05C hypnotics and sedatives 21 \n",
"\n",
" compounds \n",
"10 Carbamazepine, Clonazepam, Ethosuximide, Ethot... \n",
"13 Amobarbital, Aprobarbital, Cinolazepam, Estazo... "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Anatomical Therapeutic Chemical Classification System\n",
"\n",
"# Read ATC Code to name mapping http://biology.stackexchange.com/a/55023/28907\n",
"url = 'https://github.com/OHDSI/Vocabulary-v4.5/raw/661804cf3c17add61b02e2e83e477f48acb011d5/21-ATC/atc_code.txt'\n",
"atc_df = (pandas.read_table(url)\n",
" .rename(columns={'Code': 'atc_code', 'Description': 'atc_name'})\n",
")\n",
"df = tidy_split(top_compounds_df, 'atc_codes')\n",
"print('{} compounds have at least 1 ATC code'.format(df.compound_name.nunique()))\n",
"\n",
"# The third level of the code indicates the therapeutic/pharmacological subgroup and consists of one letter.\n",
"df['atc_code'] = df.pop('atc_codes').str.slice(0, 4)\n",
"df = df.drop_duplicates()\n",
"df = df.merge(atc_df, how='left')\n",
"df.atc_name = df.atc_name.str.lower()\n",
"df = df.groupby(['atc_code', 'atc_name']).apply(get_counts).reset_index().sort_values('count', ascending=False)\n",
"df.to_csv('data/compounds-third-level-atc-codes.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### DrugBank Categories"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"84 compounds have at least 1 category\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" category | \n",
" count | \n",
" compounds | \n",
"
\n",
" \n",
" \n",
" \n",
" 16 | \n",
" Anticonvulsants | \n",
" 28 | \n",
" Acetazolamide, Carbamazepine, Clobazam, Clonaz... | \n",
"
\n",
" \n",
" 46 | \n",
" Hypnotics and Sedatives | \n",
" 24 | \n",
" Alprazolam, Butabarbital, Butethal, Chlordiaze... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" category count \\\n",
"16 Anticonvulsants 28 \n",
"46 Hypnotics and Sedatives 24 \n",
"\n",
" compounds \n",
"16 Acetazolamide, Carbamazepine, Clobazam, Clonaz... \n",
"46 Alprazolam, Butabarbital, Butethal, Chlordiaze... "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = tidy_split(top_compounds_df, 'categories')\n",
"print('{} compounds have at least 1 category'.format(df.compound_name.nunique()))\n",
"df = df.rename(columns={'categories': 'category'})\n",
"df = df.groupby(['category']).apply(get_counts).reset_index().sort_values('count', ascending=False)\n",
"df.to_csv('data/compounds-categories.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### DurgCentral Pharmacologic Classes"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" compound_id | \n",
" class_id | \n",
" class_name | \n",
" class_source | \n",
" class_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DB00126 | \n",
" CHEBI:21241 | \n",
" vitamin C | \n",
" CHEBI | \n",
" Application | \n",
"
\n",
" \n",
" 1 | \n",
" DB00676 | \n",
" CHEBI:22153 | \n",
" acaricide | \n",
" CHEBI | \n",
" Application | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" compound_id class_id class_name class_source class_type\n",
"0 DB00126 CHEBI:21241 vitamin C CHEBI Application\n",
"1 DB00676 CHEBI:22153 acaricide CHEBI Application"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/classes.tsv'\n",
"class_rel_url = 'https://github.com/dhimmel/drugcentral/raw/e80a0c966a53ce48650d98069b126801c2793517/rephetio/drug-to-class.tsv'\n",
"class_df = (pandas.read_table(class_url)\n",
" .merge(pandas.read_table(class_rel_url))\n",
" .rename(columns={'drugbank_id': 'compound_id'})\n",
" [['compound_id', 'class_id', 'class_name', 'class_source', 'class_type']]\n",
")\n",
"\n",
"class_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"92 compounds have at least 1 pharmacologic class\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" class_id | \n",
" class_name | \n",
" class_source | \n",
" class_type | \n",
" count | \n",
" compounds | \n",
"
\n",
" \n",
" \n",
" \n",
" 89 | \n",
" D002491 | \n",
" Central Nervous System Agents | \n",
" MeSH | \n",
" Pharmacological Action | \n",
" 76 | \n",
" Acamprosate, Acetazolamide, Adinazolam, Alpraz... | \n",
"
\n",
" \n",
" 90 | \n",
" D002492 | \n",
" Central Nervous System Depressants | \n",
" MeSH | \n",
" Pharmacological Action | \n",
" 46 | \n",
" Alprazolam, Amobarbital, Bromazepam, Butabarbi... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" class_id class_name class_source \\\n",
"89 D002491 Central Nervous System Agents MeSH \n",
"90 D002492 Central Nervous System Depressants MeSH \n",
"\n",
" class_type count \\\n",
"89 Pharmacological Action 76 \n",
"90 Pharmacological Action 46 \n",
"\n",
" compounds \n",
"89 Acamprosate, Acetazolamide, Adinazolam, Alpraz... \n",
"90 Alprazolam, Amobarbital, Bromazepam, Butabarbi... "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = top_compounds_df.merge(class_df)\n",
"print('{} compounds have at least 1 pharmacologic class'.format(df.compound_name.nunique()))\n",
"df = df.groupby(['class_id', 'class_name', 'class_source', 'class_type']).apply(get_counts).reset_index().sort_values('count', ascending=False)\n",
"df.to_csv('data/compounds-pharmacologic-classes.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Contribution by gene groups"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Read Entrez Gene\n",
"url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'\n",
"gene_df = pandas.read_table(url)\n",
"symbol_to_name = dict(zip(gene_df.Symbol, gene_df.description))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Split on comma not in parenthesis. http://stackoverflow.com/a/26634150/4651668\n",
"gene_split = re.compile(r',\\s*(?![^()]*\\))')\n",
"\n",
"def summarize(df):\n",
" s = pandas.Series()\n",
" s['paths'] = sum(df.paths)\n",
" s['contribution'] = sum(df.contribution)\n",
" s['gene_symbols'] = ', '.join(df.gene_symbol)\n",
" return s\n",
"\n",
"def contributions_by_gene(df, edge_column):\n",
" df['gene_symbol'] = df[edge_column].map(lambda x: x.split('—')[-1])\n",
" df['gene_name'] = df['gene_symbol'].map(symbol_to_name)\n",
" df['gene_main_name'] = df.gene_name.map(lambda x: gene_split.split(x, 1)[0])\n",
" return df.groupby('gene_main_name').apply(summarize).reset_index().sort_values('contribution', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" gene_main_name | \n",
" paths | \n",
" contribution | \n",
" gene_symbols | \n",
"
\n",
" \n",
" \n",
" \n",
" 74 | \n",
" gamma-aminobutyric acid (GABA) A receptor | \n",
" 91967.0 | \n",
" 15.329433 | \n",
" GABRA1, GABRG2, GABRB2, GABRA5, GABRB3, GABRD,... | \n",
"
\n",
" \n",
" 66 | \n",
" cytochrome P450 | \n",
" 34323.0 | \n",
" 5.585686 | \n",
" CYP2C19, CYP3A4, CYP2E1, CYP2B6, CYP2C8, CYP2C... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" gene_main_name paths contribution \\\n",
"74 gamma-aminobutyric acid (GABA) A receptor 91967.0 15.329433 \n",
"66 cytochrome P450 34323.0 5.585686 \n",
"\n",
" gene_symbols \n",
"74 GABRA1, GABRG2, GABRB2, GABRA5, GABRB3, GABRD,... \n",
"66 CYP2C19, CYP3A4, CYP2E1, CYP2B6, CYP2C8, CYP2C... "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source_bind_df = source_df[source_df.source_edge.str.startswith('Compound—binds—')].copy()\n",
"source_bind_df = contributions_by_gene(source_bind_df, 'source_edge')\n",
"source_bind_df.to_csv('data/source-edge-binds-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"source_bind_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" gene_main_name | \n",
" paths | \n",
" contribution | \n",
" gene_symbols | \n",
"
\n",
" \n",
" \n",
" \n",
" 119 | \n",
" gamma-aminobutyric acid (GABA) A receptor | \n",
" 23347.0 | \n",
" 6.834028 | \n",
" GABRG2, GABRA1, GABRA5, GABRB3, GABRB2, GABRD | \n",
"
\n",
" \n",
" 128 | \n",
" glutamate receptor | \n",
" 20142.0 | \n",
" 2.284863 | \n",
" GRIA2, GRIK2, GRIN2A, GRIN2B, GRM5, GRIA1, GRI... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" gene_main_name paths contribution \\\n",
"119 gamma-aminobutyric acid (GABA) A receptor 23347.0 6.834028 \n",
"128 glutamate receptor 20142.0 2.284863 \n",
"\n",
" gene_symbols \n",
"119 GABRG2, GABRA1, GABRA5, GABRB3, GABRB2, GABRD \n",
"128 GRIA2, GRIK2, GRIN2A, GRIN2B, GRM5, GRIA1, GRI... "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_bind_df = target_df[target_df.target_edge.str.contains('—associates—')].copy()\n",
"target_bind_df = contributions_by_gene(target_bind_df, 'target_edge')\n",
"target_bind_df.to_csv('data/target-edge-associates-contributions.tsv', sep='\\t', index=False, float_format='%.5g')\n",
"target_bind_df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Contribution by Side Effect (source edges)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" source_edge | \n",
" paths | \n",
" contribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 390 | \n",
" Compound—causes—Ataxia | \n",
" 1312.0 | \n",
" 0.069241 | \n",
"
\n",
" \n",
" 1057 | \n",
" Compound—causes—Nystagmus | \n",
" 611.0 | \n",
" 0.048500 | \n",
"
\n",
" \n",
" 579 | \n",
" Compound—causes—Diplopia | \n",
" 948.0 | \n",
" 0.044986 | \n",
"
\n",
" \n",
" 1278 | \n",
" Compound—causes—Somnolence | \n",
" 1577.0 | \n",
" 0.043543 | \n",
"
\n",
" \n",
" 1416 | \n",
" Compound—causes—Vomiting | \n",
" 1777.0 | \n",
" 0.042753 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" source_edge paths contribution\n",
"390 Compound—causes—Ataxia 1312.0 0.069241\n",
"1057 Compound—causes—Nystagmus 611.0 0.048500\n",
"579 Compound—causes—Diplopia 948.0 0.044986\n",
"1278 Compound—causes—Somnolence 1577.0 0.043543\n",
"1416 Compound—causes—Vomiting 1777.0 0.042753"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"side_effect_df = source_df[source_df.source_edge.str.contains('Compound—causes—')]\n",
"side_effect_df.head()"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}