{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Convert DrugCentral relationships to Rephetio identifiers" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import urllib\n", "import json\n", "\n", "import pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read DO Slim - the disease subset used for rephetio" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
slim_idslim_namesubsumed_id
0DOID:0050156idiopathic pulmonary fibrosisDOID:0050156
1DOID:0050425restless legs syndromeDOID:0050425
\n", "
" ], "text/plain": [ " slim_id slim_name subsumed_id\n", "0 DOID:0050156 idiopathic pulmonary fibrosis DOID:0050156\n", "1 DOID:0050425 restless legs syndrome DOID:0050425" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = 'https://github.com/dhimmel/disease-ontology/raw/5cb93c38568536222b0a14fbcb7fb644a348931d/data/slim-terms-prop.tsv'\n", "do_slim = pandas.read_table(url)\n", "do_slim = do_slim[['slim_id', 'slim_name', 'subsumed_id']]\n", "do_slim.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read UniProt to Entrez Gene mapping" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uniprotGeneID
0A0A010PZJ819039206
1A0A010PZK319039211
\n", "
" ], "text/plain": [ " uniprot GeneID\n", "0 A0A010PZJ8 19039206\n", "1 A0A010PZK3 19039211" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz'\n", "entrez_map_df = pandas.read_table(url, compression='gzip')\n", "entrez_map_df.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read DrugBank Slim" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugbank_iddrugbank_name
0DB00014Goserelin
1DB00035Desmopressin
\n", "
" ], "text/plain": [ " drugbank_id drugbank_name\n", "0 DB00014 Goserelin\n", "1 DB00035 Desmopressin" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = 'https://github.com/dhimmel/drugbank/raw/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank-slim.tsv'\n", "drugbank_df = pandas.read_table(url)\n", "drugbank_df = drugbank_df[['drugbank_id', 'name']]\n", "drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})\n", "drugbank_df.head(2)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1552" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(drugbank_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read identifiers" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DRUG_IDdrugbank_iddrugbank_name
01327DB00014Goserelin
1817DB00035Desmopressin
\n", "
" ], "text/plain": [ " DRUG_ID drugbank_id drugbank_name\n", "0 1327 DB00014 Goserelin\n", "1 817 DB00035 Desmopressin" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "path = 'drugtarget/identifiers.tsv'\n", "id_df = pandas.read_table(path)\n", "id_df = id_df.query(\"ID_TYPE == 'DRUGBANK_ID'\")[['DRUG_ID', 'IDENTIFIER']]\n", "id_df = id_df.rename(columns={'IDENTIFIER': 'drugbank_id'})\n", "drugbank_df = id_df.merge(drugbank_df)\n", "drugbank_df.head(2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1634" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(drugbank_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert drug targets" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GeneIDdrugbank_iddrugbank_nameTARGET_NAMETARGET_FAMILYSOURCEREFERENCEactionpubmed_id
08233868DB00431LindaneGABA-A receptorIon channelCHEMBLhttps://www.ebi.ac.uk/chembl/compound/inspect/...negative allosteric modulatorNaN
18232849DB08823SpinosadNicotinic acetylcholine receptorIon channelCHEMBLhttps://www.ebi.ac.uk/chembl/compound/inspect/...agonistNaN
\n", "
" ], "text/plain": [ " GeneID drugbank_id drugbank_name TARGET_NAME \\\n", "0 8233868 DB00431 Lindane GABA-A receptor \n", "1 8232849 DB08823 Spinosad Nicotinic acetylcholine receptor \n", "\n", " TARGET_FAMILY SOURCE REFERENCE \\\n", "0 Ion channel CHEMBL https://www.ebi.ac.uk/chembl/compound/inspect/... \n", "1 Ion channel CHEMBL https://www.ebi.ac.uk/chembl/compound/inspect/... \n", "\n", " action pubmed_id \n", "0 negative allosteric modulator NaN \n", "1 agonist NaN " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "path = 'drugtarget/drug_target.tsv'\n", "target_df = pandas.read_table(path)\n", "target_df = drugbank_df.merge(target_df)\n", "target_df = target_df[['drugbank_id', 'drugbank_name', 'TARGET_NAME', 'TARGET_FAMILY', 'UNIPROT', 'ACTION_TYPE', 'SOURCE', 'REFERENCE']]\n", "\n", "# Split multi-protein targets into many rows\n", "s = target_df.UNIPROT.str.split('|').apply(pandas.Series, 1).stack()\n", "s.index = s.index.droplevel(-1)\n", "s.name ='uniprot'\n", "del target_df['UNIPROT']\n", "target_df = target_df.join(s)\n", "\n", "target_df = entrez_map_df.merge(target_df)\n", "del target_df['uniprot']\n", "\n", "target_df['action'] = target_df['ACTION_TYPE'].str.lower()\n", "del target_df['ACTION_TYPE']\n", "\n", "target_df['pubmed_id'] = target_df.REFERENCE.str.extract('pubmed/([0-9]+)')\n", "\n", "target_df = target_df.drop_duplicates()\n", "target_df.head(2)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "DrugCentral (ChEMBL) 2922\n", "DrugCentral (literature) 182\n", "DrugCentral (label) 89\n", "DrugCentral (IUPHAR) 56\n", "DrugCentral (KEGG DRUG) 25\n", "Name: SOURCE, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_source_map = {\n", " 'CHEMBL': 'DrugCentral (ChEMBL)',\n", " 'SCIENTIFIC LITERATURE': 'DrugCentral (literature)',\n", " 'DRUG LABEL': 'DrugCentral (label)',\n", " 'IUPHAR': 'DrugCentral (IUPHAR)',\n", " 'KEGG DRUG': 'DrugCentral (KEGG DRUG)',\n", "}\n", "target_df.SOURCE = target_df.SOURCE.map(target_source_map)\n", "target_df.SOURCE.value_counts()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "def condense_targets(df):\n", " \"\"\"Condense drug-target relationships.\"\"\"\n", " row = pandas.Series()\n", " row['pubmed_ids'] = '|'.join(sorted(df.pubmed_id.dropna().unique()))\n", " row['sources'] = '|'.join(sorted(df.SOURCE.unique()))\n", " row['actions'] = '|'.join(sorted(df.action.unique()))\n", " row['urls'] = '|'.join(sorted(url for url in df.REFERENCE.unique() if not 'pubmed' in url))\n", " return row\n", " \n", "target_df = target_df.groupby(['GeneID', 'drugbank_id', 'drugbank_name']).apply(condense_targets).reset_index()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "target_df.to_csv('rephetio/targets.tsv', sep='\\t', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read and process DrugCentral Indications" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "path = 'drugtarget/drug_indication.tsv'\n", "indication_df = pandas.read_table(path, dtype={'SNOMEDCT_CUI': str})\n", "indication_df = indication_df.rename(columns={'DOID': 'subsumed_id'})\n", "indication_df = do_slim.merge(drugbank_df.merge(indication_df))\n", "del indication_df['DRUG_ID']\n", "indication_df = indication_df[['slim_id', 'drugbank_id', 'slim_name', 'drugbank_name']]\n", "indication_df = indication_df.rename(columns={'slim_id': 'doid_id', 'slim_name': 'disease', 'drugbank_name': 'drug'})\n", "indication_df = indication_df.sort_values(['disease', 'drug'])\n", "indication_df = indication_df.drop_duplicates()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Compare to PharmacotherapyDB" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doid_iddrugbank_iddiseasedrugcategory
0DOID:10652DB00843Alzheimer's diseaseDonepezilDM
1DOID:10652DB00674Alzheimer's diseaseGalantamineDM
\n", "
" ], "text/plain": [ " doid_id drugbank_id disease drug category\n", "0 DOID:10652 DB00843 Alzheimer's disease Donepezil DM\n", "1 DOID:10652 DB00674 Alzheimer's disease Galantamine DM" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = 'https://github.com/dhimmel/indications/raw/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv'\n", "phcoth_df = pandas.read_table(url)\n", "phcoth_df = phcoth_df[['doid_id', 'drugbank_id', 'category']]\n", "indication_df = indication_df.merge(phcoth_df, how='left')\n", "indication_df.head(2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "671" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(indication_df)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "DM 359\n", "NaN 210\n", "SYM 77\n", "NOT 25\n", "Name: category, dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "indication_df.category.value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "indication_df.to_csv('rephetio/indications.tsv', sep='\\t', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pharmacologic class" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugbank_iddrugbank_nameclass_idclass_name
0DB00014GoserelinN0000175655Gonadotropin Releasing Hormone Receptor Agonist
1DB00014GoserelinN0000175654Gonadotropin Releasing Hormone Receptor Agonists
\n", "
" ], "text/plain": [ " drugbank_id drugbank_name class_id \\\n", "0 DB00014 Goserelin N0000175655 \n", "1 DB00014 Goserelin N0000175654 \n", "\n", " class_name \n", "0 Gonadotropin Releasing Hormone Receptor Agonist \n", "1 Gonadotropin Releasing Hormone Receptor Agonists " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "path = 'drugtarget/pharm_class.tsv'\n", "class_df = pandas.read_table(path)\n", "class_df = drugbank_df.merge(class_df)\n", "classes_df = class_df[['TYPE', 'CLASS_SOURCE_ID', 'CLASS', 'SOURCE']].drop_duplicates()\n", "class_df = class_df[['drugbank_id', 'drugbank_name', 'CLASS_SOURCE_ID', 'CLASS']]\n", "class_df = class_df.rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name'})\n", "class_df = class_df.drop_duplicates()\n", "class_df.head(2)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1262" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pharmacologic mappings\n", "len(classes_df)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "10959" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Class to Drug mappings\n", "len(class_df)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
class_idclass_nameclass_sourceclass_typeurl
73CHEBI:21241vitamin CCHEBIApplicationhttp://identifiers.org/chebi/CHEBI%3A21241
4385CHEBI:22153acaricideCHEBIApplicationhttp://identifiers.org/chebi/CHEBI%3A22153
\n", "
" ], "text/plain": [ " class_id class_name class_source class_type \\\n", "73 CHEBI:21241 vitamin C CHEBI Application \n", "4385 CHEBI:22153 acaricide CHEBI Application \n", "\n", " url \n", "73 http://identifiers.org/chebi/CHEBI%3A21241 \n", "4385 http://identifiers.org/chebi/CHEBI%3A22153 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "class_type_map = {\n", " 'MoA': 'Mechanism of Action',\n", " 'PE': 'Physiologic Effect',\n", " 'CS': 'Chemical Structure',\n", " 'EPC': 'FDA Established Pharmacologic Class',\n", " 'PA': 'Pharmacological Action',\n", " 'has role': 'Application',\n", " 'Chemical/Ingredient': 'Chemical/Ingredient',\n", "}\n", "\n", "def get_class_url(class_source, class_id):\n", " \"\"\"Create URLs for pharmacological classes based on their source\"\"\"\n", " class_id = urllib.parse.quote(class_id)\n", " if class_source == 'CHEBI':\n", " return 'http://identifiers.org/chebi/{}'.format(class_id)\n", " if class_source == 'MeSH':\n", " return 'http://identifiers.org/mesh/{}'.format(class_id)\n", " if class_source == 'FDA':\n", " #return 'https://rxnav.nlm.nih.gov/REST/Ndfrt/id?idType=NUI&idString={}'.format(class_id)\n", " # Use bioportal link until something better arises\n", " return 'http://purl.bioontology.org/ontology/NDFRT/{}'.format(class_id)\n", "\n", "classes_df['class_type'] = classes_df.TYPE.map(class_type_map)\n", "del classes_df['TYPE']\n", "classes_df = classes_df.sort_values(['class_type', 'CLASS_SOURCE_ID'])\n", "classes_df = classes_df.rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name', 'SOURCE': 'class_source'})\n", "classes_df['url'] = classes_df.apply(lambda x: get_class_url(x.class_source, x.class_id), axis='columns')\n", "classes_df.head(2)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "class_df.to_csv('rephetio/drug-to-class.tsv', sep='\\t', index=False)\n", "classes_df.to_csv('rephetio/classes.tsv', sep='\\t', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }