{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Convert DrugCentral relationships to Rephetio identifiers"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import urllib\n",
"import json\n",
"\n",
"import pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read DO Slim - the disease subset used for rephetio"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" slim_id | \n",
" slim_name | \n",
" subsumed_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DOID:0050156 | \n",
" idiopathic pulmonary fibrosis | \n",
" DOID:0050156 | \n",
"
\n",
" \n",
" 1 | \n",
" DOID:0050425 | \n",
" restless legs syndrome | \n",
" DOID:0050425 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" slim_id slim_name subsumed_id\n",
"0 DOID:0050156 idiopathic pulmonary fibrosis DOID:0050156\n",
"1 DOID:0050425 restless legs syndrome DOID:0050425"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = 'https://github.com/dhimmel/disease-ontology/raw/5cb93c38568536222b0a14fbcb7fb644a348931d/data/slim-terms-prop.tsv'\n",
"do_slim = pandas.read_table(url)\n",
"do_slim = do_slim[['slim_id', 'slim_name', 'subsumed_id']]\n",
"do_slim.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read UniProt to Entrez Gene mapping"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" uniprot | \n",
" GeneID | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A0A010PZJ8 | \n",
" 19039206 | \n",
"
\n",
" \n",
" 1 | \n",
" A0A010PZK3 | \n",
" 19039211 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" uniprot GeneID\n",
"0 A0A010PZJ8 19039206\n",
"1 A0A010PZK3 19039211"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz'\n",
"entrez_map_df = pandas.read_table(url, compression='gzip')\n",
"entrez_map_df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read DrugBank Slim"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" drugbank_id | \n",
" drugbank_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DB00014 | \n",
" Goserelin | \n",
"
\n",
" \n",
" 1 | \n",
" DB00035 | \n",
" Desmopressin | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" drugbank_id drugbank_name\n",
"0 DB00014 Goserelin\n",
"1 DB00035 Desmopressin"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = 'https://github.com/dhimmel/drugbank/raw/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank-slim.tsv'\n",
"drugbank_df = pandas.read_table(url)\n",
"drugbank_df = drugbank_df[['drugbank_id', 'name']]\n",
"drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})\n",
"drugbank_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1552"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(drugbank_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read identifiers"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" DRUG_ID | \n",
" drugbank_id | \n",
" drugbank_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1327 | \n",
" DB00014 | \n",
" Goserelin | \n",
"
\n",
" \n",
" 1 | \n",
" 817 | \n",
" DB00035 | \n",
" Desmopressin | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" DRUG_ID drugbank_id drugbank_name\n",
"0 1327 DB00014 Goserelin\n",
"1 817 DB00035 Desmopressin"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = 'drugtarget/identifiers.tsv'\n",
"id_df = pandas.read_table(path)\n",
"id_df = id_df.query(\"ID_TYPE == 'DRUGBANK_ID'\")[['DRUG_ID', 'IDENTIFIER']]\n",
"id_df = id_df.rename(columns={'IDENTIFIER': 'drugbank_id'})\n",
"drugbank_df = id_df.merge(drugbank_df)\n",
"drugbank_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1634"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(drugbank_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert drug targets"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GeneID | \n",
" drugbank_id | \n",
" drugbank_name | \n",
" TARGET_NAME | \n",
" TARGET_FAMILY | \n",
" SOURCE | \n",
" REFERENCE | \n",
" action | \n",
" pubmed_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 8233868 | \n",
" DB00431 | \n",
" Lindane | \n",
" GABA-A receptor | \n",
" Ion channel | \n",
" CHEMBL | \n",
" https://www.ebi.ac.uk/chembl/compound/inspect/... | \n",
" negative allosteric modulator | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 8232849 | \n",
" DB08823 | \n",
" Spinosad | \n",
" Nicotinic acetylcholine receptor | \n",
" Ion channel | \n",
" CHEMBL | \n",
" https://www.ebi.ac.uk/chembl/compound/inspect/... | \n",
" agonist | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GeneID drugbank_id drugbank_name TARGET_NAME \\\n",
"0 8233868 DB00431 Lindane GABA-A receptor \n",
"1 8232849 DB08823 Spinosad Nicotinic acetylcholine receptor \n",
"\n",
" TARGET_FAMILY SOURCE REFERENCE \\\n",
"0 Ion channel CHEMBL https://www.ebi.ac.uk/chembl/compound/inspect/... \n",
"1 Ion channel CHEMBL https://www.ebi.ac.uk/chembl/compound/inspect/... \n",
"\n",
" action pubmed_id \n",
"0 negative allosteric modulator NaN \n",
"1 agonist NaN "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = 'drugtarget/drug_target.tsv'\n",
"target_df = pandas.read_table(path)\n",
"target_df = drugbank_df.merge(target_df)\n",
"target_df = target_df[['drugbank_id', 'drugbank_name', 'TARGET_NAME', 'TARGET_FAMILY', 'UNIPROT', 'ACTION_TYPE', 'SOURCE', 'REFERENCE']]\n",
"\n",
"# Split multi-protein targets into many rows\n",
"s = target_df.UNIPROT.str.split('|').apply(pandas.Series, 1).stack()\n",
"s.index = s.index.droplevel(-1)\n",
"s.name ='uniprot'\n",
"del target_df['UNIPROT']\n",
"target_df = target_df.join(s)\n",
"\n",
"target_df = entrez_map_df.merge(target_df)\n",
"del target_df['uniprot']\n",
"\n",
"target_df['action'] = target_df['ACTION_TYPE'].str.lower()\n",
"del target_df['ACTION_TYPE']\n",
"\n",
"target_df['pubmed_id'] = target_df.REFERENCE.str.extract('pubmed/([0-9]+)')\n",
"\n",
"target_df = target_df.drop_duplicates()\n",
"target_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"DrugCentral (ChEMBL) 2922\n",
"DrugCentral (literature) 182\n",
"DrugCentral (label) 89\n",
"DrugCentral (IUPHAR) 56\n",
"DrugCentral (KEGG DRUG) 25\n",
"Name: SOURCE, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_source_map = {\n",
" 'CHEMBL': 'DrugCentral (ChEMBL)',\n",
" 'SCIENTIFIC LITERATURE': 'DrugCentral (literature)',\n",
" 'DRUG LABEL': 'DrugCentral (label)',\n",
" 'IUPHAR': 'DrugCentral (IUPHAR)',\n",
" 'KEGG DRUG': 'DrugCentral (KEGG DRUG)',\n",
"}\n",
"target_df.SOURCE = target_df.SOURCE.map(target_source_map)\n",
"target_df.SOURCE.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"def condense_targets(df):\n",
" \"\"\"Condense drug-target relationships.\"\"\"\n",
" row = pandas.Series()\n",
" row['pubmed_ids'] = '|'.join(sorted(df.pubmed_id.dropna().unique()))\n",
" row['sources'] = '|'.join(sorted(df.SOURCE.unique()))\n",
" row['actions'] = '|'.join(sorted(df.action.unique()))\n",
" row['urls'] = '|'.join(sorted(url for url in df.REFERENCE.unique() if not 'pubmed' in url))\n",
" return row\n",
" \n",
"target_df = target_df.groupby(['GeneID', 'drugbank_id', 'drugbank_name']).apply(condense_targets).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"target_df.to_csv('rephetio/targets.tsv', sep='\\t', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read and process DrugCentral Indications"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"path = 'drugtarget/drug_indication.tsv'\n",
"indication_df = pandas.read_table(path, dtype={'SNOMEDCT_CUI': str})\n",
"indication_df = indication_df.rename(columns={'DOID': 'subsumed_id'})\n",
"indication_df = do_slim.merge(drugbank_df.merge(indication_df))\n",
"del indication_df['DRUG_ID']\n",
"indication_df = indication_df[['slim_id', 'drugbank_id', 'slim_name', 'drugbank_name']]\n",
"indication_df = indication_df.rename(columns={'slim_id': 'doid_id', 'slim_name': 'disease', 'drugbank_name': 'drug'})\n",
"indication_df = indication_df.sort_values(['disease', 'drug'])\n",
"indication_df = indication_df.drop_duplicates()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compare to PharmacotherapyDB"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doid_id | \n",
" drugbank_id | \n",
" disease | \n",
" drug | \n",
" category | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DOID:10652 | \n",
" DB00843 | \n",
" Alzheimer's disease | \n",
" Donepezil | \n",
" DM | \n",
"
\n",
" \n",
" 1 | \n",
" DOID:10652 | \n",
" DB00674 | \n",
" Alzheimer's disease | \n",
" Galantamine | \n",
" DM | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doid_id drugbank_id disease drug category\n",
"0 DOID:10652 DB00843 Alzheimer's disease Donepezil DM\n",
"1 DOID:10652 DB00674 Alzheimer's disease Galantamine DM"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = 'https://github.com/dhimmel/indications/raw/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv'\n",
"phcoth_df = pandas.read_table(url)\n",
"phcoth_df = phcoth_df[['doid_id', 'drugbank_id', 'category']]\n",
"indication_df = indication_df.merge(phcoth_df, how='left')\n",
"indication_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"671"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(indication_df)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"DM 359\n",
"NaN 210\n",
"SYM 77\n",
"NOT 25\n",
"Name: category, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indication_df.category.value_counts(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"indication_df.to_csv('rephetio/indications.tsv', sep='\\t', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pharmacologic class"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" drugbank_id | \n",
" drugbank_name | \n",
" class_id | \n",
" class_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DB00014 | \n",
" Goserelin | \n",
" N0000175655 | \n",
" Gonadotropin Releasing Hormone Receptor Agonist | \n",
"
\n",
" \n",
" 1 | \n",
" DB00014 | \n",
" Goserelin | \n",
" N0000175654 | \n",
" Gonadotropin Releasing Hormone Receptor Agonists | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" drugbank_id drugbank_name class_id \\\n",
"0 DB00014 Goserelin N0000175655 \n",
"1 DB00014 Goserelin N0000175654 \n",
"\n",
" class_name \n",
"0 Gonadotropin Releasing Hormone Receptor Agonist \n",
"1 Gonadotropin Releasing Hormone Receptor Agonists "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = 'drugtarget/pharm_class.tsv'\n",
"class_df = pandas.read_table(path)\n",
"class_df = drugbank_df.merge(class_df)\n",
"classes_df = class_df[['TYPE', 'CLASS_SOURCE_ID', 'CLASS', 'SOURCE']].drop_duplicates()\n",
"class_df = class_df[['drugbank_id', 'drugbank_name', 'CLASS_SOURCE_ID', 'CLASS']]\n",
"class_df = class_df.rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name'})\n",
"class_df = class_df.drop_duplicates()\n",
"class_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1262"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pharmacologic mappings\n",
"len(classes_df)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"10959"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Class to Drug mappings\n",
"len(class_df)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" class_id | \n",
" class_name | \n",
" class_source | \n",
" class_type | \n",
" url | \n",
"
\n",
" \n",
" \n",
" \n",
" 73 | \n",
" CHEBI:21241 | \n",
" vitamin C | \n",
" CHEBI | \n",
" Application | \n",
" http://identifiers.org/chebi/CHEBI%3A21241 | \n",
"
\n",
" \n",
" 4385 | \n",
" CHEBI:22153 | \n",
" acaricide | \n",
" CHEBI | \n",
" Application | \n",
" http://identifiers.org/chebi/CHEBI%3A22153 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" class_id class_name class_source class_type \\\n",
"73 CHEBI:21241 vitamin C CHEBI Application \n",
"4385 CHEBI:22153 acaricide CHEBI Application \n",
"\n",
" url \n",
"73 http://identifiers.org/chebi/CHEBI%3A21241 \n",
"4385 http://identifiers.org/chebi/CHEBI%3A22153 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_type_map = {\n",
" 'MoA': 'Mechanism of Action',\n",
" 'PE': 'Physiologic Effect',\n",
" 'CS': 'Chemical Structure',\n",
" 'EPC': 'FDA Established Pharmacologic Class',\n",
" 'PA': 'Pharmacological Action',\n",
" 'has role': 'Application',\n",
" 'Chemical/Ingredient': 'Chemical/Ingredient',\n",
"}\n",
"\n",
"def get_class_url(class_source, class_id):\n",
" \"\"\"Create URLs for pharmacological classes based on their source\"\"\"\n",
" class_id = urllib.parse.quote(class_id)\n",
" if class_source == 'CHEBI':\n",
" return 'http://identifiers.org/chebi/{}'.format(class_id)\n",
" if class_source == 'MeSH':\n",
" return 'http://identifiers.org/mesh/{}'.format(class_id)\n",
" if class_source == 'FDA':\n",
" #return 'https://rxnav.nlm.nih.gov/REST/Ndfrt/id?idType=NUI&idString={}'.format(class_id)\n",
" # Use bioportal link until something better arises\n",
" return 'http://purl.bioontology.org/ontology/NDFRT/{}'.format(class_id)\n",
"\n",
"classes_df['class_type'] = classes_df.TYPE.map(class_type_map)\n",
"del classes_df['TYPE']\n",
"classes_df = classes_df.sort_values(['class_type', 'CLASS_SOURCE_ID'])\n",
"classes_df = classes_df.rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name', 'SOURCE': 'class_source'})\n",
"classes_df['url'] = classes_df.apply(lambda x: get_class_url(x.class_source, x.class_id), axis='columns')\n",
"classes_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class_df.to_csv('rephetio/drug-to-class.tsv', sep='\\t', index=False)\n",
"classes_df.to_csv('rephetio/classes.tsv', sep='\\t', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}