{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run this notebook using Python 3 to avoid a non-ASCII character error when writing to a file with the csv module."
]
},
{
"cell_type": "code",
"execution_count": 210,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import csv\n",
"import gzip\n",
"import collections\n",
"import re\n",
"import io\n",
"\n",
"import xml.etree.ElementTree as ET\n",
"\n",
"import requests\n",
"import pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Parse the gzip-compressed DrugBank XML dump into an ElementTree.\n",
"xml_path = os.path.join('download', 'drugbank.xml.gz')\n",
"with gzip.open(xml_path, mode='rb') as xml_file:\n",
"    tree = ET.parse(xml_file)\n",
"    root = tree.getroot()"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"ns = '{http://www.drugbank.ca}'\n",
"inchikey_template = \"{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value\"\n",
"inchi_template = \"{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value\"\n",
"\n",
"# Build the namespaced XPath expressions once; they do not vary per drug.\n",
"primary_id_path = ns + \"drugbank-id[@primary='true']\"\n",
"group_path = '{ns}groups/{ns}group'.format(ns=ns)\n",
"atc_code_path = '{ns}atc-codes/{ns}atc-code'.format(ns=ns)\n",
"category_path = '{ns}categories/{ns}category'.format(ns=ns)\n",
"inchi_path = inchi_template.format(ns=ns)\n",
"inchikey_path = inchikey_template.format(ns=ns)\n",
"\n",
"# One ordered record per child element of the XML root.\n",
"rows = list()\n",
"for drug in root:\n",
"    # Fail fast if the root holds anything other than drug elements.\n",
"    assert drug.tag == ns + 'drug'\n",
"    row = collections.OrderedDict()\n",
"    row['type'] = drug.get('type')\n",
"    row['drugbank_id'] = drug.findtext(primary_id_path)\n",
"    row['name'] = drug.findtext(ns + 'name')\n",
"    # Multi-valued fields are kept as lists at this stage.\n",
"    row['groups'] = [group.text for group in drug.findall(group_path)]\n",
"    row['atc_codes'] = [code.get('code') for code in drug.findall(atc_code_path)]\n",
"    row['categories'] = [x.findtext(ns + 'category') for x in drug.findall(category_path)]\n",
"    # findtext returns None when the drug has no such calculated property.\n",
"    row['inchi'] = drug.findtext(inchi_path)\n",
"    row['inchikey'] = drug.findtext(inchikey_path)\n",
"    rows.append(row)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def collapse_list_values(row):\n",
"    \"\"\"Join every list value of ``row`` into one pipe-delimited string.\n",
"\n",
"    The mapping is modified in place and also returned, so the function\n",
"    can be used with ``map``. ``None`` entries inside a list are skipped,\n",
"    because ``str.join`` would otherwise raise ``TypeError``. Values that\n",
"    are not lists are left untouched.\n",
"    \"\"\"\n",
"    for key, value in row.items():\n",
"        if isinstance(value, list):\n",
"            # Only existing keys are reassigned, so iterating stays safe.\n",
"            row[key] = '|'.join(item for item in value if item is not None)\n",
"    return row\n",
"\n",
"# Collapse the list-valued fields of every row into pipe-delimited strings.\n",
"rows = [collapse_list_values(row) for row in rows]"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" drugbank_id | \n",
" name | \n",
" type | \n",
" groups | \n",
" atc_codes | \n",
" categories | \n",
" inchikey | \n",
" inchi | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DB00001 | \n",
" Lepirudin | \n",
" biotech | \n",
" approved | \n",
" B01AE02 | \n",
" Antithrombins|Fibrinolytic Agents | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 1 | \n",
" DB00002 | \n",
" Cetuximab | \n",
" biotech | \n",
" approved | \n",
" L01XC06 | \n",
" Antineoplastic Agents | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 2 | \n",
" DB00003 | \n",
" Dornase alfa | \n",
" biotech | \n",
" approved | \n",
" R05CB13 | \n",
" Enzymes | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" DB00004 | \n",
" Denileukin diftitox | \n",
" biotech | \n",
" approved|investigational | \n",
" L01XX29 | \n",
" Antineoplastic Agents | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 4 | \n",
" DB00005 | \n",
" Etanercept | \n",
" biotech | \n",
" approved|investigational | \n",
" L04AB01 | \n",
" Immunosuppressive Agents | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" drugbank_id name type groups \\\n",
"0 DB00001 Lepirudin biotech approved \n",
"1 DB00002 Cetuximab biotech approved \n",
"2 DB00003 Dornase alfa biotech approved \n",
"3 DB00004 Denileukin diftitox biotech approved|investigational \n",
"4 DB00005 Etanercept biotech approved|investigational \n",
"\n",
" atc_codes categories inchikey inchi \n",
"0 B01AE02 Antithrombins|Fibrinolytic Agents None None \n",
"1 L01XC06 Antineoplastic Agents None None \n",
"2 R05CB13 Enzymes None None \n",
"3 L01XX29 Antineoplastic Agents None None \n",
"4 L04AB01 Immunosuppressive Agents None None "
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Assemble the per-drug records into a table with a fixed column order.\n",
"columns = [\n",
"    'drugbank_id', 'name', 'type', 'groups',\n",
"    'atc_codes', 'categories', 'inchikey', 'inchi',\n",
"]\n",
"drugbank_df = pandas.DataFrame(rows)\n",
"drugbank_df = drugbank_df[columns]\n",
"drugbank_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" drugbank_id | \n",
" name | \n",
" type | \n",
" groups | \n",
" atc_codes | \n",
" categories | \n",
" inchikey | \n",
" inchi | \n",
"
\n",
" \n",
" \n",
" \n",
" 13 | \n",
" DB00014 | \n",
" Goserelin | \n",
" small molecule | \n",
" approved | \n",
" L02AE03 | \n",
" | \n",
" InChIKey=BLCLNMBMMGCOAS-URPVMXJPSA-N | \n",
" InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3... | \n",
"
\n",
" \n",
" 34 | \n",
" DB00035 | \n",
" Desmopressin | \n",
" small molecule | \n",
" approved | \n",
" H01BA02 | \n",
" Antidiuretic Agents|Hemostatics|Renal Agents | \n",
" InChIKey=NFLWUMRGJYTJIN-NXBWRCJVSA-N | \n",
" InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(... | \n",
"
\n",
" \n",
" 48 | \n",
" DB00050 | \n",
" Cetrorelix | \n",
" small molecule | \n",
" approved|investigational | \n",
" H01CC02 | \n",
" Hormone Antagonists|Fertility Agents | \n",
" InChIKey=SBNPWPIBESPSIF-MHWMIDJBSA-N | \n",
" InChI=1S/C70H92ClN17O14/c1-39(2)31-52(61(94)82... | \n",
"
\n",
" \n",
" 86 | \n",
" DB00091 | \n",
" Cyclosporine | \n",
" small molecule | \n",
" approved|investigational | \n",
" L04AD01|S01XA18 | \n",
" Antirheumatic Agents|Dermatologic Agents|Immun... | \n",
" InChIKey=PMATZTZNYRCHOR-IMVLJIQENA-N | \n",
" InChI=1/C62H111N11O12/c1-25-27-28-40(15)52(75)... | \n",
"
\n",
" \n",
" 88 | \n",
" DB00093 | \n",
" Felypressin | \n",
" small molecule | \n",
" approved | \n",
" | \n",
" Vasoconstrictor Agents|Renal Agents | \n",
" InChIKey=SFKQVVDKFKYTNA-YVGXZPIDNA-N | \n",
" InChI=1/C46H65N13O11S2/c47-18-8-7-14-29(40(64)... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" drugbank_id name type groups \\\n",
"13 DB00014 Goserelin small molecule approved \n",
"34 DB00035 Desmopressin small molecule approved \n",
"48 DB00050 Cetrorelix small molecule approved|investigational \n",
"86 DB00091 Cyclosporine small molecule approved|investigational \n",
"88 DB00093 Felypressin small molecule approved \n",
"\n",
" atc_codes categories \\\n",
"13 L02AE03 \n",
"34 H01BA02 Antidiuretic Agents|Hemostatics|Renal Agents \n",
"48 H01CC02 Hormone Antagonists|Fertility Agents \n",
"86 L04AD01|S01XA18 Antirheumatic Agents|Dermatologic Agents|Immun... \n",
"88 Vasoconstrictor Agents|Renal Agents \n",
"\n",
" inchikey \\\n",
"13 InChIKey=BLCLNMBMMGCOAS-URPVMXJPSA-N \n",
"34 InChIKey=NFLWUMRGJYTJIN-NXBWRCJVSA-N \n",
"48 InChIKey=SBNPWPIBESPSIF-MHWMIDJBSA-N \n",
"86 InChIKey=PMATZTZNYRCHOR-IMVLJIQENA-N \n",
"88 InChIKey=SFKQVVDKFKYTNA-YVGXZPIDNA-N \n",
"\n",
" inchi \n",
"13 InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3... \n",
"34 InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(... \n",
"48 InChI=1S/C70H92ClN17O14/c1-39(2)31-52(61(94)82... \n",
"86 InChI=1/C62H111N11O12/c1-25-27-28-40(15)52(75)... \n",
"88 InChI=1/C46H65N13O11S2/c47-18-8-7-14-29(40(64)... "
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Restrict to approved small molecules that have an InChI structure.\n",
"# `groups` is a pipe-delimited string, so split it before testing membership:\n",
"# a plain substring test would also match any other group name that merely\n",
"# contains the text 'approved' (e.g. DrugBank's 'vet_approved').\n",
"is_approved = drugbank_df['groups'].map(lambda groups: 'approved' in groups.split('|'))\n",
"has_inchi = drugbank_df['inchi'].notnull()\n",
"is_small_molecule = drugbank_df['type'] == 'small molecule'\n",
"drugbank_slim_df = drugbank_df[is_approved & has_inchi & is_small_molecule]\n",
"drugbank_slim_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Export the full and the slim table as tab-separated files under data/,\n",
"# leaving out the row index.\n",
"tsv_outputs = [\n",
"    ('drugbank.tsv', drugbank_df),\n",
"    ('drugbank-slim.tsv', drugbank_slim_df),\n",
"]\n",
"for filename, frame in tsv_outputs:\n",
"    path = os.path.join('data', filename)\n",
"    frame.to_csv(path, sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 238,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Collect one record per drug-protein relationship across the four\n",
"# relationship categories. The XPath expressions are built once up front\n",
"# because they do not change between drugs.\n",
"uniprot_path = (\n",
"    \"{ns}polypeptide/{ns}external-identifiers/\"\n",
"    \"{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier\"\n",
").format(ns=ns)\n",
"actions_path = '{ns}actions/{ns}action'.format(ns=ns)\n",
"references_path = \"{ns}references[@format='textile']\".format(ns=ns)\n",
"\n",
"protein_rows = list()\n",
"for drug in root:\n",
"    drugbank_id = drug.findtext(ns + \"drugbank-id[@primary='true']\")\n",
"    for category in ['target', 'enzyme', 'carrier', 'transporter']:\n",
"        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))\n",
"        for protein in proteins:\n",
"            # Keep only proteins with exactly one UniProtKB identifier.\n",
"            uniprot_ids = [polypep.text for polypep in protein.findall(uniprot_path)]\n",
"            if len(uniprot_ids) != 1:\n",
"                continue\n",
"            row = {'drugbank_id': drugbank_id, 'category': category}\n",
"            row['organism'] = protein.findtext('{}organism'.format(ns))\n",
"            row['known_action'] = protein.findtext('{}known-action'.format(ns))\n",
"            actions = protein.findall(actions_path)\n",
"            # An action element without text has text None; skip it so that\n",
"            # str.join does not raise TypeError.\n",
"            row['actions'] = '|'.join(\n",
"                action.text for action in actions if action.text is not None)\n",
"            row['uniprot_id'] = uniprot_ids[0]\n",
"            # findtext returns None when no matching references element\n",
"            # exists; default to '' so re.findall always receives a string.\n",
"            ref_text = protein.findtext(references_path, default='')\n",
"            pmids = re.findall(r'pubmed/([0-9]+)', ref_text)\n",
"            row['pubmed_ids'] = '|'.join(pmids)\n",
"            protein_rows.append(row)\n",
"\n",
"protein_df = pandas.DataFrame.from_dict(protein_rows)"
]
},
{
"cell_type": "code",
"execution_count": 239,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Read our uniprot to entrez_gene mapping\n",
"response = requests.get('http://git.dhimmel.com/uniprot/data/map/GeneID.tsv.gz', stream=True)\n",
"# Fail with a clear HTTP error rather than an opaque gzip/parse error when\n",
"# the download did not succeed.\n",
"response.raise_for_status()\n",
"text = io.TextIOWrapper(gzip.GzipFile(fileobj=response.raw))\n",
"uniprot_df = pandas.read_table(text, engine='python')\n",
"# Rename by assignment instead of inplace=True so the step reads as a plain\n",
"# transformation of the freshly loaded table.\n",
"uniprot_df = uniprot_df.rename(columns={'uniprot': 'uniprot_id', 'GeneID': 'entrez_gene_id'})\n",
"\n",
"# merge uniprot mapping with protein_df\n",
"# No `on=` is given, so pandas joins on the columns common to both frames\n",
"# (presumably only uniprot_id -- TODO confirm against the mapping file).\n",
"entrez_df = protein_df.merge(uniprot_df, how='inner')"
]
},
{
"cell_type": "code",
"execution_count": 240,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep only the output columns, in their final order.\n",
"columns = [\n",
"    'drugbank_id',\n",
"    'category',\n",
"    'uniprot_id',\n",
"    'entrez_gene_id',\n",
"    'organism',\n",
"    'known_action',\n",
"    'actions',\n",
"    'pubmed_ids',\n",
"]\n",
"entrez_df = entrez_df[columns]"
]
},
{
"cell_type": "code",
"execution_count": 241,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write the drug-protein table as a tab-separated file, without the index.\n",
"path = os.path.join('data', 'proteins.tsv')\n",
"entrez_df.to_csv(path_or_buf=path, sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}