{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Run using Python 3 to avoid a non-ascii character error when writing to file with the csv module." ] }, { "cell_type": "code", "execution_count": 210, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "import csv\n", "import gzip\n", "import collections\n", "import re\n", "import io\n", "\n", "import xml.etree.ElementTree as ET\n", "\n", "import requests\n", "import pandas" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "xml_path = os.path.join('download', 'drugbank.xml.gz')\n", "with gzip.open(xml_path) as xml_file:\n", " tree = ET.parse(xml_file)\n", "root = tree.getroot()" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ns = '{http://www.drugbank.ca}'\n", "inchikey_template = \"{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value\"\n", "inchi_template = \"{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value\"\n", "\n", "rows = list()\n", "for i, drug in enumerate(root):\n", " row = collections.OrderedDict()\n", " assert drug.tag == ns + 'drug'\n", " row['type'] = drug.get('type')\n", " row['drugbank_id'] = drug.findtext(ns + \"drugbank-id[@primary='true']\")\n", " row['name'] = drug.findtext(ns + \"name\")\n", " row['groups'] = [group.text for group in\n", " drug.findall(\"{ns}groups/{ns}group\".format(ns = ns))]\n", " row['atc_codes'] = [code.get('code') for code in\n", " drug.findall(\"{ns}atc-codes/{ns}atc-code\".format(ns = ns))]\n", " row['categories'] = [x.findtext(ns + 'category') for x in\n", " drug.findall(\"{ns}categories/{ns}category\".format(ns = ns))]\n", " row['inchi'] = drug.findtext(inchi_template.format(ns = ns))\n", " row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))\n", " rows.append(row)" ] }, { "cell_type": "code", "execution_count": 121, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def collapse_list_values(row):\n", " for key, value in row.items():\n", " if isinstance(value, list):\n", " row[key] = '|'.join(value)\n", " return row\n", "\n", "rows = list(map(collapse_list_values, rows))" ] }, { "cell_type": "code", "execution_count": 122, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugbank_idnametypegroupsatc_codescategoriesinchikeyinchi
0DB00001LepirudinbiotechapprovedB01AE02Antithrombins|Fibrinolytic AgentsNoneNone
1DB00002CetuximabbiotechapprovedL01XC06Antineoplastic AgentsNoneNone
2DB00003Dornase alfabiotechapprovedR05CB13EnzymesNoneNone
3DB00004Denileukin diftitoxbiotechapproved|investigationalL01XX29Antineoplastic AgentsNoneNone
4DB00005Etanerceptbiotechapproved|investigationalL04AB01Immunosuppressive AgentsNoneNone
\n", "
" ], "text/plain": [ " drugbank_id name type groups \\\n", "0 DB00001 Lepirudin biotech approved \n", "1 DB00002 Cetuximab biotech approved \n", "2 DB00003 Dornase alfa biotech approved \n", "3 DB00004 Denileukin diftitox biotech approved|investigational \n", "4 DB00005 Etanercept biotech approved|investigational \n", "\n", " atc_codes categories inchikey inchi \n", "0 B01AE02 Antithrombins|Fibrinolytic Agents None None \n", "1 L01XC06 Antineoplastic Agents None None \n", "2 R05CB13 Enzymes None None \n", "3 L01XX29 Antineoplastic Agents None None \n", "4 L04AB01 Immunosuppressive Agents None None " ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi']\n", "drugbank_df = pandas.DataFrame.from_dict(rows)[columns]\n", "drugbank_df.head()" ] }, { "cell_type": "code", "execution_count": 123, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugbank_idnametypegroupsatc_codescategoriesinchikeyinchi
13DB00014Goserelinsmall moleculeapprovedL02AE03InChIKey=BLCLNMBMMGCOAS-URPVMXJPSA-NInChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...
34DB00035Desmopressinsmall moleculeapprovedH01BA02Antidiuretic Agents|Hemostatics|Renal AgentsInChIKey=NFLWUMRGJYTJIN-NXBWRCJVSA-NInChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...
48DB00050Cetrorelixsmall moleculeapproved|investigationalH01CC02Hormone Antagonists|Fertility AgentsInChIKey=SBNPWPIBESPSIF-MHWMIDJBSA-NInChI=1S/C70H92ClN17O14/c1-39(2)31-52(61(94)82...
86DB00091Cyclosporinesmall moleculeapproved|investigationalL04AD01|S01XA18Antirheumatic Agents|Dermatologic Agents|Immun...InChIKey=PMATZTZNYRCHOR-IMVLJIQENA-NInChI=1/C62H111N11O12/c1-25-27-28-40(15)52(75)...
88DB00093Felypressinsmall moleculeapprovedVasoconstrictor Agents|Renal AgentsInChIKey=SFKQVVDKFKYTNA-YVGXZPIDNA-NInChI=1/C46H65N13O11S2/c47-18-8-7-14-29(40(64)...
\n", "
" ], "text/plain": [ " drugbank_id name type groups \\\n", "13 DB00014 Goserelin small molecule approved \n", "34 DB00035 Desmopressin small molecule approved \n", "48 DB00050 Cetrorelix small molecule approved|investigational \n", "86 DB00091 Cyclosporine small molecule approved|investigational \n", "88 DB00093 Felypressin small molecule approved \n", "\n", " atc_codes categories \\\n", "13 L02AE03 \n", "34 H01BA02 Antidiuretic Agents|Hemostatics|Renal Agents \n", "48 H01CC02 Hormone Antagonists|Fertility Agents \n", "86 L04AD01|S01XA18 Antirheumatic Agents|Dermatologic Agents|Immun... \n", "88 Vasoconstrictor Agents|Renal Agents \n", "\n", " inchikey \\\n", "13 InChIKey=BLCLNMBMMGCOAS-URPVMXJPSA-N \n", "34 InChIKey=NFLWUMRGJYTJIN-NXBWRCJVSA-N \n", "48 InChIKey=SBNPWPIBESPSIF-MHWMIDJBSA-N \n", "86 InChIKey=PMATZTZNYRCHOR-IMVLJIQENA-N \n", "88 InChIKey=SFKQVVDKFKYTNA-YVGXZPIDNA-N \n", "\n", " inchi \n", "13 InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3... \n", "34 InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(... \n", "48 InChI=1S/C70H92ClN17O14/c1-39(2)31-52(61(94)82... \n", "86 InChI=1/C62H111N11O12/c1-25-27-28-40(15)52(75)... \n", "88 InChI=1/C46H65N13O11S2/c47-18-8-7-14-29(40(64)... " ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drugbank_slim_df = drugbank_df[\n", " drugbank_df.groups.map(lambda x: 'approved' in x) &\n", " drugbank_df.inchi.map(lambda x: x is not None) &\n", " drugbank_df.type.map(lambda x: x == 'small molecule')\n", "]\n", "drugbank_slim_df.head()" ] }, { "cell_type": "code", "execution_count": 124, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# write drugbank tsv\n", "path = os.path.join('data', 'drugbank.tsv')\n", "drugbank_df.to_csv(path, sep='\\t', index=False)\n", "\n", "# write slim drugbank tsv\n", "path = os.path.join('data', 'drugbank-slim.tsv')\n", "drugbank_slim_df.to_csv(path, sep='\\t', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 238, "metadata": { "collapsed": false }, "outputs": [], "source": [ "protein_rows = list()\n", "for i, drug in enumerate(root):\n", " drugbank_id = drug.findtext(ns + \"drugbank-id[@primary='true']\")\n", " for category in ['target', 'enzyme', 'carrier', 'transporter']:\n", " proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))\n", " for protein in proteins:\n", " row = {'drugbank_id': drugbank_id, 'category': category}\n", " row['organism'] = protein.findtext('{}organism'.format(ns))\n", " row['known_action'] = protein.findtext('{}known-action'.format(ns))\n", " actions = protein.findall('{ns}actions/{ns}action'.format(ns=ns))\n", " row['actions'] = '|'.join(action.text for action in actions)\n", " uniprot_ids = [polypep.text for polypep in protein.findall(\n", " \"{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier\".format(ns=ns))] \n", " if len(uniprot_ids) != 1: continue\n", " row['uniprot_id'] = uniprot_ids[0]\n", " ref_text = protein.findtext(\"{ns}references[@format='textile']\".format(ns=ns))\n", " pmids = re.findall(r'pubmed/([0-9]+)', ref_text)\n", " row['pubmed_ids'] = '|'.join(pmids)\n", " protein_rows.append(row)\n", "\n", "protein_df = pandas.DataFrame.from_dict(protein_rows)" ] }, { "cell_type": "code", "execution_count": 239, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Read our uniprot to entrez_gene mapping\n", "response = requests.get('http://git.dhimmel.com/uniprot/data/map/GeneID.tsv.gz', stream=True)\n", "text = io.TextIOWrapper(gzip.GzipFile(fileobj=response.raw))\n", "uniprot_df = pandas.read_table(text, engine='python')\n", "uniprot_df.rename(columns={'uniprot': 'uniprot_id', 'GeneID': 'entrez_gene_id'}, inplace=True)\n", "\n", "# merge uniprot mapping with protein_df\n", "entrez_df = protein_df.merge(uniprot_df, how='inner')" ] }, { "cell_type": "code", "execution_count": 240, "metadata": { "collapsed": false }, "outputs": [], "source": [ "columns = ['drugbank_id', 'category', 'uniprot_id', 'entrez_gene_id', 'organism',\n", " 'known_action', 'actions', 'pubmed_ids']\n", "entrez_df = entrez_df[columns]" ] }, { "cell_type": "code", "execution_count": 241, "metadata": { "collapsed": false }, "outputs": [], "source": [ "path = os.path.join('data', 'proteins.tsv')\n", "entrez_df.to_csv(path, sep='\\t', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.0" } }, "nbformat": 4, "nbformat_minor": 0 }