{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run using Python 3 to avoid a non-ascii character error when writing to file with the csv module."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import gzip\n",
    "import collections\n",
    "import re\n",
    "import io\n",
    "\n",
    "import xml.etree.ElementTree as ET\n",
    "\n",
    "import requests\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "xml_path = os.path.join('download', 'drugbank.xml.gz')\n",
    "with gzip.open(xml_path) as xml_file:\n",
    "    tree = ET.parse(xml_file)\n",
    "root = tree.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ns = '{http://www.drugbank.ca}'\n",
    "inchikey_template = \"{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value\"\n",
    "inchi_template = \"{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value\"\n",
    "\n",
    "rows = list()\n",
    "for i, drug in enumerate(root):\n",
    "    row = collections.OrderedDict()\n",
    "    assert drug.tag == ns + 'drug'\n",
    "    row['type'] = drug.get('type')\n",
    "    row['drugbank_id'] = drug.findtext(ns + \"drugbank-id[@primary='true']\")\n",
    "    row['name'] = drug.findtext(ns + \"name\")\n",
    "    row['groups'] = [group.text for group in\n",
    "        drug.findall(\"{ns}groups/{ns}group\".format(ns = ns))]\n",
    "    row['atc_codes'] = [code.get('code') for code in\n",
    "        drug.findall(\"{ns}atc-codes/{ns}atc-code\".format(ns = ns))]\n",
    "    row['categories'] = [x.findtext(ns + 'category') for x in\n",
    "        drug.findall(\"{ns}categories/{ns}category\".format(ns = ns))]\n",
    "    row['inchi'] = drug.findtext(inchi_template.format(ns = ns))\n",
    "    row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))\n",
    "    rows.append(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def collapse_list_values(row):\n",
    "    for key, value in row.items():\n",
    "        if isinstance(value, list):\n",
    "            row[key] = '|'.join(value)\n",
    "    return row\n",
    "\n",
    "rows = list(map(collapse_list_values, rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drugbank_id</th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>groups</th>\n",
       "      <th>atc_codes</th>\n",
       "      <th>categories</th>\n",
       "      <th>inchikey</th>\n",
       "      <th>inchi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DB00001</td>\n",
       "      <td>Lepirudin</td>\n",
       "      <td>biotech</td>\n",
       "      <td>approved</td>\n",
       "      <td>B01AE02</td>\n",
       "      <td>Antithrombins|Fibrinolytic Agents</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DB00002</td>\n",
       "      <td>Cetuximab</td>\n",
       "      <td>biotech</td>\n",
       "      <td>approved</td>\n",
       "      <td>L01XC06</td>\n",
       "      <td>Antineoplastic Agents</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DB00003</td>\n",
       "      <td>Dornase alfa</td>\n",
       "      <td>biotech</td>\n",
       "      <td>approved</td>\n",
       "      <td>R05CB13</td>\n",
       "      <td>Enzymes</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DB00004</td>\n",
       "      <td>Denileukin diftitox</td>\n",
       "      <td>biotech</td>\n",
       "      <td>approved|investigational</td>\n",
       "      <td>L01XX29</td>\n",
       "      <td>Antineoplastic Agents</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>DB00005</td>\n",
       "      <td>Etanercept</td>\n",
       "      <td>biotech</td>\n",
       "      <td>approved|investigational</td>\n",
       "      <td>L04AB01</td>\n",
       "      <td>Immunosuppressive Agents</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  drugbank_id                 name     type                    groups  \\\n",
       "0     DB00001            Lepirudin  biotech                  approved   \n",
       "1     DB00002            Cetuximab  biotech                  approved   \n",
       "2     DB00003         Dornase alfa  biotech                  approved   \n",
       "3     DB00004  Denileukin diftitox  biotech  approved|investigational   \n",
       "4     DB00005           Etanercept  biotech  approved|investigational   \n",
       "\n",
       "  atc_codes                         categories inchikey inchi  \n",
       "0   B01AE02  Antithrombins|Fibrinolytic Agents     None  None  \n",
       "1   L01XC06              Antineoplastic Agents     None  None  \n",
       "2   R05CB13                            Enzymes     None  None  \n",
       "3   L01XX29              Antineoplastic Agents     None  None  \n",
       "4   L04AB01           Immunosuppressive Agents     None  None  "
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi']\n",
    "drugbank_df = pandas.DataFrame.from_dict(rows)[columns]\n",
    "drugbank_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drugbank_id</th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>groups</th>\n",
       "      <th>atc_codes</th>\n",
       "      <th>categories</th>\n",
       "      <th>inchikey</th>\n",
       "      <th>inchi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>DB00014</td>\n",
       "      <td>Goserelin</td>\n",
       "      <td>small molecule</td>\n",
       "      <td>approved</td>\n",
       "      <td>L02AE03</td>\n",
       "      <td></td>\n",
       "      <td>InChIKey=BLCLNMBMMGCOAS-URPVMXJPSA-N</td>\n",
       "      <td>InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>DB00035</td>\n",
       "      <td>Desmopressin</td>\n",
       "      <td>small molecule</td>\n",
       "      <td>approved</td>\n",
       "      <td>H01BA02</td>\n",
       "      <td>Antidiuretic Agents|Hemostatics|Renal Agents</td>\n",
       "      <td>InChIKey=NFLWUMRGJYTJIN-NXBWRCJVSA-N</td>\n",
       "      <td>InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>DB00050</td>\n",
       "      <td>Cetrorelix</td>\n",
       "      <td>small molecule</td>\n",
       "      <td>approved|investigational</td>\n",
       "      <td>H01CC02</td>\n",
       "      <td>Hormone Antagonists|Fertility Agents</td>\n",
       "      <td>InChIKey=SBNPWPIBESPSIF-MHWMIDJBSA-N</td>\n",
       "      <td>InChI=1S/C70H92ClN17O14/c1-39(2)31-52(61(94)82...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>DB00091</td>\n",
       "      <td>Cyclosporine</td>\n",
       "      <td>small molecule</td>\n",
       "      <td>approved|investigational</td>\n",
       "      <td>L04AD01|S01XA18</td>\n",
       "      <td>Antirheumatic Agents|Dermatologic Agents|Immun...</td>\n",
       "      <td>InChIKey=PMATZTZNYRCHOR-IMVLJIQENA-N</td>\n",
       "      <td>InChI=1/C62H111N11O12/c1-25-27-28-40(15)52(75)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88</th>\n",
       "      <td>DB00093</td>\n",
       "      <td>Felypressin</td>\n",
       "      <td>small molecule</td>\n",
       "      <td>approved</td>\n",
       "      <td></td>\n",
       "      <td>Vasoconstrictor Agents|Renal Agents</td>\n",
       "      <td>InChIKey=SFKQVVDKFKYTNA-YVGXZPIDNA-N</td>\n",
       "      <td>InChI=1/C46H65N13O11S2/c47-18-8-7-14-29(40(64)...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   drugbank_id          name            type                    groups  \\\n",
       "13     DB00014     Goserelin  small molecule                  approved   \n",
       "34     DB00035  Desmopressin  small molecule                  approved   \n",
       "48     DB00050    Cetrorelix  small molecule  approved|investigational   \n",
       "86     DB00091  Cyclosporine  small molecule  approved|investigational   \n",
       "88     DB00093   Felypressin  small molecule                  approved   \n",
       "\n",
       "          atc_codes                                         categories  \\\n",
       "13          L02AE03                                                      \n",
       "34          H01BA02       Antidiuretic Agents|Hemostatics|Renal Agents   \n",
       "48          H01CC02               Hormone Antagonists|Fertility Agents   \n",
       "86  L04AD01|S01XA18  Antirheumatic Agents|Dermatologic Agents|Immun...   \n",
       "88                                 Vasoconstrictor Agents|Renal Agents   \n",
       "\n",
       "                                inchikey  \\\n",
       "13  InChIKey=BLCLNMBMMGCOAS-URPVMXJPSA-N   \n",
       "34  InChIKey=NFLWUMRGJYTJIN-NXBWRCJVSA-N   \n",
       "48  InChIKey=SBNPWPIBESPSIF-MHWMIDJBSA-N   \n",
       "86  InChIKey=PMATZTZNYRCHOR-IMVLJIQENA-N   \n",
       "88  InChIKey=SFKQVVDKFKYTNA-YVGXZPIDNA-N   \n",
       "\n",
       "                                                inchi  \n",
       "13  InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...  \n",
       "34  InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...  \n",
       "48  InChI=1S/C70H92ClN17O14/c1-39(2)31-52(61(94)82...  \n",
       "86  InChI=1/C62H111N11O12/c1-25-27-28-40(15)52(75)...  \n",
       "88  InChI=1/C46H65N13O11S2/c47-18-8-7-14-29(40(64)...  "
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drugbank_slim_df = drugbank_df[\n",
    "    drugbank_df.groups.map(lambda x: 'approved' in x) &\n",
    "    drugbank_df.inchi.map(lambda x: x is not None) &\n",
    "    drugbank_df.type.map(lambda x: x == 'small molecule')\n",
    "]\n",
    "drugbank_slim_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# write drugbank tsv\n",
    "path = os.path.join('data', 'drugbank.tsv')\n",
    "drugbank_df.to_csv(path, sep='\\t', index=False)\n",
    "\n",
    "# write slim drugbank tsv\n",
    "path = os.path.join('data', 'drugbank-slim.tsv')\n",
    "drugbank_slim_df.to_csv(path, sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "protein_rows = list()\n",
    "for i, drug in enumerate(root):\n",
    "    drugbank_id = drug.findtext(ns + \"drugbank-id[@primary='true']\")\n",
    "    for category in ['target', 'enzyme', 'carrier', 'transporter']:\n",
    "        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))\n",
    "        for protein in proteins:\n",
    "            row = {'drugbank_id': drugbank_id, 'category': category}\n",
    "            row['organism'] = protein.findtext('{}organism'.format(ns))\n",
    "            row['known_action'] = protein.findtext('{}known-action'.format(ns))\n",
    "            actions = protein.findall('{ns}actions/{ns}action'.format(ns=ns))\n",
    "            row['actions'] = '|'.join(action.text for action in actions)\n",
    "            uniprot_ids = [polypep.text for polypep in protein.findall(\n",
    "                \"{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier\".format(ns=ns))]            \n",
    "            if len(uniprot_ids) != 1: continue\n",
    "            row['uniprot_id'] = uniprot_ids[0]\n",
    "            ref_text = protein.findtext(\"{ns}references[@format='textile']\".format(ns=ns))\n",
    "            pmids = re.findall(r'pubmed/([0-9]+)', ref_text)\n",
    "            row['pubmed_ids'] = '|'.join(pmids)\n",
    "            protein_rows.append(row)\n",
    "\n",
    "protein_df = pandas.DataFrame.from_dict(protein_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read our uniprot to entrez_gene mapping\n",
    "response = requests.get('http://git.dhimmel.com/uniprot/data/map/GeneID.tsv.gz', stream=True)\n",
    "text = io.TextIOWrapper(gzip.GzipFile(fileobj=response.raw))\n",
    "uniprot_df = pandas.read_table(text, engine='python')\n",
    "uniprot_df.rename(columns={'uniprot': 'uniprot_id', 'GeneID': 'entrez_gene_id'}, inplace=True)\n",
    "\n",
    "# merge uniprot mapping with protein_df\n",
    "entrez_df = protein_df.merge(uniprot_df, how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "columns = ['drugbank_id', 'category', 'uniprot_id', 'entrez_gene_id', 'organism',\n",
    "           'known_action', 'actions', 'pubmed_ids']\n",
    "entrez_df = entrez_df[columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "path = os.path.join('data', 'proteins.tsv')\n",
    "entrez_df.to_csv(path, sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}