{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert DrugCentral relationships to Rephetio identifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import urllib\n",
    "import json\n",
    "\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read DO Slim - the disease subset used for rephetio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>slim_id</th>\n",
       "      <th>slim_name</th>\n",
       "      <th>subsumed_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DOID:0050156</td>\n",
       "      <td>idiopathic pulmonary fibrosis</td>\n",
       "      <td>DOID:0050156</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DOID:0050425</td>\n",
       "      <td>restless legs syndrome</td>\n",
       "      <td>DOID:0050425</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        slim_id                      slim_name   subsumed_id\n",
       "0  DOID:0050156  idiopathic pulmonary fibrosis  DOID:0050156\n",
       "1  DOID:0050425         restless legs syndrome  DOID:0050425"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url = 'https://github.com/dhimmel/disease-ontology/raw/5cb93c38568536222b0a14fbcb7fb644a348931d/data/slim-terms-prop.tsv'\n",
    "do_slim = pandas.read_table(url)\n",
    "do_slim = do_slim[['slim_id', 'slim_name', 'subsumed_id']]\n",
    "do_slim.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read UniProt to Entrez Gene mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uniprot</th>\n",
       "      <th>GeneID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A0A010PZJ8</td>\n",
       "      <td>19039206</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A0A010PZK3</td>\n",
       "      <td>19039211</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      uniprot    GeneID\n",
       "0  A0A010PZJ8  19039206\n",
       "1  A0A010PZK3  19039211"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz'\n",
    "entrez_map_df = pandas.read_table(url, compression='gzip')\n",
    "entrez_map_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read DrugBank Slim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drugbank_id</th>\n",
       "      <th>drugbank_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DB00014</td>\n",
       "      <td>Goserelin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DB00035</td>\n",
       "      <td>Desmopressin</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  drugbank_id drugbank_name\n",
       "0     DB00014     Goserelin\n",
       "1     DB00035  Desmopressin"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url = 'https://github.com/dhimmel/drugbank/raw/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank-slim.tsv'\n",
    "drugbank_df = pandas.read_table(url)\n",
    "drugbank_df = drugbank_df[['drugbank_id', 'name']]\n",
    "drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})\n",
    "drugbank_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1552"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(drugbank_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read identifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>DRUG_ID</th>\n",
       "      <th>drugbank_id</th>\n",
       "      <th>drugbank_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1327</td>\n",
       "      <td>DB00014</td>\n",
       "      <td>Goserelin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>817</td>\n",
       "      <td>DB00035</td>\n",
       "      <td>Desmopressin</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   DRUG_ID drugbank_id drugbank_name\n",
       "0     1327     DB00014     Goserelin\n",
       "1      817     DB00035  Desmopressin"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path = 'drugtarget/identifiers.tsv'\n",
    "id_df = pandas.read_table(path)\n",
    "id_df = id_df.query(\"ID_TYPE == 'DRUGBANK_ID'\")[['DRUG_ID', 'IDENTIFIER']]\n",
    "id_df = id_df.rename(columns={'IDENTIFIER': 'drugbank_id'})\n",
    "drugbank_df = id_df.merge(drugbank_df)\n",
    "drugbank_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1634"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(drugbank_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert drug targets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GeneID</th>\n",
       "      <th>drugbank_id</th>\n",
       "      <th>drugbank_name</th>\n",
       "      <th>TARGET_NAME</th>\n",
       "      <th>TARGET_FAMILY</th>\n",
       "      <th>SOURCE</th>\n",
       "      <th>REFERENCE</th>\n",
       "      <th>action</th>\n",
       "      <th>pubmed_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8233868</td>\n",
       "      <td>DB00431</td>\n",
       "      <td>Lindane</td>\n",
       "      <td>GABA-A receptor</td>\n",
       "      <td>Ion channel</td>\n",
       "      <td>CHEMBL</td>\n",
       "      <td>https://www.ebi.ac.uk/chembl/compound/inspect/...</td>\n",
       "      <td>negative allosteric modulator</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8232849</td>\n",
       "      <td>DB08823</td>\n",
       "      <td>Spinosad</td>\n",
       "      <td>Nicotinic acetylcholine receptor</td>\n",
       "      <td>Ion channel</td>\n",
       "      <td>CHEMBL</td>\n",
       "      <td>https://www.ebi.ac.uk/chembl/compound/inspect/...</td>\n",
       "      <td>agonist</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    GeneID drugbank_id drugbank_name                       TARGET_NAME  \\\n",
       "0  8233868     DB00431       Lindane                   GABA-A receptor   \n",
       "1  8232849     DB08823      Spinosad  Nicotinic acetylcholine receptor   \n",
       "\n",
       "  TARGET_FAMILY  SOURCE                                          REFERENCE  \\\n",
       "0   Ion channel  CHEMBL  https://www.ebi.ac.uk/chembl/compound/inspect/...   \n",
       "1   Ion channel  CHEMBL  https://www.ebi.ac.uk/chembl/compound/inspect/...   \n",
       "\n",
       "                          action pubmed_id  \n",
       "0  negative allosteric modulator       NaN  \n",
       "1                        agonist       NaN  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path = 'drugtarget/drug_target.tsv'\n",
    "target_df = pandas.read_table(path)\n",
    "target_df = drugbank_df.merge(target_df)\n",
    "target_df = target_df[['drugbank_id', 'drugbank_name', 'TARGET_NAME', 'TARGET_FAMILY', 'UNIPROT', 'ACTION_TYPE', 'SOURCE', 'REFERENCE']]\n",
    "\n",
    "# Split multi-protein targets into many rows\n",
    "s = target_df.UNIPROT.str.split('|').apply(pandas.Series, 1).stack()\n",
    "s.index = s.index.droplevel(-1)\n",
    "s.name ='uniprot'\n",
    "del target_df['UNIPROT']\n",
    "target_df = target_df.join(s)\n",
    "\n",
    "target_df = entrez_map_df.merge(target_df)\n",
    "del target_df['uniprot']\n",
    "\n",
    "target_df['action'] = target_df['ACTION_TYPE'].str.lower()\n",
    "del target_df['ACTION_TYPE']\n",
    "\n",
    "target_df['pubmed_id'] = target_df.REFERENCE.str.extract('pubmed/([0-9]+)')\n",
    "\n",
    "target_df = target_df.drop_duplicates()\n",
    "target_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DrugCentral (ChEMBL)        2922\n",
       "DrugCentral (literature)     182\n",
       "DrugCentral (label)           89\n",
       "DrugCentral (IUPHAR)          56\n",
       "DrugCentral (KEGG DRUG)       25\n",
       "Name: SOURCE, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_source_map = {\n",
    "    'CHEMBL': 'DrugCentral (ChEMBL)',\n",
    "    'SCIENTIFIC LITERATURE': 'DrugCentral (literature)',\n",
    "    'DRUG LABEL': 'DrugCentral (label)',\n",
    "    'IUPHAR': 'DrugCentral (IUPHAR)',\n",
    "    'KEGG DRUG': 'DrugCentral (KEGG DRUG)',\n",
    "}\n",
    "target_df.SOURCE = target_df.SOURCE.map(target_source_map)\n",
    "target_df.SOURCE.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def condense_targets(df):\n",
    "    \"\"\"Condense drug-target relationships.\"\"\"\n",
    "    row = pandas.Series()\n",
    "    row['pubmed_ids'] = '|'.join(sorted(df.pubmed_id.dropna().unique()))\n",
    "    row['sources'] = '|'.join(sorted(df.SOURCE.unique()))\n",
    "    row['actions'] = '|'.join(sorted(df.action.unique()))\n",
    "    row['urls'] = '|'.join(sorted(url for url in df.REFERENCE.unique() if not 'pubmed' in url))\n",
    "    return row\n",
    "    \n",
    "target_df = target_df.groupby(['GeneID', 'drugbank_id', 'drugbank_name']).apply(condense_targets).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "target_df.to_csv('rephetio/targets.tsv', sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read and process DrugCentral Indications"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "path = 'drugtarget/drug_indication.tsv'\n",
    "indication_df = pandas.read_table(path, dtype={'SNOMEDCT_CUI': str})\n",
    "indication_df = indication_df.rename(columns={'DOID': 'subsumed_id'})\n",
    "indication_df = do_slim.merge(drugbank_df.merge(indication_df))\n",
    "del indication_df['DRUG_ID']\n",
    "indication_df = indication_df[['slim_id', 'drugbank_id', 'slim_name', 'drugbank_name']]\n",
    "indication_df = indication_df.rename(columns={'slim_id': 'doid_id', 'slim_name': 'disease', 'drugbank_name': 'drug'})\n",
    "indication_df = indication_df.sort_values(['disease', 'drug'])\n",
    "indication_df = indication_df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Compare to PharmacotherapyDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid_id</th>\n",
       "      <th>drugbank_id</th>\n",
       "      <th>disease</th>\n",
       "      <th>drug</th>\n",
       "      <th>category</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DOID:10652</td>\n",
       "      <td>DB00843</td>\n",
       "      <td>Alzheimer's disease</td>\n",
       "      <td>Donepezil</td>\n",
       "      <td>DM</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DOID:10652</td>\n",
       "      <td>DB00674</td>\n",
       "      <td>Alzheimer's disease</td>\n",
       "      <td>Galantamine</td>\n",
       "      <td>DM</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      doid_id drugbank_id              disease         drug category\n",
       "0  DOID:10652     DB00843  Alzheimer's disease    Donepezil       DM\n",
       "1  DOID:10652     DB00674  Alzheimer's disease  Galantamine       DM"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url = 'https://github.com/dhimmel/indications/raw/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv'\n",
    "phcoth_df = pandas.read_table(url)\n",
    "phcoth_df = phcoth_df[['doid_id', 'drugbank_id', 'category']]\n",
    "indication_df = indication_df.merge(phcoth_df, how='left')\n",
    "indication_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "671"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(indication_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DM     359\n",
       "NaN    210\n",
       "SYM     77\n",
       "NOT     25\n",
       "Name: category, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indication_df.category.value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "indication_df.to_csv('rephetio/indications.tsv', sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pharmacologic class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drugbank_id</th>\n",
       "      <th>drugbank_name</th>\n",
       "      <th>class_id</th>\n",
       "      <th>class_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DB00014</td>\n",
       "      <td>Goserelin</td>\n",
       "      <td>N0000175655</td>\n",
       "      <td>Gonadotropin Releasing Hormone Receptor Agonist</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DB00014</td>\n",
       "      <td>Goserelin</td>\n",
       "      <td>N0000175654</td>\n",
       "      <td>Gonadotropin Releasing Hormone Receptor Agonists</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  drugbank_id drugbank_name     class_id  \\\n",
       "0     DB00014     Goserelin  N0000175655   \n",
       "1     DB00014     Goserelin  N0000175654   \n",
       "\n",
       "                                         class_name  \n",
       "0   Gonadotropin Releasing Hormone Receptor Agonist  \n",
       "1  Gonadotropin Releasing Hormone Receptor Agonists  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path = 'drugtarget/pharm_class.tsv'\n",
    "class_df = pandas.read_table(path)\n",
    "class_df = drugbank_df.merge(class_df)\n",
    "classes_df = class_df[['TYPE', 'CLASS_SOURCE_ID', 'CLASS', 'SOURCE']].drop_duplicates()\n",
    "class_df = class_df[['drugbank_id', 'drugbank_name', 'CLASS_SOURCE_ID', 'CLASS']]\n",
    "class_df = class_df.rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name'})\n",
    "class_df = class_df.drop_duplicates()\n",
    "class_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1262"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pharmacologic mappings\n",
    "len(classes_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10959"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class to Drug mappings\n",
    "len(class_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>class_id</th>\n",
       "      <th>class_name</th>\n",
       "      <th>class_source</th>\n",
       "      <th>class_type</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>CHEBI:21241</td>\n",
       "      <td>vitamin C</td>\n",
       "      <td>CHEBI</td>\n",
       "      <td>Application</td>\n",
       "      <td>http://identifiers.org/chebi/CHEBI%3A21241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4385</th>\n",
       "      <td>CHEBI:22153</td>\n",
       "      <td>acaricide</td>\n",
       "      <td>CHEBI</td>\n",
       "      <td>Application</td>\n",
       "      <td>http://identifiers.org/chebi/CHEBI%3A22153</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         class_id class_name class_source   class_type  \\\n",
       "73    CHEBI:21241  vitamin C        CHEBI  Application   \n",
       "4385  CHEBI:22153  acaricide        CHEBI  Application   \n",
       "\n",
       "                                             url  \n",
       "73    http://identifiers.org/chebi/CHEBI%3A21241  \n",
       "4385  http://identifiers.org/chebi/CHEBI%3A22153  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class_type_map = {\n",
    "    'MoA': 'Mechanism of Action',\n",
    "    'PE': 'Physiologic Effect',\n",
    "    'CS': 'Chemical Structure',\n",
    "    'EPC': 'FDA Established Pharmacologic Class',\n",
    "    'PA': 'Pharmacological Action',\n",
    "    'has role': 'Application',\n",
    "    'Chemical/Ingredient': 'Chemical/Ingredient',\n",
    "}\n",
    "\n",
    "def get_class_url(class_source, class_id):\n",
    "    \"\"\"Create URLs for pharmacological classes based on their source\"\"\"\n",
    "    class_id = urllib.parse.quote(class_id)\n",
    "    if class_source == 'CHEBI':\n",
    "        return 'http://identifiers.org/chebi/{}'.format(class_id)\n",
    "    if class_source == 'MeSH':\n",
    "        return 'http://identifiers.org/mesh/{}'.format(class_id)\n",
    "    if class_source == 'FDA':\n",
    "        #return 'https://rxnav.nlm.nih.gov/REST/Ndfrt/id?idType=NUI&idString={}'.format(class_id)\n",
    "        # Use bioportal link until something better arises\n",
    "        return 'http://purl.bioontology.org/ontology/NDFRT/{}'.format(class_id)\n",
    "\n",
    "classes_df['class_type'] = classes_df.TYPE.map(class_type_map)\n",
    "del classes_df['TYPE']\n",
    "classes_df = classes_df.sort_values(['class_type', 'CLASS_SOURCE_ID'])\n",
    "classes_df = classes_df.rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name', 'SOURCE': 'class_source'})\n",
    "classes_df['url'] = classes_df.apply(lambda x: get_class_url(x.class_source, x.class_id), axis='columns')\n",
    "classes_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class_df.to_csv('rephetio/drug-to-class.tsv', sep='\\t', index=False)\n",
    "classes_df.to_csv('rephetio/classes.tsv', sep='\\t', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}