{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyzing the SIDER 4.1 data\n", "\n", "+ http://sideeffects.embl.de/\n", "+ http://thinklab.com/d/30#4" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import csv\n", "import gzip\n", "import collections" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2016-02-03 14:13:11 URL:http://sideeffects.embl.de/media/download//README [2270/2270] -> \"download/README\" [1]\r\n" ] } ], "source": [ "# Download SIDER data\n", "# Each listed file is fetched with wget using download/ as the directory prefix.\n", "base_url = 'http://sideeffects.embl.de/media/download/'\n", "filenames = [\n", "    'README',\n", "    'meddra_all_indications.tsv.gz',\n", "    'meddra_all_se.tsv.gz',\n", "    'meddra_freq.tsv.gz',\n", "]\n", "for filename in filenames:\n", "    # base_url already ends with '/', so the filename is appended directly;\n", "    # joining with an extra '/' requested '.../download//README'.\n", "    ! wget --no-verbose --timestamping --directory-prefix download {base_url}{filename}\n", "\n", "! 
mv download/README download/README.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STITCH to DrugBank mapping utilities" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def stitch_flat_to_pubchem(cid):\n", "    \"\"\"\n", "    Convert a flat STITCH compound ID such as 'CID100047725' to an integer\n", "    PubChem ID by stripping the 'CID' prefix and subtracting 100000000.\n", "    \"\"\"\n", "    assert cid.startswith('CID')\n", "    # Integer offset: the float literal 1e8 made the result a float.\n", "    return int(cid[3:]) - 100000000\n", "\n", "def stitch_stereo_to_pubchem(cid):\n", "    \"\"\"\n", "    Convert a stereo STITCH compound ID such as 'CID000047725' to an integer\n", "    PubChem ID by stripping the 'CID' prefix.\n", "    \"\"\"\n", "    assert cid.startswith('CID')\n", "    return int(cid[3:])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Both tables are read from the same fixed commit of dhimmel/drugbank.\n", "# Read DrugBank terms\n", "url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv'\n", "drugbank_df = pandas.read_table(url)[['drugbank_id', 'name']].rename(columns={'name': 'drugbank_name'})\n", "\n", "# Pubchem to DrugBank mapping\n", "url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'\n", "drugbank_map_df = pandas.read_table(url)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## meddra_freq.tsv.gz" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stitch_id_flatstitch_id_sterioumls_cui_from_labelplacebofrequencyloweruppermeddra_typeumls_cui_from_meddraside_effect_name
0CID100000085CID000010917C0000737NaN21%0.210.21LLTC0000737Abdominal pain
1CID100000085CID000010917C0000737NaN21%0.210.21PTC0000737Abdominal pain
\n", "
" ], "text/plain": [ " stitch_id_flat stitch_id_sterio umls_cui_from_label placebo frequency \\\n", "0 CID100000085 CID000010917 C0000737 NaN 21% \n", "1 CID100000085 CID000010917 C0000737 NaN 21% \n", "\n", " lower upper meddra_type umls_cui_from_meddra side_effect_name \n", "0 0.21 0.21 LLT C0000737 Abdominal pain \n", "1 0.21 0.21 PT C0000737 Abdominal pain " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Column labels for the frequency table. They are supplied through names=,\n", "# so the first line of the file is read as data rather than as a header.\n", "columns = [\n", "    'stitch_id_flat',\n", "    'stitch_id_sterio',\n", "    'umls_cui_from_label',\n", "    'placebo',\n", "    'frequency',\n", "    'lower',\n", "    'upper',\n", "    'meddra_type',\n", "    'umls_cui_from_meddra',\n", "    'side_effect_name',\n", "]\n", "freq_df = pandas.read_csv(\n", "    'download/meddra_freq.tsv.gz',\n", "    sep='\\t',\n", "    names=columns,\n", ")\n", "freq_df.head(n=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## meddra_all_se.tsv.gz" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugbank_idpubchem_idstitch_id_flatstitch_id_sterioumls_cui_from_labelmeddra_typeumls_cui_from_meddraside_effect_name
0DB0001447725CID100047725CID000047725C0000737LLTC0000737Abdominal pain
1DB0001447725CID100047725CID000047725C0000737PTC0687713Gastrointestinal pain
\n", "
" ], "text/plain": [ " drugbank_id pubchem_id stitch_id_flat stitch_id_sterio umls_cui_from_label \\\n", "0 DB00014 47725 CID100047725 CID000047725 C0000737 \n", "1 DB00014 47725 CID100047725 CID000047725 C0000737 \n", "\n", " meddra_type umls_cui_from_meddra side_effect_name \n", "0 LLT C0000737 Abdominal pain \n", "1 PT C0687713 Gastrointestinal pain " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = [\n", "    'stitch_id_flat',\n", "    'stitch_id_sterio',\n", "    'umls_cui_from_label',\n", "    'meddra_type',\n", "    'umls_cui_from_meddra',\n", "    'side_effect_name',\n", "]\n", "# Read the side effect table and derive pubchem_id from the stereo STITCH ID.\n", "se_df = pandas.read_table('download/meddra_all_se.tsv.gz', names=columns).assign(\n", "    pubchem_id=lambda df: df['stitch_id_sterio'].map(stitch_stereo_to_pubchem)\n", ")\n", "# No on= is given, so merge() joins on the column names the two frames share.\n", "se_df = drugbank_map_df.merge(se_df)\n", "se_df.head(n=2)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "153663" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep one row per (drugbank_id, umls_cui_from_meddra) pair, drop rows with\n", "# missing values, attach the DrugBank name and sort for stable output.\n", "se_df = (\n", "    se_df[['drugbank_id', 'umls_cui_from_meddra', 'side_effect_name']]\n", "    .dropna()\n", "    .drop_duplicates(['drugbank_id', 'umls_cui_from_meddra'])\n", ")\n", "se_df = drugbank_df.merge(se_df).sort_values(['drugbank_name', 'side_effect_name'])\n", "se_df.shape[0]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Create a reference of side effect IDs and Names\n", "se_terms_df = se_df[['umls_cui_from_meddra', 'side_effect_name']].drop_duplicates()\n", "# Each side effect name may appear on only one row of the reference table.\n", "assert not se_terms_df['side_effect_name'].duplicated().any()\n", "se_terms_df = se_terms_df.sort_values(by='side_effect_name')\n", "se_terms_df.to_csv('data/side-effect-terms.tsv', index=False, sep='\\t')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugbank_iddrugbank_nameumls_cui_from_meddraside_effect_name
80494DB00907CocaineC0085631Agitation
80495DB00907CocaineC0233571Excitement
80486DB00907CocaineC0014549Grand mal convulsion
80487DB00907CocaineC0020517Hypersensitivity
80488DB00907CocaineC0026961Mydriasis
80489DB00907CocaineC0027769Nervousness
80496DB00907CocaineC1145670Respiratory failure
80497DB00907CocaineC1325847Sensitisation
80490DB00907CocaineC0233494Tension
80491DB00907CocaineC0040822Tremor
80492DB00907CocaineC0041582Ulcer
80493DB00907CocaineC0042963Vomiting
\n", "
" ], "text/plain": [ " drugbank_id drugbank_name umls_cui_from_meddra side_effect_name\n", "80494 DB00907 Cocaine C0085631 Agitation\n", "80495 DB00907 Cocaine C0233571 Excitement\n", "80486 DB00907 Cocaine C0014549 Grand mal convulsion\n", "80487 DB00907 Cocaine C0020517 Hypersensitivity\n", "80488 DB00907 Cocaine C0026961 Mydriasis\n", "80489 DB00907 Cocaine C0027769 Nervousness\n", "80496 DB00907 Cocaine C1145670 Respiratory failure\n", "80497 DB00907 Cocaine C1325847 Sensitisation\n", "80490 DB00907 Cocaine C0233494 Tension\n", "80491 DB00907 Cocaine C0040822 Tremor\n", "80492 DB00907 Cocaine C0041582 Ulcer\n", "80493 DB00907 Cocaine C0042963 Vomiting" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Side effects of cocaine (rows whose drugbank_id is DB00907)\n", "se_df.query(\"drugbank_id == 'DB00907'\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1223" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of distinct DrugBank drugs in the side effect table\n", "se_df['drugbank_id'].nunique()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "5734" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of distinct UMLS side effect identifiers\n", "se_df['umls_cui_from_meddra'].nunique()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Save side effects as a tab-separated file without the index column\n", "se_df.to_csv('data/side-effects.tsv', index=False, sep='\\t')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## meddra_all_indications.tsv.gz" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "columns = [\n", "    'stitch_id_flat',\n", "    'umls_cui_from_label',\n", "    'method',\n", "    'concept_name',\n", "    'meddra_type',\n", "    'umls_cui_from_meddra',\n", "    'meddra_name',\n", "]\n", "# Read the indication table and derive pubchem_id from the flat STITCH ID.\n", "indication_df = pandas.read_table('download/meddra_all_indications.tsv.gz', names=columns)\n", "indication_df['pubchem_id'] = indication_df['stitch_id_flat'].map(stitch_flat_to_pubchem)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drugbank_iddrugbank_namepubchem_idstitch_id_flatumls_cui_from_labelmethodconcept_namemeddra_typeumls_cui_from_meddrameddra_name
1DB00014Goserelin47725CID100047725C0002871text_mentionAnemiaPTC0002871Anaemia
3DB00014Goserelin47725CID100047725C0006142NLP_indicationMalignant neoplasm of breastPTC0006142Breast cancer
\n", "
" ], "text/plain": [ " drugbank_id drugbank_name pubchem_id stitch_id_flat umls_cui_from_label \\\n", "1 DB00014 Goserelin 47725 CID100047725 C0002871 \n", "3 DB00014 Goserelin 47725 CID100047725 C0006142 \n", "\n", " method concept_name meddra_type \\\n", "1 text_mention Anemia PT \n", "3 NLP_indication Malignant neoplasm of breast PT \n", "\n", " umls_cui_from_meddra meddra_name \n", "1 C0002871 Anaemia \n", "3 C0006142 Breast cancer " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First join the PubChem-to-DrugBank mapping onto the indications, then join\n", "# the DrugBank names; both merges use the column names the frames share.\n", "indication_df = drugbank_map_df.merge(indication_df)\n", "indication_df = drugbank_df.merge(indication_df)\n", "# Keep only the rows whose meddra_type is 'PT'.\n", "indication_df = indication_df.query(\"meddra_type == 'PT'\")\n", "indication_df.head(n=2)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['Baclofen',\n", " 'Betamethasone',\n", " 'Carbamazepine',\n", " 'Triamcinolone',\n", " 'Prednisone',\n", " 'Tizanidine',\n", " 'Hydrocortisone',\n", " 'Prednisolone',\n", " 'Methylprednisolone',\n", " 'Mitoxantrone',\n", " 'Dantrolene',\n", " 'Dexamethasone',\n", " 'FTY 720',\n", " 'Dalfampridine',\n", " '(11alpha,14beta)-11,17,21-trihydroxypregn-4-ene-3,20-dione',\n", " 'Fingolimod']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Multiple Sclerosis indications: drug names for UMLS concept C0026769\n", "indication_df.query(\"umls_cui_from_meddra == 'C0026769'\")['drugbank_name'].tolist()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Save indications as a tab-separated file without the index column\n", "indication_df.to_csv('data/indications.tsv', index=False, sep='\\t')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 
4, "nbformat_minor": 0 }