{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analyzing the SIDER 4.1 data\n",
"\n",
"+ http://sideeffects.embl.de/\n",
"+ http://thinklab.com/d/30#4"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import csv\n",
"import gzip\n",
"import collections"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2016-02-03 14:13:11 URL:http://sideeffects.embl.de/media/download//README [2270/2270] -> \"download/README\" [1]\r\n"
]
}
],
"source": [
"# Download the SIDER release files into ./download.\n",
"# wget --timestamping only re-fetches a file when the remote copy is newer than the local one.\n",
"base_url = 'http://sideeffects.embl.de/media/download/'\n",
"filenames = [\n",
"    'README',\n",
"    'meddra_all_indications.tsv.gz',\n",
"    'meddra_all_se.tsv.gz',\n",
"    'meddra_freq.tsv.gz',\n",
"]\n",
"# base_url already ends with '/', so the filename is appended directly;\n",
"# joining with an extra '/' requested '.../download//README'.\n",
"for filename in filenames:\n",
"    ! wget --no-verbose --timestamping --directory-prefix download {base_url}{filename}\n",
"\n",
"# Give the README a .txt extension\n",
"! mv download/README download/README.txt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STITCH to DrugBank mapping utilities"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def stitch_flat_to_pubchem(cid):\n",
"    \"\"\"\n",
"    Convert a flat STITCH identifier string into an integer PubChem ID.\n",
"\n",
"    The identifier must start with 'CID' (checked with an assert). The digits\n",
"    after the prefix carry an offset of 100000000, which is subtracted, so\n",
"    'CID100047725' becomes 47725.\n",
"\n",
"    The offset is an integer literal on purpose: subtracting the float 1e8\n",
"    would turn every identifier into a float (47725.0).\n",
"    \"\"\"\n",
"    assert cid.startswith('CID')\n",
"    return int(cid[3:]) - 100000000\n",
"\n",
"def stitch_stereo_to_pubchem(cid):\n",
"    \"\"\"\n",
"    Parse a stereo STITCH identifier such as 'CID000047725' into an int.\n",
"\n",
"    The leading 'CID' is required (checked with an assert); the remainder is\n",
"    converted with int(), which drops the zero padding.\n",
"    \"\"\"\n",
"    prefix = 'CID'\n",
"    assert cid.startswith(prefix)\n",
"    return int(cid[len(prefix):])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# DrugBank compounds: keep only the identifier and the name (renamed to drugbank_name)\n",
"url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv'\n",
"drugbank_df = pandas.read_table(url)\n",
"drugbank_df = drugbank_df[['drugbank_id', 'name']]\n",
"drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})\n",
"\n",
"# PubChem to DrugBank mapping table, read from the same pinned commit\n",
"url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'\n",
"drugbank_map_df = pandas.read_table(url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## meddra_freq.tsv.gz"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>stitch_id_flat</th>\n",
"      <th>stitch_id_sterio</th>\n",
"      <th>umls_cui_from_label</th>\n",
"      <th>placebo</th>\n",
"      <th>frequency</th>\n",
"      <th>lower</th>\n",
"      <th>upper</th>\n",
"      <th>meddra_type</th>\n",
"      <th>umls_cui_from_meddra</th>\n",
"      <th>side_effect_name</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>CID100000085</td>\n",
"      <td>CID000010917</td>\n",
"      <td>C0000737</td>\n",
"      <td>NaN</td>\n",
"      <td>21%</td>\n",
"      <td>0.21</td>\n",
"      <td>0.21</td>\n",
"      <td>LLT</td>\n",
"      <td>C0000737</td>\n",
"      <td>Abdominal pain</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>CID100000085</td>\n",
"      <td>CID000010917</td>\n",
"      <td>C0000737</td>\n",
"      <td>NaN</td>\n",
"      <td>21%</td>\n",
"      <td>0.21</td>\n",
"      <td>0.21</td>\n",
"      <td>PT</td>\n",
"      <td>C0000737</td>\n",
"      <td>Abdominal pain</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" stitch_id_flat stitch_id_sterio umls_cui_from_label placebo frequency \\\n",
"0 CID100000085 CID000010917 C0000737 NaN 21% \n",
"1 CID100000085 CID000010917 C0000737 NaN 21% \n",
"\n",
" lower upper meddra_type umls_cui_from_meddra side_effect_name \n",
"0 0.21 0.21 LLT C0000737 Abdominal pain \n",
"1 0.21 0.21 PT C0000737 Abdominal pain "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column names for meddra_freq.tsv.gz; the file has no header row, so names are supplied\n",
"columns = [\n",
"    'stitch_id_flat', 'stitch_id_sterio', 'umls_cui_from_label',\n",
"    'placebo', 'frequency', 'lower', 'upper',\n",
"    'meddra_type', 'umls_cui_from_meddra', 'side_effect_name',\n",
"]\n",
"freq_df = pandas.read_table(\n",
"    'download/meddra_freq.tsv.gz',\n",
"    names=columns,\n",
")\n",
"# Preview the first two rows\n",
"freq_df.head(n=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## meddra_all_se.tsv.gz"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>drugbank_id</th>\n",
"      <th>pubchem_id</th>\n",
"      <th>stitch_id_flat</th>\n",
"      <th>stitch_id_sterio</th>\n",
"      <th>umls_cui_from_label</th>\n",
"      <th>meddra_type</th>\n",
"      <th>umls_cui_from_meddra</th>\n",
"      <th>side_effect_name</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>DB00014</td>\n",
"      <td>47725</td>\n",
"      <td>CID100047725</td>\n",
"      <td>CID000047725</td>\n",
"      <td>C0000737</td>\n",
"      <td>LLT</td>\n",
"      <td>C0000737</td>\n",
"      <td>Abdominal pain</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>DB00014</td>\n",
"      <td>47725</td>\n",
"      <td>CID100047725</td>\n",
"      <td>CID000047725</td>\n",
"      <td>C0000737</td>\n",
"      <td>PT</td>\n",
"      <td>C0687713</td>\n",
"      <td>Gastrointestinal pain</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" drugbank_id pubchem_id stitch_id_flat stitch_id_sterio umls_cui_from_label \\\n",
"0 DB00014 47725 CID100047725 CID000047725 C0000737 \n",
"1 DB00014 47725 CID100047725 CID000047725 C0000737 \n",
"\n",
" meddra_type umls_cui_from_meddra side_effect_name \n",
"0 LLT C0000737 Abdominal pain \n",
"1 PT C0687713 Gastrointestinal pain "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column names for meddra_all_se.tsv.gz (supplied because names= is used when reading)\n",
"columns = [\n",
"    'stitch_id_flat', 'stitch_id_sterio', 'umls_cui_from_label',\n",
"    'meddra_type', 'umls_cui_from_meddra', 'side_effect_name',\n",
"]\n",
"se_df = pandas.read_table('download/meddra_all_se.tsv.gz', names=columns)\n",
"# Derive a PubChem ID from the stereo STITCH ID and append it as the last column\n",
"se_df = se_df.assign(pubchem_id=se_df['stitch_id_sterio'].map(stitch_stereo_to_pubchem))\n",
"# Attach DrugBank IDs; merge() joins on the columns both frames share\n",
"se_df = drugbank_map_df.merge(se_df)\n",
"se_df.head(n=2)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"153663"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reduce to one row per (drug, side effect concept) pair, dropping rows with missing values\n",
"se_df = (\n",
"    se_df[['drugbank_id', 'umls_cui_from_meddra', 'side_effect_name']]\n",
"    .dropna()\n",
"    .drop_duplicates(subset=['drugbank_id', 'umls_cui_from_meddra'])\n",
")\n",
"# Add DrugBank names and order the rows by drug name, then side effect name\n",
"se_df = drugbank_df.merge(se_df).sort_values(['drugbank_name', 'side_effect_name'])\n",
"len(se_df)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Reference table of side effect IDs and names, one row per distinct pair\n",
"se_terms_df = se_df.loc[:, ['umls_cui_from_meddra', 'side_effect_name']].drop_duplicates()\n",
"# Every side effect name must occur only once in the reference table\n",
"assert not se_terms_df.side_effect_name.duplicated().any()\n",
"se_terms_df = se_terms_df.sort_values('side_effect_name')\n",
"se_terms_df.to_csv('data/side-effect-terms.tsv', index=False, sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>drugbank_id</th>\n",
"      <th>drugbank_name</th>\n",
"      <th>umls_cui_from_meddra</th>\n",
"      <th>side_effect_name</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>80494</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0085631</td>\n",
"      <td>Agitation</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80495</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0233571</td>\n",
"      <td>Excitement</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80486</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0014549</td>\n",
"      <td>Grand mal convulsion</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80487</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0020517</td>\n",
"      <td>Hypersensitivity</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80488</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0026961</td>\n",
"      <td>Mydriasis</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80489</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0027769</td>\n",
"      <td>Nervousness</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80496</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C1145670</td>\n",
"      <td>Respiratory failure</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80497</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C1325847</td>\n",
"      <td>Sensitisation</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80490</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0233494</td>\n",
"      <td>Tension</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80491</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0040822</td>\n",
"      <td>Tremor</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80492</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0041582</td>\n",
"      <td>Ulcer</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>80493</th>\n",
"      <td>DB00907</td>\n",
"      <td>Cocaine</td>\n",
"      <td>C0042963</td>\n",
"      <td>Vomiting</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" drugbank_id drugbank_name umls_cui_from_meddra side_effect_name\n",
"80494 DB00907 Cocaine C0085631 Agitation\n",
"80495 DB00907 Cocaine C0233571 Excitement\n",
"80486 DB00907 Cocaine C0014549 Grand mal convulsion\n",
"80487 DB00907 Cocaine C0020517 Hypersensitivity\n",
"80488 DB00907 Cocaine C0026961 Mydriasis\n",
"80489 DB00907 Cocaine C0027769 Nervousness\n",
"80496 DB00907 Cocaine C1145670 Respiratory failure\n",
"80497 DB00907 Cocaine C1325847 Sensitisation\n",
"80490 DB00907 Cocaine C0233494 Tension\n",
"80491 DB00907 Cocaine C0040822 Tremor\n",
"80492 DB00907 Cocaine C0041582 Ulcer\n",
"80493 DB00907 Cocaine C0042963 Vomiting"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Side effects recorded for cocaine (DrugBank ID DB00907)\n",
"se_df[se_df.drugbank_id == 'DB00907']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1223"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count the distinct DrugBank IDs in the side effect table\n",
"se_df['drugbank_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"5734"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count the distinct UMLS side effect concepts in the side effect table\n",
"se_df['umls_cui_from_meddra'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Write the drug/side-effect table as a tab-separated file without the index\n",
"se_df.to_csv('data/side-effects.tsv', index=False, sep='\\t')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## meddra_all_indications.tsv.gz"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Column names for meddra_all_indications.tsv.gz (supplied because names= is used when reading)\n",
"columns = [\n",
"    'stitch_id_flat', 'umls_cui_from_label', 'method', 'concept_name',\n",
"    'meddra_type', 'umls_cui_from_meddra', 'meddra_name',\n",
"]\n",
"indication_df = pandas.read_table('download/meddra_all_indications.tsv.gz', names=columns)\n",
"# Only a flat STITCH ID column is named for this file, so the PubChem ID is derived from it\n",
"indication_df = indication_df.assign(pubchem_id=indication_df['stitch_id_flat'].map(stitch_flat_to_pubchem))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>drugbank_id</th>\n",
"      <th>drugbank_name</th>\n",
"      <th>pubchem_id</th>\n",
"      <th>stitch_id_flat</th>\n",
"      <th>umls_cui_from_label</th>\n",
"      <th>method</th>\n",
"      <th>concept_name</th>\n",
"      <th>meddra_type</th>\n",
"      <th>umls_cui_from_meddra</th>\n",
"      <th>meddra_name</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>DB00014</td>\n",
"      <td>Goserelin</td>\n",
"      <td>47725</td>\n",
"      <td>CID100047725</td>\n",
"      <td>C0002871</td>\n",
"      <td>text_mention</td>\n",
"      <td>Anemia</td>\n",
"      <td>PT</td>\n",
"      <td>C0002871</td>\n",
"      <td>Anaemia</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>DB00014</td>\n",
"      <td>Goserelin</td>\n",
"      <td>47725</td>\n",
"      <td>CID100047725</td>\n",
"      <td>C0006142</td>\n",
"      <td>NLP_indication</td>\n",
"      <td>Malignant neoplasm of breast</td>\n",
"      <td>PT</td>\n",
"      <td>C0006142</td>\n",
"      <td>Breast cancer</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" drugbank_id drugbank_name pubchem_id stitch_id_flat umls_cui_from_label \\\n",
"1 DB00014 Goserelin 47725 CID100047725 C0002871 \n",
"3 DB00014 Goserelin 47725 CID100047725 C0006142 \n",
"\n",
" method concept_name meddra_type \\\n",
"1 text_mention Anemia PT \n",
"3 NLP_indication Malignant neoplasm of breast PT \n",
"\n",
" umls_cui_from_meddra meddra_name \n",
"1 C0002871 Anaemia \n",
"3 C0006142 Breast cancer "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach DrugBank IDs (via the PubChem mapping) and then DrugBank names\n",
"indication_df = drugbank_map_df.merge(indication_df)\n",
"indication_df = drugbank_df.merge(indication_df)\n",
"# Keep only the rows whose meddra_type is 'PT'\n",
"indication_df = indication_df[indication_df.meddra_type == 'PT']\n",
"indication_df.head(n=2)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['Baclofen',\n",
" 'Betamethasone',\n",
" 'Carbamazepine',\n",
" 'Triamcinolone',\n",
" 'Prednisone',\n",
" 'Tizanidine',\n",
" 'Hydrocortisone',\n",
" 'Prednisolone',\n",
" 'Methylprednisolone',\n",
" 'Mitoxantrone',\n",
" 'Dantrolene',\n",
" 'Dexamethasone',\n",
" 'FTY 720',\n",
" 'Dalfampridine',\n",
" '(11alpha,14beta)-11,17,21-trihydroxypregn-4-ene-3,20-dione',\n",
" 'Fingolimod']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Names of the drugs with a multiple sclerosis indication (UMLS CUI C0026769)\n",
"indication_df.loc[indication_df.umls_cui_from_meddra == 'C0026769', 'drugbank_name'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write the indications table as a tab-separated file without the index\n",
"indication_df.to_csv('data/indications.tsv', index=False, sep='\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}