{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Compute perturbagen signatures from gold signature consensi" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import bz2\n", "import json\n", "\n", "import requests\n", "import pandas\n", "import sqlite3\n", "\n", "import l1000" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Entrez Gene to Symbol mapping" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Read symbol to entrez_gene_id mapping\n", "url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/symbol-map.json'\n", "symbol_map = json.loads(requests.get(url).text)\n", "\n", "# Read dataframe of entrez gene records\n", "url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'\n", "entrez_gene_df = pandas.read_table(url)\n", "renamer = {\n", " 'GeneID': 'entrez_gene_id',\n", " 'Symbol': 'symbol',\n", " 'type_of_gene': 'type_of_gene',\n", " 'description': 'description',\n", "}\n", "entrez_gene_df = entrez_gene_df.rename(columns=renamer)[list(renamer.values())]\n", "entrez_gene_df.entrez_gene_id = entrez_gene_df.entrez_gene_id.astype(str)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read probe information" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('landmark genes', 978)\n", "('best inferred gene set size', 10638)\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pr_idpr_gene_idpr_gene_symbolpr_gene_titleis_lmis_l1000is_bingpr_pool_id
1831218075_at8086AAASachalasia, adrenocortical insufficiency, alacr...FalseTrueTrueinferred
1833218434_s_at65985AACSacetoacetyl-CoA synthetaseFalseTrueTrueinferred
\n", "
" ], "text/plain": [ " pr_id pr_gene_id pr_gene_symbol \\\n", "1831 218075_at 8086 AAAS \n", "1833 218434_s_at 65985 AACS \n", "\n", " pr_gene_title is_lm is_l1000 \\\n", "1831 achalasia, adrenocortical insufficiency, alacr... False True \n", "1833 acetoacetyl-CoA synthetase False True \n", "\n", " is_bing pr_pool_id \n", "1831 True inferred \n", "1833 True inferred " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# construct probe_df\n", "probe_df = pandas.read_table('data/geneinfo/geneinfo.tsv.gz')\n", "probe_to_gene = dict(zip(probe_df.pr_id, probe_df.pr_gene_id))\n", "\n", "# Landmark probes\n", "landmark_probe_df = probe_df[probe_df.pr_pool_id.str.startswith('epsilon').fillna(False)]\n", "print('landmark genes', len(landmark_probe_df))\n", "\n", "# BIGS + Landmark probes\n", "probe_df = probe_df[probe_df.is_bing.fillna(False)]\n", "# If a gene is an epsilon landmark, do not include imputed probes\n", "probe_df.query(\"pr_gene_id not in @landmark_probe_df.pr_gene_id or pr_id in @landmark_probe_df.pr_id\")\n", "print('best inferred gene set size', len(probe_df))\n", "\n", "probe_df.head(2)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
entrez_gene_idstatussymboltype_of_genedescription
0100imputedADAprotein-codingadenosine deaminase
11000imputedCDH2protein-codingcadherin 2, type 1, N-cadherin (neuronal)
\n", "
" ], "text/plain": [ " entrez_gene_id status symbol type_of_gene \\\n", "0 100 imputed ADA protein-coding \n", "1 1000 imputed CDH2 protein-coding \n", "\n", " description \n", "0 adenosine deaminase \n", "1 cadherin 2, type 1, N-cadherin (neuronal) " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a dataset of the genes represented by probes\n", "gene_df = probe_df.groupby('pr_gene_id').apply(\n", " lambda df: pandas.Series({'status': 'measured' if df.pr_pool_id.iloc[0].startswith('epsilon') else 'imputed'})\n", ")\n", "gene_df.index.name = 'entrez_gene_id'\n", "gene_df = gene_df.reset_index()\n", "gene_df = gene_df.merge(entrez_gene_df).sort_values('entrez_gene_id')\n", "gene_df.to_csv('data/consensi/genes.tsv', index=False, sep='\\t')\n", "gene_df.head(2)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "imputed 6489\n", "measured 978\n", "Name: status, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of genes after converting from probe to gene\n", "gene_df.status.value_counts()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "inferred 9490\n", "epsilon|deltap 786\n", "epsilon 192\n", "deltap 165\n", "Name: pr_pool_id, dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Filter BING probes for valid entrez gene IDs\n", "probe_df = probe_df.query(\"pr_gene_id in @gene_df.entrez_gene_id\")\n", "\n", "# Probes in BING\n", "probe_df.pr_pool_id.value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preparations" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def run_consensi(sig_expr_df, pert_to_sigs, name):\n", " \"\"\"Compute consensi signatures\"\"\"\n", " pert_expr_df = l1000.get_consensus_signatures(sig_expr_df, pert_to_sigs, weighting_subset=landmark_probe_df.pr_id)\n", " pert_expr_df = l1000.probes_to_genes(pert_expr_df, probe_to_gene)\n", " pert_expr_df = pert_expr_df.transpose()\n", " pert_expr_df.index.name = 'perturbagen'\n", " print(pert_expr_df.shape)\n", " path = 'data/consensi/consensi-{}.tsv.bz2'.format(name)\n", " with bz2.BZ2File(path, 'w') as write_file:\n", " pert_expr_df.reset_index().to_csv(write_file, sep='\\t', index=False, float_format='%.3f')\n", " return pert_expr_df" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# open database connection\n", "connection = sqlite3.connect('data/l1000.db')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " CPU times: user 56min 42s, sys: 9min 9s, total: 1h 5min 52s\n", "Wall time: 1h 6min 47s\n" ] } ], "source": [ "%%time\n", "\n", "# get all gold signatures\n", "query = \"SELECT sigs.sig_id FROM sigs WHERE sigs.is_gold = 1\"\n", "sigs = pandas.read_sql(query, connection).sig_id.tolist()\n", "\n", "# get probes and extract signatures\n", "probes = probe_df.pr_id.tolist()\n", "path = 'download/modzs.gctx'\n", "sig_expr_df = l1000.extract_from_gctx(path, probes, sigs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## drugbank consensi" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT unichem.resource_id AS drugbank_id, perts.pert_id, sigs.sig_id, sigs.is_gold\n", "FROM unichem, perts, sigs\n", "WHERE unichem.resource = 'drugbank'\n", "AND unichem.pert_uid = perts.pert_uid\n", "AND sigs.pert_id = perts.pert_id\n", "\"\"\"\n", "sig_df = pandas.read_sql(query, connection)\n", "sig_df = sig_df.query(\"is_gold == 1\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1170, 7467)\n", "CPU times: user 1h 54min 57s, sys: 1min 16s, total: 1h 56min 14s\n", "Wall time: 1h 56min 28s\n" ] } ], "source": [ "%%time\n", "pert_to_sigs = {k: g['sig_id'].tolist() for k, g in sig_df.groupby('drugbank_id')}\n", "pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='drugbank')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## knockdown consensi" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_idpert_inamepert_typesig_idpertubation_entrez_gene_id
0TRCN0000139323PTMStrt_shDER001_HA1E_96H:TRCN0000139323:-6665763
1TRCN0000005420RIOK3trt_shDER001_HA1E_96H:TRCN0000005420:-6668780
\n", "
" ], "text/plain": [ " pert_id pert_iname pert_type sig_id \\\n", "0 TRCN0000139323 PTMS trt_sh DER001_HA1E_96H:TRCN0000139323:-666 \n", "1 TRCN0000005420 RIOK3 trt_sh DER001_HA1E_96H:TRCN0000005420:-666 \n", "\n", " pertubation_entrez_gene_id \n", "0 5763 \n", "1 8780 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", "SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id\n", "FROM perts, sigs\n", "WHERE sigs.pert_id = perts.pert_id\n", "AND pert_type = 'trt_sh'\n", "AND sigs.is_gold = 1\n", "\"\"\"\n", "sig_df = pandas.read_sql(query, connection)\n", "sig_df['pertubation_entrez_gene_id'] = sig_df.pert_iname.map(symbol_map.get)\n", "sig_df.head(2)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(4326, 7467)\n", "CPU times: user 5h 38min 35s, sys: 1min 5s, total: 5h 39min 40s\n", "Wall time: 5h 40min 16s\n" ] } ], "source": [ "%%time\n", "# Condense to perturbagens\n", "pert_to_sigs = {int(k): g['sig_id'].tolist() for k, g in sig_df.groupby('pertubation_entrez_gene_id')}\n", "pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='knockdown')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## overexpression consensi" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_idpert_inamepert_typesig_idpertubation_entrez_gene_id
0CMAP-HSF-DSTYKDSTYKtrt_oeHSF001_HEK293T_48H:CMAP-HSF-DSTYK:20025778
1CMAP-HSF-DYRK1BDYRK1Btrt_oeHSF001_HEK293T_48H:CMAP-HSF-DYRK1B:2009149
\n", "
" ], "text/plain": [ " pert_id pert_iname pert_type \\\n", "0 CMAP-HSF-DSTYK DSTYK trt_oe \n", "1 CMAP-HSF-DYRK1B DYRK1B trt_oe \n", "\n", " sig_id pertubation_entrez_gene_id \n", "0 HSF001_HEK293T_48H:CMAP-HSF-DSTYK:200 25778 \n", "1 HSF001_HEK293T_48H:CMAP-HSF-DYRK1B:200 9149 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", "SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id\n", "FROM perts, sigs\n", "WHERE sigs.pert_id = perts.pert_id\n", "AND pert_type = 'trt_oe'\n", "AND sigs.is_gold = 1\n", "\"\"\"\n", "sig_df = pandas.read_sql(query, connection)\n", "sig_df['pertubation_entrez_gene_id'] = sig_df.pert_iname.map(symbol_map.get)\n", "sig_df.head(2)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(2413, 7467)\n", "CPU times: user 2h 25min 22s, sys: 6.9 s, total: 2h 25min 29s\n", "Wall time: 2h 25min 42s\n" ] } ], "source": [ "%%time\n", "# Condense to perturbagens\n", "pert_to_sigs = {int(k): g['sig_id'].tolist() for k, g in sig_df.groupby('pertubation_entrez_gene_id')}\n", "pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='overexpression')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## consensi for all pert_ids" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pert_idpert_inamepert_typesig_id
0BRD-K07762753aminopurvalanol-atrt_cpCVD001_HUH7_24H:BRD-K07762753-001-03-6:50
1BRD-A46393198tetramisoletrt_cpCPC004_VCAP_6H:BRD-A46393198-003-10-9:10
\n", "
" ], "text/plain": [ " pert_id pert_iname pert_type \\\n", "0 BRD-K07762753 aminopurvalanol-a trt_cp \n", "1 BRD-A46393198 tetramisole trt_cp \n", "\n", " sig_id \n", "0 CVD001_HUH7_24H:BRD-K07762753-001-03-6:50 \n", "1 CPC004_VCAP_6H:BRD-A46393198-003-10-9:10 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", "SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id\n", "FROM perts, sigs\n", "WHERE sigs.pert_id = perts.pert_id\n", "AND sigs.is_gold = 1\n", "\"\"\"\n", "sig_df = pandas.read_sql(query, connection)\n", "sig_df.head(2)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(38327, 7467)\n", "CPU times: user 1d 18h 22min 52s, sys: 6min 7s, total: 1d 18h 29min\n", "Wall time: 1d 18h 33min 36s\n" ] } ], "source": [ "%%time\n", "# Condense to perturbagens\n", "pert_to_sigs = {k: g['sig_id'].tolist() for k, g in sig_df.groupby('pert_id')}\n", "pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='pert_id')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
10010001000010001100051000610007100110010100129250...99899849986998799889989999999199949997
perturbagen
56582-0.4336390.797953-0.089733-0.6273660.132508-0.114487-1.2543630.089211-0.327051-0.492944...-0.128962-0.018879-0.026018-0.3007550.4479960.390472.441479-0.3255440.481472-0.690715
59810.953652-0.170241-1.3447580.2000883.0320861.175347-0.7682201.0002921.8834920.335095...0.045078-0.595802-0.201252-0.149790-1.2901551.359080.803665-1.4816660.9709730.088374
\n", "

2 rows × 7467 columns

\n", "
" ], "text/plain": [ " 100 1000 10000 10001 10005 10006 \\\n", "perturbagen \n", "56582 -0.433639 0.797953 -0.089733 -0.627366 0.132508 -0.114487 \n", "5981 0.953652 -0.170241 -1.344758 0.200088 3.032086 1.175347 \n", "\n", " 10007 1001 10010 100129250 ... 998 \\\n", "perturbagen ... \n", "56582 -1.254363 0.089211 -0.327051 -0.492944 ... -0.128962 \n", "5981 -0.768220 1.000292 1.883492 0.335095 ... 0.045078 \n", "\n", " 9984 9986 9987 9988 9989 999 \\\n", "perturbagen \n", "56582 -0.018879 -0.026018 -0.300755 0.447996 0.39047 2.441479 \n", "5981 -0.595802 -0.201252 -0.149790 -1.290155 1.35908 0.803665 \n", "\n", " 9991 9994 9997 \n", "perturbagen \n", "56582 -0.325544 0.481472 -0.690715 \n", "5981 -1.481666 0.970973 0.088374 \n", "\n", "[2 rows x 7467 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pert_expr_df.head(2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }