{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute perturbagen signatures from gold signature consensi"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import bz2\n",
"import json\n",
"\n",
"import requests\n",
"import pandas\n",
"import sqlite3\n",
"\n",
"import l1000"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Entrez Gene to Symbol mapping"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Read symbol to entrez_gene_id mapping\n",
"url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/symbol-map.json'\n",
"symbol_map = json.loads(requests.get(url).text)\n",
"\n",
"# Read dataframe of entrez gene records\n",
"url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'\n",
"entrez_gene_df = pandas.read_table(url)\n",
"renamer = {\n",
" 'GeneID': 'entrez_gene_id',\n",
" 'Symbol': 'symbol',\n",
" 'type_of_gene': 'type_of_gene',\n",
" 'description': 'description',\n",
"}\n",
"entrez_gene_df = entrez_gene_df.rename(columns=renamer)[list(renamer.values())]\n",
"entrez_gene_df.entrez_gene_id = entrez_gene_df.entrez_gene_id.astype(str)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read probe information"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('landmark genes', 978)\n",
"('best inferred gene set size', 10638)\n"
]
},
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" pr_id | \n",
" pr_gene_id | \n",
" pr_gene_symbol | \n",
" pr_gene_title | \n",
" is_lm | \n",
" is_l1000 | \n",
" is_bing | \n",
" pr_pool_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 1831 | \n",
" 218075_at | \n",
" 8086 | \n",
" AAAS | \n",
" achalasia, adrenocortical insufficiency, alacr... | \n",
" False | \n",
" True | \n",
" True | \n",
" inferred | \n",
"
\n",
" \n",
" 1833 | \n",
" 218434_s_at | \n",
" 65985 | \n",
" AACS | \n",
" acetoacetyl-CoA synthetase | \n",
" False | \n",
" True | \n",
" True | \n",
" inferred | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pr_id pr_gene_id pr_gene_symbol \\\n",
"1831 218075_at 8086 AAAS \n",
"1833 218434_s_at 65985 AACS \n",
"\n",
" pr_gene_title is_lm is_l1000 \\\n",
"1831 achalasia, adrenocortical insufficiency, alacr... False True \n",
"1833 acetoacetyl-CoA synthetase False True \n",
"\n",
" is_bing pr_pool_id \n",
"1831 True inferred \n",
"1833 True inferred "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# construct probe_df\n",
"probe_df = pandas.read_table('data/geneinfo/geneinfo.tsv.gz')\n",
"probe_to_gene = dict(zip(probe_df.pr_id, probe_df.pr_gene_id))\n",
"\n",
"# Landmark probes\n",
"landmark_probe_df = probe_df[probe_df.pr_pool_id.str.startswith('epsilon').fillna(False)]\n",
"print('landmark genes', len(landmark_probe_df))\n",
"\n",
"# BIGS + Landmark probes\n",
"probe_df = probe_df[probe_df.is_bing.fillna(False)]\n",
"# If a gene is an epsilon landmark, do not include imputed probes\n",
"probe_df.query(\"pr_gene_id not in @landmark_probe_df.pr_gene_id or pr_id in @landmark_probe_df.pr_id\")\n",
"print('best inferred gene set size', len(probe_df))\n",
"\n",
"probe_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" entrez_gene_id | \n",
" status | \n",
" symbol | \n",
" type_of_gene | \n",
" description | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100 | \n",
" imputed | \n",
" ADA | \n",
" protein-coding | \n",
" adenosine deaminase | \n",
"
\n",
" \n",
" 1 | \n",
" 1000 | \n",
" imputed | \n",
" CDH2 | \n",
" protein-coding | \n",
" cadherin 2, type 1, N-cadherin (neuronal) | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" entrez_gene_id status symbol type_of_gene \\\n",
"0 100 imputed ADA protein-coding \n",
"1 1000 imputed CDH2 protein-coding \n",
"\n",
" description \n",
"0 adenosine deaminase \n",
"1 cadherin 2, type 1, N-cadherin (neuronal) "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a dataset of the genes represented by probes\n",
"gene_df = probe_df.groupby('pr_gene_id').apply(\n",
" lambda df: pandas.Series({'status': 'measured' if df.pr_pool_id.iloc[0].startswith('epsilon') else 'imputed'})\n",
")\n",
"gene_df.index.name = 'entrez_gene_id'\n",
"gene_df = gene_df.reset_index()\n",
"gene_df = gene_df.merge(entrez_gene_df).sort_values('entrez_gene_id')\n",
"gene_df.to_csv('data/consensi/genes.tsv', index=False, sep='\\t')\n",
"gene_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"imputed 6489\n",
"measured 978\n",
"Name: status, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of genes after converting from probe to gene\n",
"gene_df.status.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"inferred 9490\n",
"epsilon|deltap 786\n",
"epsilon 192\n",
"deltap 165\n",
"Name: pr_pool_id, dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Filter BING probes for valid entrez gene IDs\n",
"probe_df = probe_df.query(\"pr_gene_id in @gene_df.entrez_gene_id\")\n",
"\n",
"# Probes in BING\n",
"probe_df.pr_pool_id.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preparations"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def run_consensi(sig_expr_df, pert_to_sigs, name):\n",
" \"\"\"Compute consensi signatures\"\"\"\n",
" pert_expr_df = l1000.get_consensus_signatures(sig_expr_df, pert_to_sigs, weighting_subset=landmark_probe_df.pr_id)\n",
" pert_expr_df = l1000.probes_to_genes(pert_expr_df, probe_to_gene)\n",
" pert_expr_df = pert_expr_df.transpose()\n",
" pert_expr_df.index.name = 'perturbagen'\n",
" print(pert_expr_df.shape)\n",
" path = 'data/consensi/consensi-{}.tsv.bz2'.format(name)\n",
" with bz2.BZ2File(path, 'w') as write_file:\n",
" pert_expr_df.reset_index().to_csv(write_file, sep='\\t', index=False, float_format='%.3f')\n",
" return pert_expr_df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# open database connection\n",
"connection = sqlite3.connect('data/l1000.db')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" CPU times: user 56min 42s, sys: 9min 9s, total: 1h 5min 52s\n",
"Wall time: 1h 6min 47s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"# get all gold signatures\n",
"query = \"SELECT sigs.sig_id FROM sigs WHERE sigs.is_gold = 1\"\n",
"sigs = pandas.read_sql(query, connection).sig_id.tolist()\n",
"\n",
"# get probes and extract signatures\n",
"probes = probe_df.pr_id.tolist()\n",
"path = 'download/modzs.gctx'\n",
"sig_expr_df = l1000.extract_from_gctx(path, probes, sigs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## drugbank consensi"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"query = \"\"\"\n",
"SELECT unichem.resource_id AS drugbank_id, perts.pert_id, sigs.sig_id, sigs.is_gold\n",
"FROM unichem, perts, sigs\n",
"WHERE unichem.resource = 'drugbank'\n",
"AND unichem.pert_uid = perts.pert_uid\n",
"AND sigs.pert_id = perts.pert_id\n",
"\"\"\"\n",
"sig_df = pandas.read_sql(query, connection)\n",
"sig_df = sig_df.query(\"is_gold == 1\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1170, 7467)\n",
"CPU times: user 1h 54min 57s, sys: 1min 16s, total: 1h 56min 14s\n",
"Wall time: 1h 56min 28s\n"
]
}
],
"source": [
"%%time\n",
"pert_to_sigs = {k: g['sig_id'].tolist() for k, g in sig_df.groupby('drugbank_id')}\n",
"pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='drugbank')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## knockdown consensi"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pert_id | \n",
" pert_iname | \n",
" pert_type | \n",
" sig_id | \n",
" pertubation_entrez_gene_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" TRCN0000139323 | \n",
" PTMS | \n",
" trt_sh | \n",
" DER001_HA1E_96H:TRCN0000139323:-666 | \n",
" 5763 | \n",
"
\n",
" \n",
" 1 | \n",
" TRCN0000005420 | \n",
" RIOK3 | \n",
" trt_sh | \n",
" DER001_HA1E_96H:TRCN0000005420:-666 | \n",
" 8780 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pert_id pert_iname pert_type sig_id \\\n",
"0 TRCN0000139323 PTMS trt_sh DER001_HA1E_96H:TRCN0000139323:-666 \n",
"1 TRCN0000005420 RIOK3 trt_sh DER001_HA1E_96H:TRCN0000005420:-666 \n",
"\n",
" pertubation_entrez_gene_id \n",
"0 5763 \n",
"1 8780 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"\"\"\n",
"SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id\n",
"FROM perts, sigs\n",
"WHERE sigs.pert_id = perts.pert_id\n",
"AND pert_type = 'trt_sh'\n",
"AND sigs.is_gold = 1\n",
"\"\"\"\n",
"sig_df = pandas.read_sql(query, connection)\n",
"sig_df['pertubation_entrez_gene_id'] = sig_df.pert_iname.map(symbol_map.get)\n",
"sig_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(4326, 7467)\n",
"CPU times: user 5h 38min 35s, sys: 1min 5s, total: 5h 39min 40s\n",
"Wall time: 5h 40min 16s\n"
]
}
],
"source": [
"%%time\n",
"# Condense to perturbagens\n",
"pert_to_sigs = {int(k): g['sig_id'].tolist() for k, g in sig_df.groupby('pertubation_entrez_gene_id')}\n",
"pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='knockdown')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## overexpression consensi"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pert_id | \n",
" pert_iname | \n",
" pert_type | \n",
" sig_id | \n",
" pertubation_entrez_gene_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" CMAP-HSF-DSTYK | \n",
" DSTYK | \n",
" trt_oe | \n",
" HSF001_HEK293T_48H:CMAP-HSF-DSTYK:200 | \n",
" 25778 | \n",
"
\n",
" \n",
" 1 | \n",
" CMAP-HSF-DYRK1B | \n",
" DYRK1B | \n",
" trt_oe | \n",
" HSF001_HEK293T_48H:CMAP-HSF-DYRK1B:200 | \n",
" 9149 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pert_id pert_iname pert_type \\\n",
"0 CMAP-HSF-DSTYK DSTYK trt_oe \n",
"1 CMAP-HSF-DYRK1B DYRK1B trt_oe \n",
"\n",
" sig_id pertubation_entrez_gene_id \n",
"0 HSF001_HEK293T_48H:CMAP-HSF-DSTYK:200 25778 \n",
"1 HSF001_HEK293T_48H:CMAP-HSF-DYRK1B:200 9149 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"\"\"\n",
"SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id\n",
"FROM perts, sigs\n",
"WHERE sigs.pert_id = perts.pert_id\n",
"AND pert_type = 'trt_oe'\n",
"AND sigs.is_gold = 1\n",
"\"\"\"\n",
"sig_df = pandas.read_sql(query, connection)\n",
"sig_df['pertubation_entrez_gene_id'] = sig_df.pert_iname.map(symbol_map.get)\n",
"sig_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2413, 7467)\n",
"CPU times: user 2h 25min 22s, sys: 6.9 s, total: 2h 25min 29s\n",
"Wall time: 2h 25min 42s\n"
]
}
],
"source": [
"%%time\n",
"# Condense to perturbagens\n",
"pert_to_sigs = {int(k): g['sig_id'].tolist() for k, g in sig_df.groupby('pertubation_entrez_gene_id')}\n",
"pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='overexpression')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## consensi for all pert_ids"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pert_id | \n",
" pert_iname | \n",
" pert_type | \n",
" sig_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" BRD-K07762753 | \n",
" aminopurvalanol-a | \n",
" trt_cp | \n",
" CVD001_HUH7_24H:BRD-K07762753-001-03-6:50 | \n",
"
\n",
" \n",
" 1 | \n",
" BRD-A46393198 | \n",
" tetramisole | \n",
" trt_cp | \n",
" CPC004_VCAP_6H:BRD-A46393198-003-10-9:10 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pert_id pert_iname pert_type \\\n",
"0 BRD-K07762753 aminopurvalanol-a trt_cp \n",
"1 BRD-A46393198 tetramisole trt_cp \n",
"\n",
" sig_id \n",
"0 CVD001_HUH7_24H:BRD-K07762753-001-03-6:50 \n",
"1 CPC004_VCAP_6H:BRD-A46393198-003-10-9:10 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"\"\"\n",
"SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id\n",
"FROM perts, sigs\n",
"WHERE sigs.pert_id = perts.pert_id\n",
"AND sigs.is_gold = 1\n",
"\"\"\"\n",
"sig_df = pandas.read_sql(query, connection)\n",
"sig_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(38327, 7467)\n",
"CPU times: user 1d 18h 22min 52s, sys: 6min 7s, total: 1d 18h 29min\n",
"Wall time: 1d 18h 33min 36s\n"
]
}
],
"source": [
"%%time\n",
"# Condense to perturbagens\n",
"pert_to_sigs = {k: g['sig_id'].tolist() for k, g in sig_df.groupby('pert_id')}\n",
"pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='pert_id')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 100 | \n",
" 1000 | \n",
" 10000 | \n",
" 10001 | \n",
" 10005 | \n",
" 10006 | \n",
" 10007 | \n",
" 1001 | \n",
" 10010 | \n",
" 100129250 | \n",
" ... | \n",
" 998 | \n",
" 9984 | \n",
" 9986 | \n",
" 9987 | \n",
" 9988 | \n",
" 9989 | \n",
" 999 | \n",
" 9991 | \n",
" 9994 | \n",
" 9997 | \n",
"
\n",
" \n",
" perturbagen | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 56582 | \n",
" -0.433639 | \n",
" 0.797953 | \n",
" -0.089733 | \n",
" -0.627366 | \n",
" 0.132508 | \n",
" -0.114487 | \n",
" -1.254363 | \n",
" 0.089211 | \n",
" -0.327051 | \n",
" -0.492944 | \n",
" ... | \n",
" -0.128962 | \n",
" -0.018879 | \n",
" -0.026018 | \n",
" -0.300755 | \n",
" 0.447996 | \n",
" 0.39047 | \n",
" 2.441479 | \n",
" -0.325544 | \n",
" 0.481472 | \n",
" -0.690715 | \n",
"
\n",
" \n",
" 5981 | \n",
" 0.953652 | \n",
" -0.170241 | \n",
" -1.344758 | \n",
" 0.200088 | \n",
" 3.032086 | \n",
" 1.175347 | \n",
" -0.768220 | \n",
" 1.000292 | \n",
" 1.883492 | \n",
" 0.335095 | \n",
" ... | \n",
" 0.045078 | \n",
" -0.595802 | \n",
" -0.201252 | \n",
" -0.149790 | \n",
" -1.290155 | \n",
" 1.35908 | \n",
" 0.803665 | \n",
" -1.481666 | \n",
" 0.970973 | \n",
" 0.088374 | \n",
"
\n",
" \n",
"
\n",
"
2 rows × 7467 columns
\n",
"
"
],
"text/plain": [
" 100 1000 10000 10001 10005 10006 \\\n",
"perturbagen \n",
"56582 -0.433639 0.797953 -0.089733 -0.627366 0.132508 -0.114487 \n",
"5981 0.953652 -0.170241 -1.344758 0.200088 3.032086 1.175347 \n",
"\n",
" 10007 1001 10010 100129250 ... 998 \\\n",
"perturbagen ... \n",
"56582 -1.254363 0.089211 -0.327051 -0.492944 ... -0.128962 \n",
"5981 -0.768220 1.000292 1.883492 0.335095 ... 0.045078 \n",
"\n",
" 9984 9986 9987 9988 9989 999 \\\n",
"perturbagen \n",
"56582 -0.018879 -0.026018 -0.300755 0.447996 0.39047 2.441479 \n",
"5981 -0.595802 -0.201252 -0.149790 -1.290155 1.35908 0.803665 \n",
"\n",
" 9991 9994 9997 \n",
"perturbagen \n",
"56582 -0.325544 0.481472 -0.690715 \n",
"5981 -1.481666 0.970973 0.088374 \n",
"\n",
"[2 rows x 7467 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pert_expr_df.head(2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}