{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 3.0 2,700 PBMC scRNA-seq\n", "Single cell RNA-seq (scRNA-seq) is a powerful method to interrogate gene expression across thousands of single cells. This method provides thousands of measurements (single cells) across thousands of dimensions (genes). This notebook uses Clustergrammer2 to interactively explore an example dataset measuring the gene expression of 2,700 PBMCs obtained from [10X Genomics](https://www.10xgenomics.com/resources/datasets/). Bulg gene expression signatures of cell types from [CIBERSORT](https://cibersort.stanford.edu/) were used to obtain a tentative cell type for each cell." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "from clustergrammer2 import net\n", "df = {}" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import f1_score\n", "import pandas as pd\n", "import numpy as np\n", "from copy import deepcopy\n", "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline \n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def calc_mean_var_disp(df_inst):\n", " mean_arr = []\n", " var_arr = []\n", " mean_names = []\n", " for inst_gene in df_inst.index.tolist():\n", " mean_arr.append( df_inst.loc[inst_gene].mean() )\n", " var_arr.append(df_inst.loc[inst_gene].var())\n", " mean_names.append(inst_gene)\n", "\n", " ser_mean = pd.Series(data=mean_arr, index=mean_names)\n", " ser_var = pd.Series(data=var_arr, index=mean_names) \n", " return ser_mean, ser_var" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def cell_umi_count(df):\n", " sum_arr = []\n", " sum_names = []\n", " for inst_cell in df:\n", " sum_arr.append( df[inst_cell].sum() )\n", " sum_names.append(inst_cell)\n", " \n", " ser_sum = pd.Series(data=sum_arr, index=sum_names)\n", " return ser_sum" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load Data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(32738, 2700)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = net.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')\n", "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove Ribosomal and Mitochondrial Genes" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "32738\n", "32546\n", "['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L7', 'MTRNR2L5', 'MTRNR2L8', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']\n" ] } ], "source": [ "all_genes = df.index.tolist()\n", "print(len(all_genes))\n", "keep_genes = [x for x in all_genes if 'RPL' not in x]\n", "keep_genes = [x for x in keep_genes if 'RPS' not in x]\n", "print(len(keep_genes))\n", "\n", "df = df.loc[keep_genes]\n", "df.shape\n", "\n", "# Removing Mitochondrial Genes\n", "list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',\n", " 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']\n", "\n", "all_genes = df.index.tolist()\n", "mito_genes = [x for x in all_genes if 'MT-' == x[:3] or \n", " x.split('_')[0] in list_mito_genes]\n", "print(mito_genes)\n", "\n", "keep_genes = [x for x in all_genes if x not in mito_genes]\n", "df = df.loc[keep_genes]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Keep top 5K Expressing Genes" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5000, 2700)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser_mean, ser_var = calc_mean_var_disp(df)\n", "\n", "num_keep_mean = 5000\n", "num_top_var = 250\n", "\n", "# filter for top expressing genes\n", "keep_mean = ser_mean.sort_values(ascending=False)[:num_keep_mean].index.tolist()\n", "\n", "df = df.loc[keep_mean]\n", "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Find top 250 Variable Genes" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "250" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser_keep_var = ser_var[keep_mean]\n", "# filter for top variance based\n", "keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var]\n", "len(keep_var)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### UMI Normalize GEX Data" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(5000, 2700)\n", "AAACATACAACCAC 1.0\n", "AAACATTGAGCTAC 1.0\n", "AAACATTGATCAGC 1.0\n", "AAACCGTGCTTCCG 1.0\n", "AAACCGTGTATGCG 1.0\n", "dtype: float64\n", "CPU times: user 889 ms, sys: 248 ms, total: 1.14 s\n", "Wall time: 779 ms\n" ] } ], "source": [ "%%time\n", "ser_sum = cell_umi_count(df)\n", "df = df.div(ser_sum)\n", "print(df.shape)\n", "print(df.sum().head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Find top expressing genes " ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "ser_keep_var = ser_var[keep_mean]\n", "# filter for top variance based\n", "keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ArcSinh Transform and Z-score GEX Data" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(5000, 2700)\n" ] } ], "source": [ "# ArcSinh transform\n", "df = np.arcsinh(df/5)\n", "\n", "# Z-score genes\n", "net.load_df(df)\n", "net.normalize(axis='row', norm_type='zscore')\n", "\n", "# round to two decimal points\n", "df = net.export_df().round(2)\n", "\n", "print(df.shape)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "# df.columns = [(x, 'Cell Type: Unknown') for x in df.columns.tolist()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Unlabeled Cells " ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "\"None of [['FTL', 'FTH1', 'MALAT1', 'TMSB4X', 'B2M', 'LYZ', 'ACTB', 'S100A9', 'CD74', 'HLA-DRA', 'TMSB10', 'CST3', 'EEF1A1', 'S100A4', 'S100A8', 'TPT1', 'HLA-DPB1', 'PTMA', 'NKG7', 'GNLY', 'TYROBP', 'JUNB', 'GNB2L1', 'HLA-C', 'NACA', 'GAPDH', 'LTB', 'HLA-DPA1', 'OAZ1', 'PFN1', 'HLA-DRB1', 'HLA-A', 'COTL1', 'S100A6', 'ACTG1', 'FOS', 'SAT1', 'EIF1', 'LGALS1', 'LST1', 'CCL5', 'AIF1', 'VIM', 'H3F3B', 'SH3BGRL3', 'CYBA', 'FCER1G', 'UBA52', 'EEF1D', 'DUSP1', 'FAU', 'ARHGDIB', 'CFL1', 'HLA-B', 'CTSS', 'IGJ', 'FCN1', 'IFITM2', 'MYL6', 'BTG1', 'COX4I1', 'HLA-E', 'CD52', 'S100A11', 'IL32', 'YBX1', 'GZMB', 'SRGN', 'MYL12A', 'ARPC1B', 'ARPC3', 'PFDN5', 'JUN', 'CD37', 'BTF3', 'EEF1B2', 'PABPC1', 'PSAP', 'UBB', 'ANXA1', 'HNRNPA1', 'S100A10', 'NPC2', 'ATP5G2', 'PPBP', 'UBC', 'GLTSCR2', 'NPM1', 'SLC25A6', 'GSTP1', 'IGLL5', 'ARPC2', 'ZFP36', 'LDHB', 'EMP3', 'GPX1', 'RBM3', 'HLA-DRB5', 'LY6E', 'EIF3K', 'KLF6', 'EEF2', 'CORO1A', 'ISG15', 'ITM2B', 'EIF4A1', 'FXYD5', 'IFITM3', 'NEAT1', 'TYMP', 'LAPTM5', 'PSMA7', 'FCGR3A', 'TUBA1B', 'CLIC1', 'SERF2', 'LGALS2', 'CD48', 'TMEM66', 'HSP90AA1', 'HLA-DQA1', 'CD79A', 'SRSF5', 'GMFG', 'IER2', 'EIF3H', 'CD3D', 'DDX5', 'ATP6V0E1', 'TXNIP', 'PPIB', 'CXCR4', 'GIMAP7', 'HNRNPA2B1', 'PSME1', 'YWHAB', 'ARPC5', 'PYCARD', 'UBXN4', 'MYL12B', 'LIMD2', 'PTPRCAP', 'HMGB2', 'PRDX1', 'CIRBP', 'HMGB1', 'RAC2', 'TIMP1', 'CCL4', 'TALDO1', 'FUS', 'PNRC1', 'HLA-DQB1', 'ALDOA', 'SRP14', 'CEBPB', 'C1orf162', 'CALM1', 'TSC22D3', 'SLC25A5', 'PRELID1', 'HINT1', 'ENO1', 'ID2', 'GIMAP4', 'GIMAP5', 'SRSF7', 'NFKBIA', 'UBXN1', 'EIF1AY', 'CHCHD2', 'UBE2D3', 'CCL3', 'NAP1L1', 'CST7', 'SSR2', 'IFI6', 'HLA-DMA', 'VAMP8', 'CCNI', 'SNX3', 'PRF1', 'IL7R', 'CFD', 'SNRPB', 'C6orf48', 'PLAC8', 'RAB2A', 'GPSM3', 'ARL6IP5', 'ANXA2', 'CTSW', 'SF3B5', 'STK17A', 'VPS28', 'FYB', 'EDF1', 'ATP5L', 'LDHA', 'PSMB8', 'TRAF3IP3', 'SMARCA4', 'PPDPF', 'CAPZB', 'SELL', 'CD79B', 'AP2S1', 'SRSF3', 'CD3E', 'HSPA8', 'CNBP', 'ATP5D', 'C4orf3', 'NDUFA2', 'SH3BGRL', 'SDHB', 'GTF3A', 'PPIA', 'NDUFA4', 'ZFP36L2', 'RBM39', 'CD2', 'BRK1', 'PRDX3', 'RARRES3', 'PSME2', 'ATP5E', 'TPI1', 'RHOG', 'GZMA', 'SERP1', 'CCND3', 'PSMB9', 'AES', 'UBE2D2', 'KIF5B', 'RAN', 'H2AFZ', 'TOMM7', 'ATP5A1', 'EIF4A2', 'RAC1', 'ATP5O', 'DRAP1', 'NOSIP', 'PSMB6', 'ATP5H', 'TMBIM6', 'FGFBP2', 'PPA1']] are in the [index]\"", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkeep_var\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mupper\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmanual_category\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Cell Type'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwidget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1477\u001b[0m \u001b[0mmaybe_callable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1478\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmaybe_callable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1479\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1480\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_is_scalar_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1899\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Cannot index with multidimensional key'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1901\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_iterable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1902\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1903\u001b[0m \u001b[0;31m# nested tuple slicing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_iterable\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_unique\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeyarr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_unique\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1142\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0max\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_indexer_for\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1143\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_read_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1145\u001b[0m \u001b[0md\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeyarr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[0;34m(self, key, indexer, axis)\u001b[0m\n\u001b[1;32m 1204\u001b[0m raise KeyError(\n\u001b[1;32m 1205\u001b[0m u\"None of [{key}] are in the [{axis}]\".format(\n\u001b[0;32m-> 1206\u001b[0;31m key=key, axis=self.obj._get_axis_name(axis)))\n\u001b[0m\u001b[1;32m 1207\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1208\u001b[0m \u001b[0;31m# we skip the warning on Categorical/Interval\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: \"None of [['FTL', 'FTH1', 'MALAT1', 'TMSB4X', 'B2M', 'LYZ', 'ACTB', 'S100A9', 'CD74', 'HLA-DRA', 'TMSB10', 'CST3', 'EEF1A1', 'S100A4', 'S100A8', 'TPT1', 'HLA-DPB1', 'PTMA', 'NKG7', 'GNLY', 'TYROBP', 'JUNB', 'GNB2L1', 'HLA-C', 'NACA', 'GAPDH', 'LTB', 'HLA-DPA1', 'OAZ1', 'PFN1', 'HLA-DRB1', 'HLA-A', 'COTL1', 'S100A6', 'ACTG1', 'FOS', 'SAT1', 'EIF1', 'LGALS1', 'LST1', 'CCL5', 'AIF1', 'VIM', 'H3F3B', 'SH3BGRL3', 'CYBA', 'FCER1G', 'UBA52', 'EEF1D', 'DUSP1', 'FAU', 'ARHGDIB', 'CFL1', 'HLA-B', 'CTSS', 'IGJ', 'FCN1', 'IFITM2', 'MYL6', 'BTG1', 'COX4I1', 'HLA-E', 'CD52', 'S100A11', 'IL32', 'YBX1', 'GZMB', 'SRGN', 'MYL12A', 'ARPC1B', 'ARPC3', 'PFDN5', 'JUN', 'CD37', 'BTF3', 'EEF1B2', 'PABPC1', 'PSAP', 'UBB', 'ANXA1', 'HNRNPA1', 'S100A10', 'NPC2', 'ATP5G2', 'PPBP', 'UBC', 'GLTSCR2', 'NPM1', 'SLC25A6', 'GSTP1', 'IGLL5', 'ARPC2', 'ZFP36', 'LDHB', 'EMP3', 'GPX1', 'RBM3', 'HLA-DRB5', 'LY6E', 'EIF3K', 'KLF6', 'EEF2', 'CORO1A', 'ISG15', 'ITM2B', 'EIF4A1', 'FXYD5', 'IFITM3', 'NEAT1', 'TYMP', 'LAPTM5', 'PSMA7', 'FCGR3A', 'TUBA1B', 'CLIC1', 'SERF2', 'LGALS2', 'CD48', 'TMEM66', 'HSP90AA1', 'HLA-DQA1', 'CD79A', 'SRSF5', 'GMFG', 'IER2', 'EIF3H', 'CD3D', 'DDX5', 'ATP6V0E1', 'TXNIP', 'PPIB', 'CXCR4', 'GIMAP7', 'HNRNPA2B1', 'PSME1', 'YWHAB', 'ARPC5', 'PYCARD', 'UBXN4', 'MYL12B', 'LIMD2', 'PTPRCAP', 'HMGB2', 'PRDX1', 'CIRBP', 'HMGB1', 'RAC2', 'TIMP1', 'CCL4', 'TALDO1', 'FUS', 'PNRC1', 'HLA-DQB1', 'ALDOA', 'SRP14', 'CEBPB', 'C1orf162', 'CALM1', 'TSC22D3', 'SLC25A5', 'PRELID1', 'HINT1', 'ENO1', 'ID2', 'GIMAP4', 'GIMAP5', 'SRSF7', 'NFKBIA', 'UBXN1', 'EIF1AY', 'CHCHD2', 'UBE2D3', 'CCL3', 'NAP1L1', 'CST7', 'SSR2', 'IFI6', 'HLA-DMA', 'VAMP8', 'CCNI', 'SNX3', 'PRF1', 'IL7R', 'CFD', 'SNRPB', 'C6orf48', 'PLAC8', 'RAB2A', 'GPSM3', 'ARL6IP5', 'ANXA2', 'CTSW', 'SF3B5', 'STK17A', 'VPS28', 'FYB', 'EDF1', 'ATP5L', 'LDHA', 'PSMB8', 'TRAF3IP3', 'SMARCA4', 'PPDPF', 'CAPZB', 'SELL', 'CD79B', 'AP2S1', 'SRSF3', 'CD3E', 'HSPA8', 'CNBP', 'ATP5D', 'C4orf3', 'NDUFA2', 'SH3BGRL', 'SDHB', 'GTF3A', 'PPIA', 'NDUFA4', 'ZFP36L2', 'RBM39', 'CD2', 'BRK1', 'PRDX3', 'RARRES3', 'PSME2', 'ATP5E', 'TPI1', 'RHOG', 'GZMA', 'SERP1', 'CCND3', 'PSMB9', 'AES', 'UBE2D2', 'KIF5B', 'RAN', 'H2AFZ', 'TOMM7', 'ATP5A1', 'EIF4A2', 'RAC1', 'ATP5O', 'DRAP1', 'NOSIP', 'PSMB6', 'ATP5H', 'TMBIM6', 'FGFBP2', 'PPA1']] are in the [index]\"" ] } ], "source": [ "net.load_df(df.loc[keep_var])\n", "net.clip(lower=-5, upper=5)\n", "# net.manual_category(col='Cell Type')\n", "net.widget()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# man_cat = net.get_manual_category('col', 'Cell Type')\n", "# man_cat['Cell Type'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load CIBERSORT gene sigantures" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(523, 22)\n" ] } ], "source": [ "net.load_file('../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt')\n", "net.normalize(axis='row', norm_type='zscore')\n", "df_sig = net.export_df()\n", "print(df_sig.shape)\n", "\n", "rows = df_sig.index.tolist()\n", "new_rows = [x.split('_')[0] for x in rows]\n", "df_sig.index = new_rows" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "ct_color = {}\n", "ct_color['T cells CD8'] = 'red'\n", "ct_color['T cells CD4 naive'] = 'blue'\n", "ct_color['T cells CD4 memory activated'] = 'blue'\n", "ct_color['T cells CD4 memory resting'] = '#87cefa' # sky blue\n", "ct_color['B cells naive'] = 'purple'\n", "ct_color['B cells memory'] = '#DA70D6' # orchid\n", "ct_color['NK cells activated'] = 'yellow'\n", "ct_color['NK cells resting'] = '#FCD116' # sign yellow\n", "ct_color['Monocytes'] = '#98ff98' # mint green\n", "ct_color['Macrophages M0'] = '#D3D3D3' # light grey\n", "ct_color['Macrophages M1'] = '#C0C0C0' # silver\n", "ct_color['Macrophages M2'] = '#A9A9A9' # dark grey\n", "ct_color['N.A.'] = 'white'" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "def set_cat_colors(axis, cat_index, cat_title=False):\n", " for inst_ct in ct_color:\n", " if cat_title != False:\n", " cat_name = cat_title + ': ' + inst_ct\n", " else:\n", " cat_name = inst_ct\n", " \n", " inst_color = ct_color[inst_ct]\n", " net.set_cat_color(axis=axis, cat_index=cat_index, cat_name=cat_name, inst_color=inst_color)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "set_cat_colors('col', 1)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "gene_sig = df_sig.idxmax(axis=1)\n", "gs_dict = {}\n", "for inst_gene in gene_sig.index.tolist():\n", " gs_dict[inst_gene] = gene_sig[inst_gene][0]\n", "df_sig_cat = deepcopy(df_sig)\n", "rows = df_sig_cat.index.tolist()\n", "new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]\n", "df_sig_cat.index = new_rows\n", "\n", "net.load_df(df_sig_cat)\n", "set_cat_colors('row', 1, 'Cell Type')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5ce7ecd4d4784b38b976142efd417040", "version_major": 2, "version_minor": 0 }, "text/plain": [ "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"ABCB4\", \"ini\": 523, \"clust\": 318, \"rank\": 341, \"rankvar\": 8, \"…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "net.load_df(df_sig_cat)\n", "net.clip(lower=-5, upper=5)\n", "net.widget()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predict Cell Types using CIBERSORT Signatures" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(188, 2700)\n" ] } ], "source": [ "df_pred_cat, df_sig_sim, y_info = net.predict_cats_from_sigs(df, df_sig, \n", " predict_level='Cell Type', unknown_thresh=0.05)\n", "df.columns = df_pred_cat.columns.tolist()\n", "print(df_pred_cat.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cell Type Similarity" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "df_sig_sim = df_sig_sim.round(2)\n", "net.load_df(df_sig_sim)\n", "set_cat_colors('col', 1, cat_title='Cell Type')\n", "set_cat_colors('row', 1)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e4116d2acc0d4ed88005d3a20356da8c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"B cells naive\", \"ini\": 22, \"clust\": 10, \"rank\": 0, \"rankvar\": …" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_sig_sim.columns = df_pred_cat.columns.tolist()\n", "net.load_df(df_sig_sim)\n", "net.widget()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cells in CIBERSORT GEX Space" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "rows = df_pred_cat.index.tolist()\n", "new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]\n", "df_pred_cat.index = new_rows" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "829a50fb1a704d81bc5df2604a4f4df0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"C5AR1\", \"ini\": 188, \"clust\": 129, \"rank\": 140, \"rankvar\": 33, …" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "net.load_df(df_pred_cat)\n", "net.clip(lower=-5, upper=5)\n", "net.widget()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cells with CIBERSORT Predictions, Top Genes Based on Variance" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "df = df.loc[keep_var]\n", "rows = df.index.tolist()\n", "new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]\n", "df.index = new_rows" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f850d940da3446d0af5bfcfd6b1a4511", "version_major": 2, "version_minor": 0 }, "text/plain": [ "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"FTL\", \"ini\": 250, \"clust\": 236, \"rank\": 245, \"rankvar\": 247, \"…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "net.load_df(df)\n", "net.clip(lower=-5, upper=5)\n", "net.load_df(net.export_df().round(2))\n", "net.widget()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mkdir: ../jsons: File exists\n" ] } ], "source": [ "!mkdir ../jsons\n", "net.save_dict_to_json(net.viz, '../jsons/pbmc_2700.json')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }