{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 3.0 2,700 PBMC scRNA-seq\n", "Single cell RNA-seq (scRNA-seq) is a powerful method to interrogate gene expression across thousands of single cells. This method provides thousands of measurements (single cells) across thousands of dimensions (genes). This notebook uses Clustergrammer2 to interactively explore an example dataset measuring the gene expression of 2,700 PBMCs obtained from [10X Genomics](https://www.10xgenomics.com/resources/datasets/). Bulk gene expression signatures of cell types from [CIBERSORT](https://cibersort.stanford.edu/) were used to obtain a tentative cell type for each cell." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">> clustergrammer2 backend version 0.15.0 -- hdbscan\n" ] } ], "source": [ "from clustergrammer2 import net\n", "df = {}" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import f1_score\n", "import pandas as pd\n", "import numpy as np\n", "from copy import deepcopy\n", "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline \n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def calc_mean_var_disp(df_inst):\n", " mean_arr = []\n", " var_arr = []\n", " mean_names = []\n", " for inst_gene in df_inst.index.tolist():\n", " mean_arr.append( df_inst.loc[inst_gene].mean() )\n", " var_arr.append(df_inst.loc[inst_gene].var())\n", " mean_names.append(inst_gene)\n", "\n", " ser_mean = pd.Series(data=mean_arr, index=mean_names)\n", " ser_var = pd.Series(data=var_arr, index=mean_names) \n", " return ser_mean, ser_var" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def cell_umi_count(df):\n", " sum_arr = []\n", " sum_names = []\n", " for inst_cell in
df:\n", " sum_arr.append( df[inst_cell].sum() )\n", " sum_names.append(inst_cell)\n", " \n", " ser_sum = pd.Series(data=sum_arr, index=sum_names)\n", " return ser_sum" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load Data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(32738, 2700)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = net.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')\n", "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove Ribosomal and Mitochondrial Genes" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "32738\n", "32546\n", "['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L7', 'MTRNR2L5', 'MTRNR2L8', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']\n" ] } ], "source": [ "all_genes = df.index.tolist()\n", "print(len(all_genes))\n", "keep_genes = [x for x in all_genes if 'RPL' not in x]\n", "keep_genes = [x for x in keep_genes if 'RPS' not in x]\n", "print(len(keep_genes))\n", "\n", "df = df.loc[keep_genes]\n", "df.shape\n", "\n", "# Removing Mitochondrial Genes\n", "list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',\n", " 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']\n", "\n", "all_genes = df.index.tolist()\n", "mito_genes = [x for x in all_genes if 'MT-' == x[:3] or \n", " x.split('_')[0] in list_mito_genes]\n", "print(mito_genes)\n", "\n", "keep_genes = [x for x in all_genes if x not in mito_genes]\n", "df = df.loc[keep_genes]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Keep top 5K Expressing Genes" ] }, { "cell_type": 
"code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5000, 2700)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser_mean, ser_var = calc_mean_var_disp(df)\n", "\n", "num_keep_mean = 5000\n", "num_top_var = 250\n", "\n", "# filter for top expressing genes\n", "keep_mean = ser_mean.sort_values(ascending=False)[:num_keep_mean].index.tolist()\n", "\n", "df = df.loc[keep_mean]\n", "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Find top 250 Variable Genes" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "250" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser_keep_var = ser_var[keep_mean]\n", "# filter for top variance based\n", "keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var]\n", "len(keep_var)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### UMI Normalize GEX Data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(5000, 2700)\n", "AAACATACAACCAC 1.0\n", "AAACATTGAGCTAC 1.0\n", "AAACATTGATCAGC 1.0\n", "AAACCGTGCTTCCG 1.0\n", "AAACCGTGTATGCG 1.0\n", "dtype: float64\n", "CPU times: user 967 ms, sys: 247 ms, total: 1.21 s\n", "Wall time: 1.03 s\n" ] } ], "source": [ "%%time\n", "ser_sum = cell_umi_count(df)\n", "df = df.div(ser_sum)\n", "print(df.shape)\n", "print(df.sum().head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Find top expressing genes " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "ser_keep_var = ser_var[keep_mean]\n", "# filter for top variance based\n", "keep_var = ser_keep_var.sort_values(ascending=False).index.tolist()[:num_top_var]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ArcSinh Transform and Z-score GEX Data" 
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(5000, 2700)\n" ] } ], "source": [ "# ArcSinh transform\n", "df = np.arcsinh(df/5)\n", "\n", "# Z-score genes\n", "net.load_df(df)\n", "net.normalize(axis='row', norm_type='zscore')\n", "\n", "# round to two decimal points\n", "df = net.export_df().round(2)\n", "\n", "print(df.shape)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# df.columns = [(x, 'Cell Type: Unknown') for x in df.columns.tolist()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Unlabeled Cells " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "41dfe261048243d5abb818d0d5c6dbd5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "CGM2(network='{\"row_nodes\": [{\"name\": \"FTL\", \"ini\": 250, \"clust\": 236, \"rank\": 245, \"rankvar\": 247}, {\"name\": …" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "net.load_df(df.loc[keep_var])\n", "net.clip(lower=-5, upper=5)\n", "# net.manual_category(col='Cell Type')\n", "net.widget()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# man_cat = net.get_manual_category('col', 'Cell Type')\n", "# man_cat['Cell Type'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load CIBERSORT gene signatures" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "ename": "IndexError", "evalue": "list index out of range", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m
\u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'row'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnorm_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'zscore'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mdf_sig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexport_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_sig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/clustergrammer2/clustergrammer2/clustergrammer_fun/__init__.py\u001b[0m in \u001b[0;36mload_file\u001b[0;34m(self, filename)\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0mLoad\u001b[0m \u001b[0mTSV\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 54\u001b[0m '''\n\u001b[0;32m---> 55\u001b[0;31m \u001b[0mload_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_file_as_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_string\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfilename\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/clustergrammer2/clustergrammer2/clustergrammer_fun/load_data.py\u001b[0m in \u001b[0;36mload_file\u001b[0;34m(net, filename)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0mload_file_as_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_file_as_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m''\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/clustergrammer2/clustergrammer2/clustergrammer_fun/load_data.py\u001b[0m in \u001b[0;36mload_file_as_string\u001b[0;34m(net, file_string, filename)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0mfilename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 43\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_tsv_to_net\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuff\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 44\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_stdin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/clustergrammer2/clustergrammer2/clustergrammer_fun/__init__.py\u001b[0m in \u001b[0;36mload_tsv_to_net\u001b[0;34m(self, file_buffer, filename)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0mbe\u001b[0m \u001b[0mpossible\u001b[0m \u001b[0mto\u001b[0m \u001b[0mload\u001b[0m \u001b[0mdata\u001b[0m \u001b[0mwithout\u001b[0m \u001b[0mhaving\u001b[0m \u001b[0mto\u001b[0m \u001b[0mread\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0ma\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m '''\n\u001b[0;32m---> 75\u001b[0;31m \u001b[0mload_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_tsv_to_net\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 76\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_vect_post_to_net\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvect_post\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/clustergrammer2/clustergrammer2/clustergrammer_fun/load_data.py\u001b[0m in \u001b[0;36mload_tsv_to_net\u001b[0;34m(net, file_buffer, filename)\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproc_df_labels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 71\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdf_to_dat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 72\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'filename'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/clustergrammer2/clustergrammer2/clustergrammer_fun/__init__.py\u001b[0m in \u001b[0;36mdf_to_dat\u001b[0;34m(self, df, define_cat_colors)\u001b[0m\n\u001b[1;32m 246\u001b[0m \u001b[0mLoad\u001b[0m \u001b[0mPandas\u001b[0m \u001b[0mDataFrame\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mwill\u001b[0m \u001b[0mbe\u001b[0m \u001b[0mdeprecated\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m '''\n\u001b[0;32m--> 248\u001b[0;31m \u001b[0mdata_formats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdf_to_dat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefine_cat_colors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 249\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mset_matrix_colors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'red'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneg\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'blue'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/clustergrammer2/clustergrammer2/clustergrammer_fun/data_formats.py\u001b[0m in \u001b[0;36mdf_to_dat\u001b[0;34m(net, df, 
define_cat_colors)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'node_info'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcat_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcat_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m \u001b[0mcategories\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdict_cat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefine_cat_colors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdefine_cat_colors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 117\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdat_to_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/clustergrammer2/clustergrammer2/clustergrammer_fun/categories.py\u001b[0m in \u001b[0;36mdict_cat\u001b[0;34m(net, define_cat_colors)\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0minst_full_name\u001b[0m \u001b[0;32min\u001b[0m \u001b[0minst_dict\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 133\u001b[0;31m \u001b[0minst_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minst_full_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m': '\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 134\u001b[0m \u001b[0minst_color\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0minst_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0minst_full_name\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIndexError\u001b[0m: list index out of range" ] } ], "source": [ "net.load_file('../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt')\n", "net.normalize(axis='row', norm_type='zscore')\n", "df_sig = net.export_df()\n", "print(df_sig.shape)\n", "\n", "rows = df_sig.index.tolist()\n", "new_rows = [x.split('_')[0] for x in rows]\n", "df_sig.index = new_rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ct_color = {}\n", "ct_color['T cells CD8'] = 'red'\n", "ct_color['T cells CD4 naive'] = 'blue'\n", "ct_color['T cells CD4 memory activated'] = 'blue'\n", "ct_color['T cells CD4 memory resting'] = '#87cefa' # sky blue\n", "ct_color['B cells naive'] = 'purple'\n", "ct_color['B cells memory'] = '#DA70D6' # orchid\n", "ct_color['NK cells activated'] = 'yellow'\n", "ct_color['NK cells resting'] = '#FCD116' # sign yellow\n", "ct_color['Monocytes'] = '#98ff98' # mint green\n", "ct_color['Macrophages M0'] = '#D3D3D3' # light grey\n", "ct_color['Macrophages M1'] = '#C0C0C0' # silver\n", "ct_color['Macrophages M2'] = '#A9A9A9' # dark grey\n", "ct_color['N.A.'] = 'white'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def set_cat_colors(axis, cat_index, cat_title=False):\n", " for inst_ct in ct_color:\n", " if cat_title != False:\n", " cat_name = cat_title + ': ' + inst_ct\n", " else:\n", " cat_name = inst_ct\n", " \n", " inst_color = ct_color[inst_ct]\n", " net.set_cat_color(axis=axis, cat_index=cat_index, cat_name=cat_name, inst_color=inst_color)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "set_cat_colors('col', 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"gene_sig = df_sig.idxmax(axis=1)\n", "gs_dict = {}\n", "for inst_gene in gene_sig.index.tolist():\n", " gs_dict[inst_gene] = gene_sig[inst_gene][0]\n", "df_sig_cat = deepcopy(df_sig)\n", "rows = df_sig_cat.index.tolist()\n", "new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]\n", "df_sig_cat.index = new_rows\n", "\n", "net.load_df(df_sig_cat)\n", "set_cat_colors('row', 1, 'Cell Type')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "net.load_df(df_sig_cat)\n", "net.clip(lower=-5, upper=5)\n", "net.widget()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predict Cell Types using CIBERSORT Signatures" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_pred_cat, df_sig_sim, y_info = net.predict_cats_from_sigs(df, df_sig, \n", " predict_level='Cell Type', unknown_thresh=0.05)\n", "df.columns = df_pred_cat.columns.tolist()\n", "print(df_pred_cat.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cell Type Similarity" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_sig_sim = df_sig_sim.round(2)\n", "net.load_df(df_sig_sim)\n", "set_cat_colors('col', 1, cat_title='Cell Type')\n", "set_cat_colors('row', 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_sig_sim.columns = df_pred_cat.columns.tolist()\n", "net.load_df(df_sig_sim)\n", "net.widget()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cells in CIBERSORT GEX Space" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rows = df_pred_cat.index.tolist()\n", "new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]\n", "df_pred_cat.index = new_rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"net.load_df(df_pred_cat)\n", "net.clip(lower=-5, upper=5)\n", "net.widget()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cells with CIBERSORT Predictions, Top Genes Based on Variance" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.loc[keep_var]\n", "rows = df.index.tolist()\n", "new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]\n", "df.index = new_rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "net.load_df(df)\n", "net.clip(lower=-5, upper=5)\n", "net.load_df(net.export_df().round(2))\n", "net.widget()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!mkdir ../jsons\n", "net.save_dict_to_json(net.viz, '../jsons/pbmc_2700.json')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }