{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 3.2 Human GEX Data" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "clustergrammer2 backend version 0.2.9\n" ] } ], "source": [
 "# All imports live in this one cell so a fresh kernel has a single place to look.\n",
 "import itertools as it\n",
 "from copy import deepcopy\n",
 "\n",
 "import matplotlib.pyplot as plt\n",
 "import numpy as np\n",
 "import pandas as pd\n",
 "from scipy.spatial.distance import pdist\n",
 "\n",
 "from clustergrammer2 import net\n",
 "\n",
 "# Registry of the named data sets built in this notebook (key -> DataFrame).\n",
 "df = {}" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(20400, 7339)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the gzip-compressed CSV; its first column becomes the row index.\n",
 "filename = '../data/CITE-seq_data/GSE100866_CBMC_8K_13AB_10X-RNA_umi_HUMAN.csv.gz'\n",
 "df['gex-ini'] = pd.read_csv(filename, compression='gzip', index_col=0)\n",
 "df['gex-ini'].shape" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10, 7265)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the ADT matrix through clustergrammer2 and export it into the registry.\n",
 "net.load_file('../data/CITE-seq_data/adt_ashz_trim_cats.txt')\n",
 "df['adt-ini'] = net.export_df()\n",
 "df['adt-ini'].shape" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7265\n" ] } ], "source": [
 "# ADT column labels are tuples; their first element is the label used below to\n",
 "# select the matching GEX columns (presumably the cell barcode - confirm).\n",
 "cols = df['adt-ini'].columns.tolist()\n",
 "keep_cells = [x[0] for x in cols]\n",
 "print(len(keep_cells))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Remove HUMAN prefix from genes" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "rows = df['gex-ini'].index.tolist()\n", "new_rows = [x.replace('HUMAN_','') for x in 
rows]\n", "df['gex-ini'].index = new_rows" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Filter for trimmed cells only and arcsinh transform" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
 "# Keep only the columns listed in keep_cells, then apply arcsinh(x / 5).\n",
 "df['gex-trim'] = np.arcsinh(df['gex-ini'][keep_cells] / 5)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Drop ribosomal and mitochondrial genes" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(20400, 7265)\n", "20400\n", "20223\n", "['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB', 'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5', 'MT-ND6', 'MT-RNR1', 'MT-RNR2', 'MT-TD', 'MT-TF', 'MT-TG', 'MT-TH', 'MT-TI', 'MT-TL1', 'MT-TP', 'MT-TT', 'MT-TW', 'MTRF1', 'MTRF1L', 'MTRNR2L1', 'MTRNR2L10', 'MTRNR2L11', 'MTRNR2L12', 'MTRNR2L3', 'MTRNR2L4', 'MTRNR2L5', 'MTRNR2L6', 'MTRNR2L7', 'MTRNR2L8']\n", "(20187, 7265)\n" ] } ], "source": [
 "print(df['gex-trim'].shape)\n",
 "all_genes = df['gex-trim'].index.tolist()\n",
 "print(len(all_genes))\n",
 "\n",
 "# Drop ribosomal protein genes by substring match on the gene symbol.\n",
 "# NOTE(review): this also drops any other symbol that merely contains 'RPL' or\n",
 "# 'RPS' (e.g. MRPL*/MRPS* or RPS6KA*, if present) - confirm this is intended.\n",
 "keep_genes = [x for x in all_genes if 'RPL' not in x and 'RPS' not in x]\n",
 "print(len(keep_genes))\n",
 "\n",
 "# .loc with a list of labels returns a new frame, so 'gex-trim' stays untouched\n",
 "# without deep-copying the full matrix first.\n",
 "df['gex'] = df['gex-trim'].loc[keep_genes]\n",
 "\n",
 "# Removing Mitochondrial Genes: symbols starting with 'MT-' plus the explicit list below\n",
 "list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',\n",
 "                   'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']\n",
 "\n",
 "all_genes = df['gex'].index.tolist()\n",
 "mito_genes = [x for x in all_genes\n",
 "              if x.startswith('MT-') or x.split('_')[0] in list_mito_genes]\n",
 "\n",
 "print(mito_genes)\n",
 "\n",
 "# Set membership keeps this filter linear in the number of genes.\n",
 "mito_set = set(mito_genes)\n",
 "keep_genes = [x for x in all_genes if x not in mito_set]\n",
 "df['gex'] = df['gex'].loc[keep_genes]\n",
 "print(df['gex'].shape)\n",
 "\n",
 "# transfer categories\n", 
"cols = df['adt-ini'].columns.tolist()\n", "ct_dict = {}\n", "for inst_col in cols:\n", " ct_dict[inst_col[0]] = inst_col[1]\n", " \n", "cols = df['gex'].columns.tolist()\n", "new_cols = [(x, 'Cell Type: ' + ct_dict[x]) for x in cols]\n", "df['gex'].columns = new_cols \n", "\n", "# normalize by UMI count\n", "barcode_umi_sum = df['gex'].sum()\n", "df['gex-umi'] = deepcopy(df['gex'].div(barcode_umi_sum))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(TTGGAACCATGTCTCC, Cell Type: Unknown_5) 1.0\n", "(AACTCTTCAACTGGCC, Cell Type: Unknown_5) 1.0\n", "(ACAGCTAAGCGATGAC, Cell Type: CD4 T cell) 1.0\n", "(CATCGAAGTTCCACGG, Cell Type: Unknown_3) 1.0\n", "(CCGGGATGTAACGACG, Cell Type: Unknown_3) 1.0\n", "(GCTTGAAGTACCGTAT, Cell Type: CD4 T cell) 1.0\n", "(GGCGACTGTCAGAATA, Cell Type: Unknown_2) 1.0\n", "(CGGACGTGTCGAACAG, Cell Type: CD14+ Mono cell_1) 1.0\n", "(CTCGTACAGCGGATCA, Cell Type: CD14+ Mono cell_1) 1.0\n", "(TCGGTAACATGTTGAC, Cell Type: Unknown_5) 1.0\n", "(TCTTTCCTCATTCACT, Cell Type: CD8 T cell) 1.0\n", "(CTAGTGAAGCCTATGT, Cell Type: CD4 T cell) 1.0\n", "(ACATCAGCAATGGACG, Cell Type: Unknown_5) 1.0\n", "(GGCCGATTCACAGTAC, Cell Type: Unknown_3) 1.0\n", "(TACAGTGAGAACAACT, Cell Type: CD4 T cell) 1.0\n", "(CTACATTTCAGTCAGT, Cell Type: CD14+ Mono cell_2) 1.0\n", "(CTGCCTAGTGTTGGGA, Cell Type: CD4 T cell) 1.0\n", "(ATTTCTGCATGCAACT, Cell Type: CD4 T cell) 1.0\n", "(TTTATGCAGGAGTTGC, Cell Type: Unknown_3) 1.0\n", "(CTCGTCACACATAACC, Cell Type: CD16+ Mono cell) 1.0\n", "(AGGTCATCAGTGACAG, Cell Type: Unknown_3) 1.0\n", "(GATGAAAAGCCCAGCT, Cell Type: Unknown_2) 1.0\n", "(TAAGTGCCAACACCTA, Cell Type: Unknown_1) 1.0\n", "(GTCGGGTTCTTCCTTC, Cell Type: CD34+ cell) 1.0\n", "(GATCGATAGACAGAGA, Cell Type: CD4 T cell) 1.0\n", "(GCTGCAGAGCTAGCCC, Cell Type: Unknown_2) 1.0\n", "(TCGCGTTTCGTAGATC, Cell Type: CD14+ Mono cell_2) 1.0\n", "(CTTTGCGCAATCTGCA, Cell Type: CD14+ Mono cell_1) 1.0\n", "(CCGTTCATCCAAACTG, Cell 
Type: CD4 T cell) 1.0\n", "(ACGGCCAGTGGTGTAG, Cell Type: Unknown_5) 1.0\n", " ... \n", "(AGCGTCGTCCCAAGTA, Cell Type: NK cell) 1.0\n", "(GACGCGTGTAGAAAGG, Cell Type: Unknown_1) 1.0\n", "(CGTGTAAGTGGCTCCA, Cell Type: pDC_1) 1.0\n", "(TGCACCTGTTCGTTGA, Cell Type: NK cell) 1.0\n", "(CGTGTAACACACATGT, Cell Type: NK cell) 1.0\n", "(CGTCAGGAGTGGTCCC, Cell Type: CD14+ Mono cell_1) 1.0\n", "(TCTCTAATCGCCTGAG, Cell Type: NK cell) 1.0\n", "(CTGGTCTGTAAAGGAG, Cell Type: NK cell) 1.0\n", "(AGGGTGAGTGATGCCC, Cell Type: CD4 T cell) 1.0\n", "(GTCGGGTAGTTCGCAT, Cell Type: CD14+ Mono cell_2) 1.0\n", "(TTGCGTCGTGACTCAT, Cell Type: CD4 T cell) 1.0\n", "(GTTACAGAGCGTCTAT, Cell Type: CD14+ Mono cell_2) 1.0\n", "(CGTGTAACAGGAATCG, Cell Type: NK cell) 1.0\n", "(TTCGGTCCACTTACGA, Cell Type: NK cell) 1.0\n", "(AGTCTTTAGCCAACAG, Cell Type: pDC_1) 1.0\n", "(GGTGCGTAGCGATGAC, Cell Type: NK cell) 1.0\n", "(GTCGGGTAGTACGACG, Cell Type: CD4 T cell) 1.0\n", "(GGCGTGTAGGATTCGG, Cell Type: Unknown_1) 1.0\n", "(GTCGGGTAGTGAATTG, Cell Type: CD4 T cell) 1.0\n", "(GCGACCAGTCACTTCC, Cell Type: pDC_1) 1.0\n", "(GCATACAAGCTGAACG, Cell Type: CD14+ Mono cell_2) 1.0\n", "(CGATCGGAGCCGTCGT, Cell Type: CD14+ Mono cell_2) 1.0\n", "(ATCTGCCTCTGACCTC, Cell Type: Unknown_1) 1.0\n", "(CTGAAGTAGGGATCTG, Cell Type: NK cell) 1.0\n", "(GGCGTGTAGAGTGAGA, Cell Type: CD14+ Mono cell_2) 1.0\n", "(AGCGTCGAGTCAAGGC, Cell Type: Unknown_1) 1.0\n", "(AGCGTCGAGTTACGGG, Cell Type: CD4 T cell) 1.0\n", "(TCGCGAGGTAGCCTAT, Cell Type: Unknown_1) 1.0\n", "(GTCGGGTAGTAGCCGA, Cell Type: Unknown_1) 1.0\n", "(TTGCCGTGTAGATTAG, Cell Type: Unknown_1) 1.0\n", "Length: 7265, dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sanity check: after the per-cell scaling every column of 'gex-umi' should sum to 1.0.\n", "df['gex-umi'].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Keep top 5K genes by sum" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5000, 7265)" ] }, "execution_count": 11, 
"metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Rank genes by their total across all cells and keep the 5000 largest.\n",
 "ser_sum = df['gex'].sum(axis=1).sort_values(ascending=False)\n",
 "keep_genes = ser_sum.index.tolist()[:5000]\n",
 "df['gex-filt'] = df['gex'].loc[keep_genes]\n",
 "df['gex-filt'].shape" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5000, 7265)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Same selection on the per-cell scaled matrix (ranked by its own row totals).\n",
 "ser_sum = df['gex-umi'].sum(axis=1).sort_values(ascending=False)\n",
 "keep_genes = ser_sum.index.tolist()[:5000]\n",
 "df['gex-umi-filt'] = df['gex-umi'].loc[keep_genes]\n",
 "df['gex-umi-filt'].shape" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(10, 7265)\n", "(5000, 7265)\n", "(5000, 7265)\n" ] } ], "source": [ "print(df['adt-ini'].shape)\n", "print(df['gex-filt'].shape)\n", "print(df['gex-umi-filt'].shape)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Compare Sample-Sample Similarity Across Datasets" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
 "def corr_datasets(name_1, name_2):\n",
 "    \"\"\"Print how similarly two data sets arrange the same samples.\n",
 "\n",
 "    For each data set in the global `df` registry the condensed cosine\n",
 "    distance between every pair of columns is computed; the printed value is\n",
 "    the Pearson correlation between the two resulting distance vectors.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    name_1, name_2 : str\n",
 "        Keys into `df`. Columns are paired by position (not by label), so both\n",
 "        frames are expected to hold the same samples in the same order.\n",
 "\n",
 "    Raises\n",
 "    ------\n",
 "    ValueError\n",
 "        If the two data sets have a different number of columns.\n",
 "    \"\"\"\n",
 "    n_cols_1 = df[name_1].shape[1]\n",
 "    n_cols_2 = df[name_2].shape[1]\n",
 "    if n_cols_1 != n_cols_2:\n",
 "        # Fail early with a clear message: distance vectors of different length\n",
 "        # cannot be compared element by element.\n",
 "        raise ValueError('{} has {} columns but {} has {}'.format(\n",
 "            name_1, n_cols_1, name_2, n_cols_2))\n",
 "\n",
 "    dist_arr_1 = pdist(df[name_1].transpose(), metric='cosine')\n",
 "    dist_arr_2 = pdist(df[name_2].transpose(), metric='cosine')\n",
 "\n",
 "    # 1 - correlation distance is the Pearson correlation of the two vectors.\n",
 "    inst_corr = 1 - pdist(np.vstack([dist_arr_1, dist_arr_2]), metric='correlation')\n",
 "    print(name_1, 'vs', name_2, inst_corr[0])" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### ADT vs GEX" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "adt-ini vs gex-filt 0.670118808363\n" ] } ], "source": [ "corr_datasets('adt-ini', 'gex-filt')" ] },
{ "cell_type": "markdown", 
"metadata": {}, "source": [ "#### ADT vs GEX (UMI-normalized)" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "adt-ini vs gex-umi-filt 0.669929648801\n" ] } ], "source": [ "corr_datasets('adt-ini', 'gex-umi-filt')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "##### Make Z-scored versions of the data" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def zscore_rows(name, n_top=None):\n",
 "    \"\"\"Return df[name] after row-wise z-scoring through the shared `net` object.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    name : str\n",
 "        Key of the data set in the global `df` registry.\n",
 "    n_top : int or None\n",
 "        When given, rows are first reduced with\n",
 "        net.filter_N_top(inst_rc='row', N_top=n_top, rank_type='var');\n",
 "        when None, all rows are kept.\n",
 "\n",
 "    Note: this replaces whatever matrix `net` currently holds.\n",
 "    \"\"\"\n",
 "    net.load_df(df[name])\n",
 "    if n_top is not None:\n",
 "        net.filter_N_top(inst_rc='row', N_top=n_top, rank_type='var')\n",
 "    net.normalize(axis='row', norm_type='zscore')\n",
 "    return net.export_df()" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
 "# The ADT data is used as loaded: 'adt-z' is only an alias of 'adt-ini' and is NOT\n",
 "# z-scored (use zscore_rows('adt-ini') here if a z-scored version is wanted).\n",
 "df['adt-z'] = df['adt-ini']\n",
 "\n",
 "# Z-scored 5K gex, plain and UMI-normalized\n",
 "df['gex-5K-z'] = zscore_rows('gex-filt')\n",
 "df['gex-5K-umi-z'] = zscore_rows('gex-umi-filt')\n",
 "\n",
 "# Z-scored subsets of the 5K genes: N_top of 1000 ('1K'), 100 ('1H') and 50 rows,\n",
 "# each for the plain and the UMI-normalized matrix\n",
 "for n_top, tag in [(1000, '1K'), (100, '1H'), (50, '50')]:\n",
 "    df['gex-5K-' + tag + '-z'] = zscore_rows('gex-filt', n_top)\n",
 "    df['gex-5K-' + tag + '-umi-z'] = zscore_rows('gex-umi-filt', n_top)" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "adt-z vs gex-5K-z 0.577517466014\n", "adt-z vs gex-5K-umi-z 0.661076306163\n" ] } ], "source": [ "corr_datasets('adt-z', 'gex-5K-z')\n", "corr_datasets('adt-z', 'gex-5K-umi-z')" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "adt-z vs gex-5K-1K-z 0.630417530262\n", "adt-z vs gex-5K-1K-umi-z 0.706515334859\n" ] } ], "source": [ "corr_datasets('adt-z', 'gex-5K-1K-z')\n", "corr_datasets('adt-z', 'gex-5K-1K-umi-z')" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "adt-z vs gex-5K-1H-z 0.685659326402\n", "adt-z vs gex-5K-1H-umi-z 0.729042130795\n" ] } ], "source": [ "corr_datasets('adt-z', 'gex-5K-1H-z')\n", "corr_datasets('adt-z', 'gex-5K-1H-umi-z')" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "adt-z vs gex-5K-50-z 0.680044573427\n", "adt-z vs gex-5K-50-umi-z 0.72393620348\n" ] } ], "source": [ "corr_datasets('adt-z', 'gex-5K-50-z')\n", "corr_datasets('adt-z', 'gex-5K-50-umi-z')" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "(50, 7265)" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['gex-5K-50-umi-z'].shape" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bc753af0a0e24c698d7b164d7f32afd1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"S100A8\", \"ini\": 50, \"clust\": 22, \"rank\": 20, \"rankvar\": 4, \"gr…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "net.load_df(df['gex-5K-50-umi-z'])\n", "net.widget()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Cluster CD14+ monocytes and NK cells" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# The filtered UMI-normalized matrix is stored under 'gex-umi-filt'.\n",
 "df['gex-umi-filt'].shape" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Keep only columns whose category label contains 'CD14+ Mono', then view the\n",
 "# z-scored result of filter_N_top with N_top=100 on those columns.\n",
 "cols = df['gex-umi-filt'].columns.tolist()\n",
 "keep_cols = [x for x in cols if 'CD14+ Mono' in x[1]]\n",
 "print(len(keep_cols))\n",
 "net.load_df(df['gex-umi-filt'][keep_cols])\n",
 "net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')\n",
 "net.normalize(axis='row', norm_type='zscore')\n",
 "net.widget()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Restrict to the 'Cell Type: NK cell' category before filtering and z-scoring.\n",
 "net.load_df(df['gex-umi-filt'])\n",
 "net.filter_cat(axis='col', cat_index=1, cat_name='Cell Type: NK cell')\n",
 "net.filter_N_top(inst_rc='row', N_top=50, rank_type='var')\n",
 "net.normalize(axis='row', norm_type='zscore')\n",
 "net.widget()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "net.load_df(df['adt-ini'])\n",
 "# net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')\n",
 "# net.normalize(axis='row', norm_type='zscore')\n",
 "net.widget()" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }