{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.1 Generate Overview of Embryos\n", "Working on cell type by developmental stage downsampled view of the data. I'll downsample the data from each embryo into cell types. Each embryo has the same developmental stage." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">> clustergrammer2 backend version 0.4.2\n" ] } ], "source": [ "from clustergrammer2 import net\n", "df = {}" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from glob import glob\n", "import os\n", "from copy import deepcopy" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def add_cats_from_meta(barcodes, df_meta, add_cat_list):\n", " '''\n", " Add categories from df_meta.\n", " '''\n", "\n", " # get metadata of interest (add_cat_list) from barcodes of interest\n", " df_cats = df_meta.loc[barcodes][add_cat_list]\n", "\n", " # get list of cats\n", " list_cat_ini = [list(x) for x in df_cats.values]\n", "\n", " # add titles to cats\n", " list_cat_titles = [ list([str(x) + ': ' + str(y) for x,y in zip(add_cat_list, a)]) for a in list_cat_ini]\n", "\n", " # add barcodes to new columns\n", " new_cols = [tuple([x] + y) for x,y in zip(barcodes, list_cat_titles)]\n", "\n", " return new_cols" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cell Type Distributions" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1386587\n" ] } ], "source": [ "list_cell_types = []\n", "meta_list = []\n", "new_samples = sorted(glob('../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files_binder/*'))\n", "for inst_sample in new_samples:\n", "\n", " df_meta = pd.read_parquet(inst_sample + '/meta_cell.parquet')\n", " meta_list.append(df_meta)\n", " \n", " list_cell_types.extend(list(df_meta['Main_cell_type'].get_values()))\n", " \n", "print(len(list_cell_types))" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "there are 38 cell types\n" ] } ], "source": [ "ser_cell_types = pd.Series(list_cell_types)\n", "all_cell_types = ser_cell_types.value_counts().index.tolist()\n", "\n", "print('there are', len(all_cell_types), 'cell types')\n", "ser_cell_types.value_counts()\n", "ser_pop = ser_cell_types.value_counts()\n", "ser_pop.name = 'Population'" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "ser_pop.to_csv('../data/cao_2million-cell_2019_61-embryo_downsample/cell_type_dist.txt', sep='\\t')" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(38,)" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser_pop.shape" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Chondrocytes & osteoblasts 104698\n", "Connective tissue progenitors 98964\n", "Intermediate Mesoderm 89518\n", "Jaw and tooth progenitors 82289\n", "Early mesenchyme 71949\n", "Excitatory neurons 68567\n", "Epithelial cells 66209\n", "Radial glia 65428\n", "Neural progenitor cells 58332\n", "Postmitotic premature neurons 56033\n", "Oligodendrocyte Progenitors 54606\n", "Isthmic organizer cells 48498\n", "Neural Tube 45985\n", "Inhibitory neurons 44658\n", "Myocytes 43197\n", "Definitive erythroid lineage 34205\n", "Chondroctye progenitors 33539\n", "Inhibitory neuron progenitors 31214\n", "Premature oligodendrocyte 29538\n", "Limb mesenchyme 26559\n", "Sensory neurons 26477\n", "Endothelial cells 26431\n", "Stromal cells 23259\n", "Osteoblasts 23223\n", "Schwann cell precursor 23145\n", "Granule neurons 16131\n", "Notochord cells 15481\n", "Primitive erythroid lineage 15138\n", "Inhibitory interneurons 13533\n", "Hepatocytes 11229\n", "White blood cells 9202\n", "Ependymal cell 8566\n", "Cholinergic neurons 7060\n", "Cardiac muscle lineages 4867\n", "Megakaryocytes 3572\n", "Melanocytes 2827\n", "Lens 1954\n", "Neutrophils 506\n", "Name: Population, dtype: int64" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser_pop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate the Average Expression of Each Cell Type in Each Embryo\n", "Add categories for: cell type, developmental stage, embryo id, and population number." ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "dev_dict = {}\n", "dev_dict['E9.5'] = '1-E9.5'\n", "dev_dict['E10.5'] = '2-E10.5'\n", "dev_dict['E11.5'] = '3-E11.5'\n", "dev_dict['E12.5'] = '4-E12.5'\n", "dev_dict['E13.5'] = '5-E13.5'" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "embryo-1-E9.5 (5000, 15666) (5000, 36)\n", "embryo-10-E11.5 (5000, 32449) (5000, 38)\n", "embryo-11-E12.5 (5000, 10270) (5000, 37)\n", "embryo-12-E12.5 (5000, 27090) (5000, 38)\n", "embryo-13-E12.5 (5000, 12436) (5000, 37)\n", "embryo-14-E12.5 (5000, 27450) (5000, 38)\n", "embryo-15-E13.5 (5000, 23136) (5000, 38)\n", "embryo-16-E13.5 (5000, 13434) (5000, 36)\n", "embryo-17-E13.5 (5000, 17306) (5000, 36)\n", "embryo-19-E9.5 (5000, 4026) (5000, 30)\n", "embryo-20-E9.5 (5000, 2525) (5000, 27)\n", "embryo-21-E9.5 (5000, 11550) (5000, 35)\n", "embryo-22-E9.5 (5000, 5818) (5000, 31)\n", "embryo-24-E10.5 (5000, 28100) (5000, 38)\n", "embryo-25-E10.5 (5000, 14498) (5000, 37)\n", "embryo-26-E10.5 (5000, 24664) (5000, 38)\n", "embryo-27-E11.5 (5000, 42106) (5000, 38)\n", "embryo-28-E11.5 (5000, 37761) (5000, 38)\n", "embryo-29-E11.5 (5000, 33185) (5000, 38)\n", "embryo-3-E9.5 (5000, 8086) (5000, 34)\n", "embryo-31-E12.5 (5000, 24208) (5000, 38)\n", "embryo-33-E12.5 (5000, 57625) (5000, 38)\n", "embryo-34-E12.5 (5000, 39619) (5000, 38)\n", "embryo-35-E13.5 (5000, 17118) (5000, 38)\n", "embryo-36-E13.5 (5000, 22222) (5000, 38)\n", "embryo-37-E13.5 (5000, 21655) (5000, 37)\n", "embryo-38-E13.5 (5000, 22056) (5000, 36)\n", "embryo-39-E9.5 (5000, 7064) (5000, 30)\n", "embryo-4-E10.5 (5000, 12559) (5000, 38)\n", "embryo-40-E9.5 (5000, 7017) (5000, 33)\n", "embryo-41-E9.5 (5000, 3885) (5000, 28)\n", "embryo-42-E9.5 (5000, 8541) (5000, 35)\n", "embryo-43-E10.5 (5000, 19422) (5000, 38)\n", "embryo-44-E10.5 (5000, 26715) (5000, 38)\n", "embryo-46-E10.5 (5000, 30976) (5000, 38)\n", "embryo-47-E11.5 (5000, 37763) (5000, 38)\n", "embryo-48-E11.5 (5000, 43105) (5000, 38)\n", "embryo-49-E11.5 (5000, 36490) (5000, 38)\n", "embryo-5-E10.5 (5000, 21987) (5000, 38)\n", "embryo-50-E11.5 (5000, 37226) (5000, 38)\n", "embryo-51-E12.5 (5000, 18053) (5000, 38)\n", "embryo-52-E12.5 (5000, 23163) (5000, 38)\n", "embryo-53-E13.5 (5000, 16348) (5000, 38)\n", "embryo-55-E9.5 (5000, 4397) (5000, 33)\n", "embryo-56-E9.5 (5000, 7770) (5000, 33)\n", "embryo-57-E9.5 (5000, 10115) (5000, 35)\n", "embryo-58-E9.5 (5000, 8048) (5000, 35)\n", "embryo-59-E10.5 (5000, 25696) (5000, 38)\n", "embryo-6-E10.5 (5000, 27174) (5000, 38)\n", "embryo-60-E10.5 (5000, 33564) (5000, 38)\n", "embryo-61-E11.5 (5000, 36558) (5000, 38)\n", "embryo-62-E11.5 (5000, 33504) (5000, 38)\n", "embryo-63-E9.5 (5000, 10729) (5000, 36)\n", "embryo-64-E12.5 (5000, 44238) (5000, 38)\n", "embryo-65-E13.5 (5000, 19457) (5000, 38)\n", "embryo-66-E13.5 (5000, 38067) (5000, 38)\n", "embryo-67-E13.5 (5000, 17780) (5000, 38)\n", "embryo-68-E13.5 (5000, 27869) (5000, 38)\n", "embryo-7-E11.5 (5000, 35416) (5000, 38)\n", "embryo-8-E11.5 (5000, 32655) (5000, 38)\n", "embryo-9-E11.5 (5000, 27177) (5000, 38)\n", "CPU times: user 8min 23s, sys: 4min 9s, total: 12min 32s\n", "Wall time: 7min 1s\n" ] } ], "source": [ "%%time\n", "df_mean_list = []\n", "for inst_sample_path in new_samples:\n", " df_gex = pd.read_parquet(inst_sample_path + '/gex.parquet')\n", " df_meta = pd.read_parquet(inst_sample_path + '/meta_cell.parquet')\n", " inst_sample = inst_sample_path.split('/')[-1]\n", " \n", " inst_embryo = 'embryo-' + inst_sample.split('-')[1]\n", " inst_dev = inst_sample.split('-')[2]\n", " \n", " new_cols = add_cats_from_meta(df_gex.columns.tolist(), \n", " df_meta, \n", " ['Main_cell_type', 'development_stage'])\n", " \n", " # save number of cells in each cluster\n", " ct_pop = pd.Series([x[1] for x in new_cols]).value_counts()\n", " ct_pop.index = [x.split(': ')[1] for x in ct_pop.index.tolist()]\n", " \n", " df_cat = deepcopy(df_gex)\n", " df_cat.columns = new_cols\n", " \n", " df_mi = net.row_tuple_to_multiindex(df_cat.transpose())\n", " df_mean_ini = df_mi.groupby(level='Main_cell_type').mean().transpose()\n", " rows = df_mean_ini.index.tolist()\n", " cols = [(x + '_' + inst_sample.replace('embryo','e'), \n", " 'Cell Type: ' + x, \n", " 'Dev Stage: ' + dev_dict[inst_dev], \n", " 'Embryo: ' + inst_embryo, \n", " 'Pop: ' + str(ct_pop[x])) for x in df_mean_ini.columns.tolist()]\n", " mat = df_mean_ini.get_values()\n", " df_mean = pd.DataFrame(index=rows, columns=cols, data=mat)\n", " print(inst_sample, df_cat.shape, df_mean.shape)\n", " \n", " df_mean_list.append(df_mean)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5000, 2229)" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_merge = pd.concat(df_mean_list, axis=1)\n", "df_merge.shape" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "df_save = deepcopy(df_merge)\n", "df_save.columns = [str(x) for x in df_save.columns.tolist()]\n", "df_save.to_parquet('../data/cao_2million-cell_2019_61-embryo_downsample/cao_embryo_cell-type_downsample.parquet')" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[('Cardiac muscle lineages_e-1-E9.5',\n", " 'Cell Type: Cardiac muscle lineages',\n", " 'Dev Stage: 1-E9.5',\n", " 'Embryo: embryo-1',\n", " 'Pop: 263'),\n", " ('Cholinergic neurons_e-1-E9.5',\n", " 'Cell Type: Cholinergic neurons',\n", " 'Dev Stage: 1-E9.5',\n", " 'Embryo: embryo-1',\n", " 'Pop: 76'),\n", " ('Chondroctye progenitors_e-1-E9.5',\n", " 'Cell Type: Chondroctye progenitors',\n", " 'Dev Stage: 1-E9.5',\n", " 'Embryo: embryo-1',\n", " 'Pop: 388')]" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_merge.columns.tolist()[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualize Downsample Data" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a6a5c38029c24cf0b497a10b89aa305c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"Gm42418\", \"ini\": 250, \"clust\": 5, \"rank\": 79, \"rankvar\": 87, \"…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "net.load_df(df_merge)\n", "net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')\n", "net.normalize(axis='row', norm_type='zscore')\n", "net.clip(-5,5)\n", "net.widget()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualize Data for Single Cell Type" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "scrolled": false }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b505b2e25a164e44a062ce8001c2d043", "version_major": 2, "version_minor": 0 }, "text/plain": [ "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"mt-Rnr2\", \"ini\": 250, \"clust\": 9, \"rank\": 135, \"rankvar\": 190,…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "net.load_df(df_merge)\n", "net.filter_cat(axis='col', cat_index=1, cat_name='Cell Type: Sensory neurons')\n", "net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')\n", "net.normalize(axis='row', norm_type='zscore')\n", "net.clip(-5,5)\n", "net.widget()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total number of cells represented: 1386587\n" ] } ], "source": [ "cols = df_merge.columns.tolist()\n", "total_cells = 0\n", "for inst_col in cols:\n", " inst_pop = int(inst_col[4].split(': ')[1])\n", " total_cells = total_cells + inst_pop\n", "print('total number of cells represented: ', total_cells)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }