{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5.1 Generate Overview of Embryos\n",
    "This notebook builds a downsampled, cell-type-by-developmental-stage view of the data. The cells from each embryo are downsampled into cell types by averaging expression within each cell type. All cells from a given embryo share the same developmental stage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">> clustergrammer2 backend version 0.4.2\n"
     ]
    }
   ],
   "source": [
    "from clustergrammer2 import net\n",
    "df = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from glob import glob\n",
    "import os\n",
    "from copy import deepcopy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_cats_from_meta(barcodes, df_meta, add_cat_list):\n",
    "    '''\n",
    "    Build annotated column tuples for a list of barcodes.\n",
    "\n",
    "    Looks up each barcode in df_meta (indexed by barcode), keeps the columns\n",
    "    named in add_cat_list, and returns one tuple per barcode of the form\n",
    "    (barcode, 'title: value', ...), with one entry per requested category.\n",
    "    '''\n",
    "    # metadata rows for the requested barcodes, limited to the requested categories\n",
    "    meta_subset = df_meta.loc[barcodes][add_cat_list]\n",
    "\n",
    "    annotated_cols = []\n",
    "    for barcode, cat_values in zip(barcodes, meta_subset.values):\n",
    "        # prefix every value with the title of the category it belongs to\n",
    "        labels = [str(title) + ': ' + str(value)\n",
    "                  for title, value in zip(add_cat_list, cat_values)]\n",
    "        annotated_cols.append(tuple([barcode] + labels))\n",
    "\n",
    "    return annotated_cols"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cell Type Distributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1386587\n"
     ]
    }
   ],
   "source": [
    "# Collect the cell type of every cell across all embryo samples\n",
    "list_cell_types = []\n",
    "meta_list = []\n",
    "new_samples = sorted(glob('../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files_binder/*'))\n",
    "for inst_sample in new_samples:\n",
    "\n",
    "    df_meta = pd.read_parquet(os.path.join(inst_sample, 'meta_cell.parquet'))\n",
    "    meta_list.append(df_meta)\n",
    "\n",
    "    # .tolist() replaces Series.get_values(), which was removed in pandas 1.0\n",
    "    list_cell_types.extend(df_meta['Main_cell_type'].tolist())\n",
    "\n",
    "print(len(list_cell_types))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "there are 38 cell types\n"
     ]
    }
   ],
   "source": [
    "ser_cell_types = pd.Series(list_cell_types)\n",
    "\n",
    "# count the cells of each type once and reuse the result\n",
    "# (value_counts orders the types by descending population)\n",
    "ser_pop = ser_cell_types.value_counts()\n",
    "ser_pop.name = 'Population'\n",
    "all_cell_types = ser_pop.index.tolist()\n",
    "\n",
    "print('there are', len(all_cell_types), 'cell types')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the per-cell-type counts as a tab-separated file\n",
    "# NOTE(review): the default of Series.to_csv's header argument differs between pandas versions - confirm what readers of this file expect\n",
    "ser_pop.to_csv('../data/cao_2million-cell_2019_61-embryo_downsample/cell_type_dist.txt', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(38,)"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sanity check: ser_pop holds one entry per cell type\n",
    "ser_pop.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Chondrocytes & osteoblasts       104698\n",
       "Connective tissue progenitors     98964\n",
       "Intermediate Mesoderm             89518\n",
       "Jaw and tooth progenitors         82289\n",
       "Early mesenchyme                  71949\n",
       "Excitatory neurons                68567\n",
       "Epithelial cells                  66209\n",
       "Radial glia                       65428\n",
       "Neural progenitor cells           58332\n",
       "Postmitotic premature neurons     56033\n",
       "Oligodendrocyte Progenitors       54606\n",
       "Isthmic organizer cells           48498\n",
       "Neural Tube                       45985\n",
       "Inhibitory neurons                44658\n",
       "Myocytes                          43197\n",
       "Definitive erythroid lineage      34205\n",
       "Chondroctye progenitors           33539\n",
       "Inhibitory neuron progenitors     31214\n",
       "Premature oligodendrocyte         29538\n",
       "Limb mesenchyme                   26559\n",
       "Sensory neurons                   26477\n",
       "Endothelial cells                 26431\n",
       "Stromal cells                     23259\n",
       "Osteoblasts                       23223\n",
       "Schwann cell precursor            23145\n",
       "Granule neurons                   16131\n",
       "Notochord cells                   15481\n",
       "Primitive erythroid lineage       15138\n",
       "Inhibitory interneurons           13533\n",
       "Hepatocytes                       11229\n",
       "White blood cells                  9202\n",
       "Ependymal cell                     8566\n",
       "Cholinergic neurons                7060\n",
       "Cardiac muscle lineages            4867\n",
       "Megakaryocytes                     3572\n",
       "Melanocytes                        2827\n",
       "Lens                               1954\n",
       "Neutrophils                         506\n",
       "Name: Population, dtype: int64"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of cells of each type across all embryos, most populous first\n",
    "ser_pop"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate the Average Expression of Each Cell Type in Each Embryo\n",
    "Add categories for: cell type, developmental stage, embryo id, and population number."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each developmental stage to a label prefixed with its 1-based position\n",
    "# in the list below, e.g. 'E9.5' -> '1-E9.5'.\n",
    "dev_stages = ['E9.5', 'E10.5', 'E11.5', 'E12.5', 'E13.5']\n",
    "dev_dict = {stage: str(rank) + '-' + stage\n",
    "            for rank, stage in enumerate(dev_stages, start=1)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "embryo-1-E9.5 (5000, 15666) (5000, 36)\n",
      "embryo-10-E11.5 (5000, 32449) (5000, 38)\n",
      "embryo-11-E12.5 (5000, 10270) (5000, 37)\n",
      "embryo-12-E12.5 (5000, 27090) (5000, 38)\n",
      "embryo-13-E12.5 (5000, 12436) (5000, 37)\n",
      "embryo-14-E12.5 (5000, 27450) (5000, 38)\n",
      "embryo-15-E13.5 (5000, 23136) (5000, 38)\n",
      "embryo-16-E13.5 (5000, 13434) (5000, 36)\n",
      "embryo-17-E13.5 (5000, 17306) (5000, 36)\n",
      "embryo-19-E9.5 (5000, 4026) (5000, 30)\n",
      "embryo-20-E9.5 (5000, 2525) (5000, 27)\n",
      "embryo-21-E9.5 (5000, 11550) (5000, 35)\n",
      "embryo-22-E9.5 (5000, 5818) (5000, 31)\n",
      "embryo-24-E10.5 (5000, 28100) (5000, 38)\n",
      "embryo-25-E10.5 (5000, 14498) (5000, 37)\n",
      "embryo-26-E10.5 (5000, 24664) (5000, 38)\n",
      "embryo-27-E11.5 (5000, 42106) (5000, 38)\n",
      "embryo-28-E11.5 (5000, 37761) (5000, 38)\n",
      "embryo-29-E11.5 (5000, 33185) (5000, 38)\n",
      "embryo-3-E9.5 (5000, 8086) (5000, 34)\n",
      "embryo-31-E12.5 (5000, 24208) (5000, 38)\n",
      "embryo-33-E12.5 (5000, 57625) (5000, 38)\n",
      "embryo-34-E12.5 (5000, 39619) (5000, 38)\n",
      "embryo-35-E13.5 (5000, 17118) (5000, 38)\n",
      "embryo-36-E13.5 (5000, 22222) (5000, 38)\n",
      "embryo-37-E13.5 (5000, 21655) (5000, 37)\n",
      "embryo-38-E13.5 (5000, 22056) (5000, 36)\n",
      "embryo-39-E9.5 (5000, 7064) (5000, 30)\n",
      "embryo-4-E10.5 (5000, 12559) (5000, 38)\n",
      "embryo-40-E9.5 (5000, 7017) (5000, 33)\n",
      "embryo-41-E9.5 (5000, 3885) (5000, 28)\n",
      "embryo-42-E9.5 (5000, 8541) (5000, 35)\n",
      "embryo-43-E10.5 (5000, 19422) (5000, 38)\n",
      "embryo-44-E10.5 (5000, 26715) (5000, 38)\n",
      "embryo-46-E10.5 (5000, 30976) (5000, 38)\n",
      "embryo-47-E11.5 (5000, 37763) (5000, 38)\n",
      "embryo-48-E11.5 (5000, 43105) (5000, 38)\n",
      "embryo-49-E11.5 (5000, 36490) (5000, 38)\n",
      "embryo-5-E10.5 (5000, 21987) (5000, 38)\n",
      "embryo-50-E11.5 (5000, 37226) (5000, 38)\n",
      "embryo-51-E12.5 (5000, 18053) (5000, 38)\n",
      "embryo-52-E12.5 (5000, 23163) (5000, 38)\n",
      "embryo-53-E13.5 (5000, 16348) (5000, 38)\n",
      "embryo-55-E9.5 (5000, 4397) (5000, 33)\n",
      "embryo-56-E9.5 (5000, 7770) (5000, 33)\n",
      "embryo-57-E9.5 (5000, 10115) (5000, 35)\n",
      "embryo-58-E9.5 (5000, 8048) (5000, 35)\n",
      "embryo-59-E10.5 (5000, 25696) (5000, 38)\n",
      "embryo-6-E10.5 (5000, 27174) (5000, 38)\n",
      "embryo-60-E10.5 (5000, 33564) (5000, 38)\n",
      "embryo-61-E11.5 (5000, 36558) (5000, 38)\n",
      "embryo-62-E11.5 (5000, 33504) (5000, 38)\n",
      "embryo-63-E9.5 (5000, 10729) (5000, 36)\n",
      "embryo-64-E12.5 (5000, 44238) (5000, 38)\n",
      "embryo-65-E13.5 (5000, 19457) (5000, 38)\n",
      "embryo-66-E13.5 (5000, 38067) (5000, 38)\n",
      "embryo-67-E13.5 (5000, 17780) (5000, 38)\n",
      "embryo-68-E13.5 (5000, 27869) (5000, 38)\n",
      "embryo-7-E11.5 (5000, 35416) (5000, 38)\n",
      "embryo-8-E11.5 (5000, 32655) (5000, 38)\n",
      "embryo-9-E11.5 (5000, 27177) (5000, 38)\n",
      "CPU times: user 8min 23s, sys: 4min 9s, total: 12min 32s\n",
      "Wall time: 7min 1s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Average the expression of each cell type within each embryo and label every\n",
    "# resulting column with its cell type, developmental stage, embryo and population.\n",
    "df_mean_list = []\n",
    "for inst_sample_path in new_samples:\n",
    "    df_gex = pd.read_parquet(os.path.join(inst_sample_path, 'gex.parquet'))\n",
    "    df_meta = pd.read_parquet(os.path.join(inst_sample_path, 'meta_cell.parquet'))\n",
    "\n",
    "    # sample directories are named like 'embryo-1-E9.5' (see the printed output)\n",
    "    inst_sample = os.path.basename(inst_sample_path)\n",
    "    inst_embryo = 'embryo-' + inst_sample.split('-')[1]\n",
    "    inst_dev = inst_sample.split('-')[2]\n",
    "\n",
    "    # annotate each column of df_gex (a barcode) with its cell type and stage\n",
    "    new_cols = add_cats_from_meta(df_gex.columns.tolist(),\n",
    "                                  df_meta,\n",
    "                                  ['Main_cell_type', 'development_stage'])\n",
    "\n",
    "    # save number of cells in each cluster, keyed by the bare cell type name\n",
    "    ct_pop = pd.Series([x[1] for x in new_cols]).value_counts()\n",
    "    ct_pop.index = [x.split(': ')[1] for x in ct_pop.index.tolist()]\n",
    "\n",
    "    # relabel the columns on a copy so df_gex keeps its plain barcode columns\n",
    "    df_cat = deepcopy(df_gex)\n",
    "    df_cat.columns = new_cols\n",
    "\n",
    "    # mean over all cells that share a cell type; transposed back to genes x cell types\n",
    "    df_mi = net.row_tuple_to_multiindex(df_cat.transpose())\n",
    "    df_mean_ini = df_mi.groupby(level='Main_cell_type').mean().transpose()\n",
    "    rows = df_mean_ini.index.tolist()\n",
    "    cols = [(x + '_' + inst_sample.replace('embryo','e'),\n",
    "             'Cell Type: ' + x,\n",
    "             'Dev Stage: ' + dev_dict[inst_dev],\n",
    "             'Embryo: ' + inst_embryo,\n",
    "             'Pop: ' + str(ct_pop[x])) for x in df_mean_ini.columns.tolist()]\n",
    "    # .values replaces DataFrame.get_values(), which was removed in pandas 1.0\n",
    "    mat = df_mean_ini.values\n",
    "    df_mean = pd.DataFrame(index=rows, columns=cols, data=mat)\n",
    "    print(inst_sample, df_cat.shape, df_mean.shape)\n",
    "\n",
    "    df_mean_list.append(df_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5000, 2229)"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# place the per-embryo mean-expression frames side by side\n",
    "df_merge = pd.concat(df_mean_list, axis='columns')\n",
    "df_merge.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# stringify the tuple column labels on a copy before writing the parquet file;\n",
    "# df_merge itself keeps its tuple columns\n",
    "df_save = df_merge.copy(deep=True)\n",
    "df_save.columns = list(map(str, df_merge.columns.tolist()))\n",
    "df_save.to_parquet('../data/cao_2million-cell_2019_61-embryo_downsample/cao_embryo_cell-type_downsample.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Cardiac muscle lineages_e-1-E9.5',\n",
       "  'Cell Type: Cardiac muscle lineages',\n",
       "  'Dev Stage: 1-E9.5',\n",
       "  'Embryo: embryo-1',\n",
       "  'Pop: 263'),\n",
       " ('Cholinergic neurons_e-1-E9.5',\n",
       "  'Cell Type: Cholinergic neurons',\n",
       "  'Dev Stage: 1-E9.5',\n",
       "  'Embryo: embryo-1',\n",
       "  'Pop: 76'),\n",
       " ('Chondroctye progenitors_e-1-E9.5',\n",
       "  'Cell Type: Chondroctye progenitors',\n",
       "  'Dev Stage: 1-E9.5',\n",
       "  'Embryo: embryo-1',\n",
       "  'Pop: 388')]"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# peek at the first three column tuples: (name, cell type, dev stage, embryo, population)\n",
    "df_merge.columns.tolist()[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize Downsampled Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a6a5c38029c24cf0b497a10b89aa305c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"Gm42418\", \"ini\": 250, \"clust\": 5, \"rank\": 79, \"rankvar\": 87, \"…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# All cell types: load the merged matrix, keep the top 250 rows ranked by variance\n",
    "# (per the N_top / rank_type arguments), z-score each row, clip to [-5, 5] and\n",
    "# render the interactive widget.\n",
    "net.load_df(df_merge)\n",
    "net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')\n",
    "net.normalize(axis='row', norm_type='zscore')\n",
    "net.clip(-5,5)\n",
    "net.widget()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize Data for Single Cell Type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b505b2e25a164e44a062ce8001c2d043",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"mt-Rnr2\", \"ini\": 250, \"clust\": 9, \"rank\": 135, \"rankvar\": 190,…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Same pipeline as the full view, but first restrict the columns to a single\n",
    "# cell type; cat_index=1 refers to the 'Cell Type: ...' entry of the column tuples.\n",
    "net.load_df(df_merge)\n",
    "net.filter_cat(axis='col', cat_index=1, cat_name='Cell Type: Sensory neurons')\n",
    "net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')\n",
    "net.normalize(axis='row', norm_type='zscore')\n",
    "net.clip(-5,5)\n",
    "net.widget()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total number of cells represented:  1386587\n"
     ]
    }
   ],
   "source": [
    "# The fifth entry of every column tuple is 'Pop: <n>'; add up n over all columns\n",
    "cols = df_merge.columns.tolist()\n",
    "total_cells = sum(int(col[4].split(': ')[1]) for col in cols)\n",
    "print('total number of cells represented: ', total_cells)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}