{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5.0 Pre-Process 2-Million Mouse Atlas\n",
    "https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from glob import glob\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "61"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# one directory per embryo sample, listed in sorted order\n",
    "sample_glob = '../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files/*'\n",
    "all_samples = sorted(glob(sample_glob))\n",
    "len(all_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_dir(directory):\n",
    "    \"\"\"Create `directory`, including any missing parent directories, if it does not exist.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    directory : str\n",
    "        Path of the directory to create.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    FileExistsError\n",
    "        If `directory` already exists but is not a directory.\n",
    "    \"\"\"\n",
    "    # makedirs + exist_ok removes the exists()/mkdir() race and also works when a\n",
    "    # parent directory (e.g. the output root) has not been created yet\n",
    "    os.makedirs(directory, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate Mean Expression of Each Gene Across Cells in a Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_gene_mean(inst_sample):\n",
    "    \"\"\"Return the per-row mean of a sample's `gex.parquet` table.\n",
    "\n",
    "    Reads `inst_sample + '/gex.parquet'`, prints the shape of the loaded\n",
    "    table, and averages every row across its columns (`axis=1`).\n",
    "    \"\"\"\n",
    "    gex_path = inst_sample + '/gex.parquet'\n",
    "    gex = pd.read_parquet(gex_path)\n",
    "    print(gex.shape)\n",
    "    return gex.mean(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Unnamed: 0',\n",
       " 'all_exon_count',\n",
       " 'all_intron_count',\n",
       " 'all_read_count',\n",
       " 'intergenic_rate',\n",
       " 'embryo_id',\n",
       " 'embryo_sex',\n",
       " 'nuclei_extraction_date',\n",
       " 'development_stage',\n",
       " 'Total_mRNAs',\n",
       " 'num_genes_expressed',\n",
       " 'Size_Factor',\n",
       " 'Main_Cluster',\n",
       " 'Main_cluster_tsne_1',\n",
       " 'Main_cluster_tsne_2',\n",
       " 'Sub_cluster',\n",
       " 'Sub_cluster_tsne_1',\n",
       " 'Sub_cluster_tsne_2',\n",
       " 'doublet_score',\n",
       " 'detected_doublet',\n",
       " 'doublet_cluster',\n",
       " 'sub_cluster_id',\n",
       " 'Main_cell_type',\n",
       " 'Main_trajectory',\n",
       " 'Main_trajectory_umap_1',\n",
       " 'Main_trajectory_umap_2',\n",
       " 'Main_trajectory_umap_3',\n",
       " 'Main_trajectory_refined_by_cluster',\n",
       " 'Main_trajectory_refined_umap_1',\n",
       " 'Main_trajectory_refined_umap_2',\n",
       " 'Main_trajectory_refined_umap_3',\n",
       " 'Sub_trajectory_name',\n",
       " 'Sub_trajectory_umap_1',\n",
       " 'Sub_trajectory_umap_2',\n",
       " 'Sub_trajectory_louvain_component',\n",
       " 'Sub_trajectory_Pseudotime']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read the first sample's cell metadata here so this cell does not depend on a\n",
    "# `df_meta` variable that is only created further down the notebook\n",
    "pd.read_parquet(all_samples[0] + '/meta_cell.parquet').columns.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Drop cells from doublet-derived sub-clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15666, 36)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# keep only the cells of the first sample whose `doublet_cluster` flag is False\n",
    "df_meta_ini = pd.read_parquet(all_samples[0] + '/meta_cell.parquet')\n",
    "not_doublet = df_meta_ini['doublet_cluster'] == False\n",
    "keep_cells = df_meta_ini.index[not_doublet.values].tolist()\n",
    "df_meta = df_meta_ini.loc[keep_cells]\n",
    "df_meta.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Find the top variable genes in a representative embryo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(26183, 15666)\n"
     ]
    }
   ],
   "source": [
    "# rank genes within the first sample: take the 10,000 rows with the highest mean,\n",
    "# then keep the 5,000 of those with the highest variance across the kept cells\n",
    "df_gex = pd.read_parquet(all_samples[0] + '/gex.parquet', columns=keep_cells)\n",
    "print(df_gex.shape)\n",
    "inst_mean = df_gex.mean(axis=1)\n",
    "mean_ranked = inst_mean.sort_values(ascending=False)\n",
    "top_genes = mean_ranked.index[:10000].tolist()\n",
    "gene_var = df_gex.loc[top_genes].var(axis=1)\n",
    "keep_genes = gene_var.sort_values(ascending=False).index[:5000].tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Make Top 5K Gene Variance Versions of Embryo DataSets\n",
    "The top 5,000 most variable genes (chosen among the 10,000 genes with the highest mean expression) were defined using the embryo-1-E9.5 gene expression data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# output root under which the export loop creates one directory per sample\n",
    "base_dir = '../data/cao_2million-cell_2019_61-embryo_parquet_files_binder/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "embryo-1-E9.5 (5000, 15666) 26.023213\n",
      "embryo-10-E11.5 (5000, 32449) 54.726554\n",
      "embryo-11-E12.5 (5000, 10270) 14.867986\n",
      "embryo-12-E12.5 (5000, 27090) 44.015724\n",
      "embryo-13-E12.5 (5000, 12436) 18.081453\n",
      "embryo-14-E12.5 (5000, 27450) 44.172131\n",
      "embryo-15-E13.5 (5000, 23136) 35.579287\n",
      "embryo-16-E13.5 (5000, 13434) 19.351804\n",
      "embryo-17-E13.5 (5000, 17306) 25.060539\n",
      "embryo-19-E9.5 (5000, 4026) 7.010098\n",
      "embryo-20-E9.5 (5000, 2525) 4.080514\n",
      "embryo-21-E9.5 (5000, 11550) 18.417185\n",
      "embryo-22-E9.5 (5000, 5818) 10.062801\n",
      "embryo-24-E10.5 (5000, 28100) 46.77209\n",
      "embryo-25-E10.5 (5000, 14498) 23.030284\n",
      "embryo-26-E10.5 (5000, 24664) 41.623712\n",
      "embryo-27-E11.5 (5000, 42106) 73.731097\n",
      "embryo-28-E11.5 (5000, 37761) 65.197931\n",
      "embryo-29-E11.5 (5000, 33185) 57.158152\n",
      "embryo-3-E9.5 (5000, 8086) 14.005555\n",
      "embryo-31-E12.5 (5000, 24208) 36.324759\n",
      "embryo-33-E12.5 (5000, 57625) 84.85053\n",
      "embryo-34-E12.5 (5000, 39619) 60.933913\n",
      "embryo-35-E13.5 (5000, 17118) 24.931888\n",
      "embryo-36-E13.5 (5000, 22222) 32.530816\n",
      "embryo-37-E13.5 (5000, 21655) 31.964647\n",
      "embryo-38-E13.5 (5000, 22056) 33.00234\n",
      "embryo-39-E9.5 (5000, 7064) 12.366449\n",
      "embryo-4-E10.5 (5000, 12559) 20.919848\n",
      "embryo-40-E9.5 (5000, 7017) 11.685474\n",
      "embryo-41-E9.5 (5000, 3885) 6.920301\n",
      "embryo-42-E9.5 (5000, 8541) 14.419699\n",
      "embryo-43-E10.5 (5000, 19422) 29.989311\n",
      "embryo-44-E10.5 (5000, 26715) 44.829101\n",
      "embryo-46-E10.5 (5000, 30976) 54.765808\n",
      "embryo-47-E11.5 (5000, 37763) 67.915082\n",
      "embryo-48-E11.5 (5000, 43105) 73.141142\n",
      "embryo-49-E11.5 (5000, 36490) 64.551681\n",
      "embryo-5-E10.5 (5000, 21987) 36.750677\n",
      "embryo-50-E11.5 (5000, 37226) 62.984305\n",
      "embryo-51-E12.5 (5000, 18053) 26.987817\n",
      "embryo-52-E12.5 (5000, 23163) 36.57017\n",
      "embryo-53-E13.5 (5000, 16348) 24.086259\n",
      "embryo-55-E9.5 (5000, 4397) 8.662424\n",
      "embryo-56-E9.5 (5000, 7770) 13.214076\n",
      "embryo-57-E9.5 (5000, 10115) 18.238266\n",
      "embryo-58-E9.5 (5000, 8048) 14.08015\n",
      "embryo-59-E10.5 (5000, 25696) 42.005196\n",
      "embryo-6-E10.5 (5000, 27174) 44.938129\n",
      "embryo-60-E10.5 (5000, 33564) 54.053156\n",
      "embryo-61-E11.5 (5000, 36558) 61.10102\n",
      "embryo-62-E11.5 (5000, 33504) 53.756219\n",
      "embryo-63-E9.5 (5000, 10729) 18.63547\n",
      "embryo-64-E12.5 (5000, 44238) 66.844447\n",
      "embryo-65-E13.5 (5000, 19457) 32.948022\n",
      "embryo-66-E13.5 (5000, 38067) 64.048408\n",
      "embryo-67-E13.5 (5000, 17780) 26.249178\n",
      "embryo-68-E13.5 (5000, 27869) 42.631713\n",
      "embryo-7-E11.5 (5000, 35416) 59.146557\n",
      "embryo-8-E11.5 (5000, 32655) 53.663686\n",
      "embryo-9-E11.5 (5000, 27177) 44.994238\n"
     ]
    }
   ],
   "source": [
    "for inst_sample_path in all_samples:\n",
    "    # basename/join instead of split('/') and '+', so OS-specific path separators are handled\n",
    "    inst_sample = os.path.basename(inst_sample_path)\n",
    "    new_sample_dir = os.path.join(base_dir, inst_sample)\n",
    "\n",
    "    # drop doublets: keep cells whose `doublet_cluster` flag is False\n",
    "    df_meta_ini = pd.read_parquet(os.path.join(inst_sample_path, 'meta_cell.parquet'))\n",
    "    is_singlet = df_meta_ini['doublet_cluster'] == False\n",
    "    keep_cells = df_meta_ini.index[is_singlet.values].tolist()\n",
    "    df_meta = df_meta_ini.loc[keep_cells]\n",
    "\n",
    "    # load gene expression for the kept cells only, restricted to the selected genes\n",
    "    gex_path = os.path.join(inst_sample_path, 'gex.parquet')\n",
    "    df_gex = pd.read_parquet(gex_path, columns=keep_cells).loc[keep_genes]\n",
    "\n",
    "    # save filtered data for mybinder\n",
    "    make_dir(new_sample_dir)\n",
    "    new_gex_path = os.path.join(new_sample_dir, 'gex.parquet')\n",
    "    df_meta.to_parquet(os.path.join(new_sample_dir, 'meta_cell.parquet'))\n",
    "    df_gex.to_parquet(new_gex_path)\n",
    "\n",
    "    # check file size (MB)\n",
    "    size_mb = os.path.getsize(new_gex_path) / 1000000\n",
    "    print(inst_sample, df_gex.shape, size_mb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Verifying ~1.3 Million remaining Cells\n",
    "Compare to the processed data after removing cells from doublet-derived clusters (https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1386587\n"
     ]
    }
   ],
   "source": [
    "# count cells in the filtered per-sample outputs; glob from `base_dir` so this\n",
    "# checks the same directory the export loop wrote to, not a separate hard-coded path\n",
    "new_samples = sorted(glob(os.path.join(base_dir, '*')))\n",
    "total_cells = sum(\n",
    "    pd.read_parquet(os.path.join(inst_sample, 'meta_cell.parquet')).shape[0]\n",
    "    for inst_sample in new_samples\n",
    ")\n",
    "print(total_cells)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}