{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.0 Pre-Process 2-Million Mouse Atlas\n", "https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from glob import glob\n", "import os" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "61" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_samples = sorted(glob('../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files/*'))\n", "len(all_samples)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def make_dir(directory):\n", " import os\n", " if not os.path.exists(directory):\n", " os.mkdir(directory)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calc Variance of All Genes Across Samples" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def calc_gene_mean(inst_sample):\n", " df_gex = pd.read_parquet(inst_sample + '/gex.parquet')\n", " print(df_gex.shape)\n", " inst_mean = df_gex.mean(axis=1)\n", " return inst_mean" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Unnamed: 0',\n", " 'all_exon_count',\n", " 'all_intron_count',\n", " 'all_read_count',\n", " 'intergenic_rate',\n", " 'embryo_id',\n", " 'embryo_sex',\n", " 'nuclei_extraction_date',\n", " 'development_stage',\n", " 'Total_mRNAs',\n", " 'num_genes_expressed',\n", " 'Size_Factor',\n", " 'Main_Cluster',\n", " 'Main_cluster_tsne_1',\n", " 'Main_cluster_tsne_2',\n", " 'Sub_cluster',\n", " 'Sub_cluster_tsne_1',\n", " 'Sub_cluster_tsne_2',\n", " 'doublet_score',\n", " 'detected_doublet',\n", " 'doublet_cluster',\n", " 'sub_cluster_id',\n", " 'Main_cell_type',\n", " 'Main_trajectory',\n", " 'Main_trajectory_umap_1',\n", " 'Main_trajectory_umap_2',\n", " 'Main_trajectory_umap_3',\n", " 'Main_trajectory_refined_by_cluster',\n", " 'Main_trajectory_refined_umap_1',\n", " 'Main_trajectory_refined_umap_2',\n", " 'Main_trajectory_refined_umap_3',\n", " 'Sub_trajectory_name',\n", " 'Sub_trajectory_umap_1',\n", " 'Sub_trajectory_umap_2',\n", " 'Sub_trajectory_louvain_component',\n", " 'Sub_trajectory_Pseudotime']" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_meta.columns.tolist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Drop cells from doublet derived sub-clusters" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15666, 36)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_meta_ini = pd.read_parquet(all_samples[0] + '/meta_cell.parquet')\n", "ser_doublet = df_meta_ini['doublet_cluster']\n", "ser_doublet = ser_doublet[ser_doublet == False]\n", "keep_cells = ser_doublet.index.tolist()\n", "df_meta = df_meta_ini.loc[keep_cells]\n", "df_meta.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Find the top variable genes in a representative embryo" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(26183, 15666)\n" ] } ], "source": [ "df_gex = pd.read_parquet(all_samples[0] + '/gex.parquet', columns=keep_cells)\n", "print(df_gex.shape)\n", "inst_mean = df_gex.mean(axis=1)\n", "top_genes = inst_mean.sort_values(ascending=False).index.tolist()[:10000]\n", "keep_genes = df_gex.loc[top_genes].var(axis=1).sort_values(ascending=False).index.tolist()[:5000]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Make Top 5K Gene Variance Versions of Embryo DataSets\n", "The top 5,000 variable genes were defined based on embryo-1E9.5 gene expression data" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "base_dir = '../data/cao_2million-cell_2019_61-embryo_parquet_files_binder/'" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "embryo-1-E9.5 (5000, 15666) 26.023213\n", "embryo-10-E11.5 (5000, 32449) 54.726554\n", "embryo-11-E12.5 (5000, 10270) 14.867986\n", "embryo-12-E12.5 (5000, 27090) 44.015724\n", "embryo-13-E12.5 (5000, 12436) 18.081453\n", "embryo-14-E12.5 (5000, 27450) 44.172131\n", "embryo-15-E13.5 (5000, 23136) 35.579287\n", "embryo-16-E13.5 (5000, 13434) 19.351804\n", "embryo-17-E13.5 (5000, 17306) 25.060539\n", "embryo-19-E9.5 (5000, 4026) 7.010098\n", "embryo-20-E9.5 (5000, 2525) 4.080514\n", "embryo-21-E9.5 (5000, 11550) 18.417185\n", "embryo-22-E9.5 (5000, 5818) 10.062801\n", "embryo-24-E10.5 (5000, 28100) 46.77209\n", "embryo-25-E10.5 (5000, 14498) 23.030284\n", "embryo-26-E10.5 (5000, 24664) 41.623712\n", "embryo-27-E11.5 (5000, 42106) 73.731097\n", "embryo-28-E11.5 (5000, 37761) 65.197931\n", "embryo-29-E11.5 (5000, 33185) 57.158152\n", "embryo-3-E9.5 (5000, 8086) 14.005555\n", "embryo-31-E12.5 (5000, 24208) 36.324759\n", "embryo-33-E12.5 (5000, 57625) 84.85053\n", "embryo-34-E12.5 (5000, 39619) 60.933913\n", "embryo-35-E13.5 (5000, 17118) 24.931888\n", "embryo-36-E13.5 (5000, 22222) 32.530816\n", "embryo-37-E13.5 (5000, 21655) 31.964647\n", "embryo-38-E13.5 (5000, 22056) 33.00234\n", "embryo-39-E9.5 (5000, 7064) 12.366449\n", "embryo-4-E10.5 (5000, 12559) 20.919848\n", "embryo-40-E9.5 (5000, 7017) 11.685474\n", "embryo-41-E9.5 (5000, 3885) 6.920301\n", "embryo-42-E9.5 (5000, 8541) 14.419699\n", "embryo-43-E10.5 (5000, 19422) 29.989311\n", "embryo-44-E10.5 (5000, 26715) 44.829101\n", "embryo-46-E10.5 (5000, 30976) 54.765808\n", "embryo-47-E11.5 (5000, 37763) 67.915082\n", "embryo-48-E11.5 (5000, 43105) 73.141142\n", "embryo-49-E11.5 (5000, 36490) 64.551681\n", "embryo-5-E10.5 (5000, 21987) 36.750677\n", "embryo-50-E11.5 (5000, 37226) 62.984305\n", "embryo-51-E12.5 (5000, 18053) 26.987817\n", "embryo-52-E12.5 (5000, 23163) 36.57017\n", "embryo-53-E13.5 (5000, 16348) 24.086259\n", "embryo-55-E9.5 (5000, 4397) 8.662424\n", "embryo-56-E9.5 (5000, 7770) 13.214076\n", "embryo-57-E9.5 (5000, 10115) 18.238266\n", "embryo-58-E9.5 (5000, 8048) 14.08015\n", "embryo-59-E10.5 (5000, 25696) 42.005196\n", "embryo-6-E10.5 (5000, 27174) 44.938129\n", "embryo-60-E10.5 (5000, 33564) 54.053156\n", "embryo-61-E11.5 (5000, 36558) 61.10102\n", "embryo-62-E11.5 (5000, 33504) 53.756219\n", "embryo-63-E9.5 (5000, 10729) 18.63547\n", "embryo-64-E12.5 (5000, 44238) 66.844447\n", "embryo-65-E13.5 (5000, 19457) 32.948022\n", "embryo-66-E13.5 (5000, 38067) 64.048408\n", "embryo-67-E13.5 (5000, 17780) 26.249178\n", "embryo-68-E13.5 (5000, 27869) 42.631713\n", "embryo-7-E11.5 (5000, 35416) 59.146557\n", "embryo-8-E11.5 (5000, 32655) 53.663686\n", "embryo-9-E11.5 (5000, 27177) 44.994238\n" ] } ], "source": [ "for inst_sample_path in all_samples:\n", " inst_sample = inst_sample_path.split('/')[-1]\n", " new_sample_dir = base_dir + inst_sample\n", "\n", " # drop doublets\n", " df_meta_ini = pd.read_parquet(inst_sample_path + '/meta_cell.parquet')\n", " ser_doublet = df_meta_ini['doublet_cluster']\n", " ser_doublet = ser_doublet[ser_doublet == False]\n", " keep_cells = ser_doublet.index.tolist()\n", " df_meta = df_meta_ini.loc[keep_cells] \n", "\n", " # load gene expression \n", " df_gex = pd.read_parquet(inst_sample_path + '/gex.parquet', columns=keep_cells).loc[keep_genes]\n", " \n", " # save filtered data for mybinder\n", " make_dir(new_sample_dir)\n", " df_meta.to_parquet(new_sample_dir + '/meta_cell.parquet') \n", " df_gex.to_parquet(new_sample_dir + '/gex.parquet')\n", " \n", " # check file size (MB)\n", " statinfo = os.stat(new_sample_dir + '/gex.parquet')\n", " print(inst_sample, df_gex.shape, statinfo.st_size/(1000000))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Verifying ~1.3 Million remaining Cells\n", "Compare to processed data after removing cells from doublet derived clusters (https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads)." ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1386587\n" ] } ], "source": [ "total_cells = 0\n", "new_samples = sorted(glob('../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files_binder/*'))\n", "for inst_sample in new_samples:\n", "\n", " df_meta = pd.read_parquet(inst_sample + '/meta_cell.parquet')\n", " total_cells = total_cells + df_meta.shape[0]\n", "print(total_cells)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }