# 5.0 Pre-Process 2-Million Mouse Atlas
https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads

In [2]:
import pandas as pd
from glob import glob
import os

In [3]:
# One directory per embryo sample; sort for a deterministic processing order
sample_pattern = '../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files/*'
all_samples = glob(sample_pattern)
all_samples.sort()
len(all_samples)

61

In [4]:
def make_dir(directory):
    """Create ``directory`` if it does not already exist.

    Missing parent directories are created as well, and an existing
    directory is left untouched, so the call is safe to repeat.

    Parameters
    ----------
    directory : str or path-like
        Path of the directory to create.
    """
    # makedirs(exist_ok=True) replaces the exists()-then-mkdir() pattern: it
    # has no check/create race and also works when the parent is missing.
    os.makedirs(directory, exist_ok=True)

### Calc Mean Expression of Each Gene Within a Sample

In [5]:
def calc_gene_mean(inst_sample):
    """Return the row-wise mean of one sample's expression matrix.

    Reads ``gex.parquet`` from the ``inst_sample`` directory, prints the
    shape of the loaded matrix, and averages along ``axis=1`` so that the
    result holds one value per row of the matrix.
    """
    gex_path = inst_sample + '/gex.parquet'
    expression = pd.read_parquet(gex_path)
    print(expression.shape)
    return expression.mean(axis=1)

In [13]:
# List the per-cell metadata columns of the first sample. The table is read
# here because df_meta is only assigned in a later cell, so relying on it
# would fail on a fresh kernel (Restart & Run All).
pd.read_parquet(all_samples[0] + '/meta_cell.parquet').columns.tolist()

['Unnamed: 0',
 'all_exon_count',
 'all_intron_count',
 'all_read_count',
 'intergenic_rate',
 'embryo_id',
 'embryo_sex',
 'nuclei_extraction_date',
 'development_stage',
 'Total_mRNAs',
 'num_genes_expressed',
 'Size_Factor',
 'Main_Cluster',
 'Main_cluster_tsne_1',
 'Main_cluster_tsne_2',
 'Sub_cluster',
 'Sub_cluster_tsne_1',
 'Sub_cluster_tsne_2',
 'doublet_score',
 'detected_doublet',
 'doublet_cluster',
 'sub_cluster_id',
 'Main_cell_type',
 'Main_trajectory',
 'Main_trajectory_umap_1',
 'Main_trajectory_umap_2',
 'Main_trajectory_umap_3',
 'Main_trajectory_refined_by_cluster',
 'Main_trajectory_refined_umap_1',
 'Main_trajectory_refined_umap_2',
 'Main_trajectory_refined_umap_3',
 'Sub_trajectory_name',
 'Sub_trajectory_umap_1',
 'Sub_trajectory_umap_2',
 'Sub_trajectory_louvain_component',
 'Sub_trajectory_Pseudotime']

### Drop cells from doublet-derived sub-clusters

In [20]:
# Per-cell metadata for the first (representative) sample
df_meta_ini = pd.read_parquet(all_samples[0] + '/meta_cell.parquet')

# Keep only the cells whose 'doublet_cluster' flag is False
ser_doublet = df_meta_ini['doublet_cluster']
ser_doublet = ser_doublet.loc[ser_doublet.eq(False)]
keep_cells = ser_doublet.index.tolist()

df_meta = df_meta_ini.loc[keep_cells]
df_meta.shape

(15666, 36)

### Find the top variable genes in a representative embryo

In [21]:
# Expression matrix of the representative embryo, restricted to the kept cells
df_gex = pd.read_parquet(all_samples[0] + '/gex.parquet', columns=keep_cells)
print(df_gex.shape)

# Step 1: the 10,000 genes with the highest mean expression across cells
inst_mean = df_gex.mean(axis=1)
top_genes = inst_mean.sort_values(ascending=False).index[:10000].tolist()

# Step 2: among those, the 5,000 genes with the largest variance across cells
gene_var = df_gex.loc[top_genes].var(axis=1)
keep_genes = gene_var.sort_values(ascending=False).index[:5000].tolist()

(26183, 15666)


### Make Top 5K Gene Variance Versions of Embryo DataSets
The top 5,000 most variable genes (selected from the 10,000 most highly expressed genes) were defined using the embryo-1-E9.5 gene expression data.

In [26]:
# Output root for the filtered per-sample copies written by the export loop.
# The trailing '/' matters: sample directory names are appended to this string.
base_dir = '../data/cao_2million-cell_2019_61-embryo_parquet_files_binder/'

In [25]:
# For every embryo sample: drop cells flagged in 'doublet_cluster', restrict
# the expression matrix to keep_genes, and write the reduced copies under
# base_dir.
for inst_sample_path in all_samples:
    # Sample name = last path component. basename/normpath are separator-aware,
    # unlike splitting on '/', which breaks on OS-native separators.
    inst_sample = os.path.basename(os.path.normpath(inst_sample_path))
    new_sample_dir = os.path.join(base_dir, inst_sample)

    # drop doublets
    df_meta_ini = pd.read_parquet(os.path.join(inst_sample_path, 'meta_cell.parquet'))
    ser_doublet = df_meta_ini['doublet_cluster']
    ser_doublet = ser_doublet[ser_doublet == False]
    keep_cells = ser_doublet.index.tolist()
    df_meta = df_meta_ini.loc[keep_cells]

    # load gene expression for the kept cells only, then keep the selected genes
    df_gex = pd.read_parquet(
        os.path.join(inst_sample_path, 'gex.parquet'), columns=keep_cells
    ).loc[keep_genes]

    # save filtered data for mybinder
    make_dir(new_sample_dir)
    new_gex_path = os.path.join(new_sample_dir, 'gex.parquet')
    df_meta.to_parquet(os.path.join(new_sample_dir, 'meta_cell.parquet'))
    df_gex.to_parquet(new_gex_path)

    # check file size (MB)
    size_mb = os.path.getsize(new_gex_path) / 1000000
    print(inst_sample, df_gex.shape, size_mb)

embryo-1-E9.5 (5000, 15666) 26.023213
embryo-10-E11.5 (5000, 32449) 54.726554
embryo-11-E12.5 (5000, 10270) 14.867986
embryo-12-E12.5 (5000, 27090) 44.015724
embryo-13-E12.5 (5000, 12436) 18.081453
embryo-14-E12.5 (5000, 27450) 44.172131
embryo-15-E13.5 (5000, 23136) 35.579287
embryo-16-E13.5 (5000, 13434) 19.351804
embryo-17-E13.5 (5000, 17306) 25.060539
embryo-19-E9.5 (5000, 4026) 7.010098
embryo-20-E9.5 (5000, 2525) 4.080514
embryo-21-E9.5 (5000, 11550) 18.417185
embryo-22-E9.5 (5000, 5818) 10.062801
embryo-24-E10.5 (5000, 28100) 46.77209
embryo-25-E10.5 (5000, 14498) 23.030284
embryo-26-E10.5 (5000, 24664) 41.623712
embryo-27-E11.5 (5000, 42106) 73.731097
embryo-28-E11.5 (5000, 37761) 65.197931
embryo-29-E11.5 (5000, 33185) 57.158152
embryo-3-E9.5 (5000, 8086) 14.005555
embryo-31-E12.5 (5000, 24208) 36.324759
embryo-33-E12.5 (5000, 57625) 84.85053
embryo-34-E12.5 (5000, 39619) 60.933913
embryo-35-E13.5 (5000, 17118) 24.931888
embryo-36-E13.5 (5000, 22222) 32.530816
embryo-37-E13.5 

### Verifying ~1.3 Million remaining Cells
Compare to processed data after removing cells from doublet derived clusters (https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads).

In [33]:
# Count the cells in the filtered copies. Glob the directory the export loop
# actually wrote to (base_dir) rather than a separately hard-coded path, so
# this check always inspects the files produced by this notebook.
new_samples = sorted(glob(os.path.join(base_dir, '*')))

total_cells = 0
for inst_sample in new_samples:
    df_meta = pd.read_parquet(os.path.join(inst_sample, 'meta_cell.parquet'))
    # one metadata row per cell
    total_cells += df_meta.shape[0]
print(total_cells)

1386587
