# 4.0 10K PBMC CITE-seq Assign Cell Type
This notebook performs hierarchical clustering on antibody-derived tag (ADT) data, assigns tentative cell types based on the ADT clusters, transfers the cell type assignments to the gene expression (GEX) data, and saves the resulting labeled and filtered data to parquet files.

In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
from clustergrammer2 import net
import helper_functions as hf

>> clustergrammer2 backend version 0.4.2


In [3]:
%%time
# Load the filtered feature-barcode matrix directory through the project helper.
# Later cells read feature_data['adt']['features'] and feature_data['adt']['mat'].
inst_path = '../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/filtered_feature_bc_matrix/'
feature_data = hf.load_v3_comp_sparse_feat_matrix(inst_path)

CPU times: user 30.8 s, sys: 459 ms, total: 31.2 s
Wall time: 30.7 s


### Clean ADT Features

In [4]:
# Drop ADT features whose name contains '_control', and remove the matching
# rows from the ADT matrix so features and matrix rows stay aligned.
inst_features = feature_data['adt']['features']

# Row positions of the features to keep, in their original order.
keep_indexes = [
    position
    for position, feature_name in enumerate(inst_features)
    if '_control' not in feature_name
]
keep_features = [inst_features[position] for position in keep_indexes]

feature_data['adt']['features'] = keep_features
feature_data['adt']['mat'] = feature_data['adt']['mat'][keep_indexes, :]

In [5]:
# Sanity check after removing the control features: the helper prints the
# feature count, barcode count and matrix shape for each modality.
hf.check_feature_data_size(feature_data)

gex
33538 7865
(33538, 7865) 

adt
14 7865
(14, 7865) 



In [6]:
%%time
df = hf.convert_feature_data_to_df_dict(feature_data, make_sparse=False)
print('adt', df['adt'].shape)
print('gex', df['gex'].shape)

adt (14, 7865)
gex (33538, 7865)
CPU times: user 385 ms, sys: 506 ms, total: 892 ms
Wall time: 899 ms


# Cluster Cells in ADT Space

In [7]:
# arcsinh-transform the ADT values after scaling them by 1/5.
adt_ash = np.arcsinh(df['adt'] / 5)

# Wrap every column label in a 1-tuple.
# NOTE(review): presumably the tuple form is what lets clustergrammer2 attach
# column categories later -- confirm.
adt_ash.columns = [(barcode,) for barcode in adt_ash.columns]

df['adt-ash'] = adt_ash

In [8]:
# Load the arcsinh-transformed ADT matrix (rows are ADT features, columns are
# cells) and z-score each row.
net.load_df(df['adt-ash'])
net.normalize(axis='row', norm_type='zscore')
# Cluster, then attach the level-4 column dendrogram group to every cell as a
# category. Later cells read it back as the 'Group 4: cat-N' entry (second
# element) of each column tuple.
net.cluster()
net.dendro_cats(axis='col', dendro_level=4)
# NOTE(review): the second cluster() presumably rebuilds the visualization so
# the new category is displayed -- confirm against the clustergrammer2 docs.
net.cluster()
# NOTE(review): the cat-N numbering comes from this clustering, and the
# cell-type mapping defined later is hard-coded against it; re-check that
# mapping if the input data or library version changes.
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "CD3", "ini": 14, "clust": 9, "rank": 11, "rankvar": 9, "group"…

## Cell Types
We are assigning the following cell type definitions to the clusters identified in ADT space:

1: CD8 T Cells

3: CD4+ CD45RA+ T Cells 

4: CD4+ CD45RO+ T Cells

5: Myeloid (CD14)

6: CD15+ CD16+

7: B Cells (CD19)

8: NK Cells (CD56, CD16)

CD15 granulocyte, macrophage monocyte

In [9]:
# Map each ADT dendrogram group label to the cell type assigned to that cluster.
# 'Group 4: cat-2' has no entry: columns with that label are dropped before the
# mapping is applied.
ct_dict = {
    'Group 4: cat-1': 'Cell Type: CD8 T Cells',
    # The trailing space is part of the existing label and is kept as-is.
    'Group 4: cat-3': 'Cell Type: CD4+ CD45RA+ T Cells ',
    'Group 4: cat-4': 'Cell Type: CD4+ CD45RO+ T Cells',
    'Group 4: cat-5': 'Cell Type: Myeloid CD14',
    'Group 4: cat-6': 'Cell Type: CD15+ CD16+',
    'Group 4: cat-7': 'Cell Type: B Cells CD19',
    'Group 4: cat-8': 'Cell Type: NK Cells CD56, CD16',
}

### Drop the single outlier cell in cat-2

In [10]:
# Export the clustered ADT matrix; each column label is a tuple whose second
# element is the dendrogram group label.
adt_exported = net.export_df()
print(adt_exported.shape)

# Keep every column except those labeled as the outlier group cat-2.
keep_cols = [
    col for col in adt_exported.columns.tolist()
    if col[1] != 'Group 4: cat-2'
]
df['adt-cat'] = adt_exported[keep_cols]
df['adt-cat'].shape

(14, 7865)


(14, 7864)

In [11]:
# Replace each column's cluster label (second tuple element) with its cell-type
# name from ct_dict, keeping the first element unchanged. A label missing from
# ct_dict raises KeyError, so this cell also fails if it is re-run after the
# labels have already been replaced.
df['adt-cat'].columns = [(x[0], ct_dict[x[1]]) for x in df['adt-cat'].columns.tolist()]

### Drop outlier cell from GEX Data

In [12]:
# Barcodes (first element of each column tuple) of the cells kept in ADT space,
# in df['adt-cat'] column order.
keep_barcodes = [x[0] for x in df['adt-cat'].columns.tolist()]
print(len(keep_barcodes))
keep_barcodes[:3]

7864


['AAACCCAAGATTGTGA', 'AAACCCACATCGGTTA', 'AAACCCAGTACCGCGT']

In [13]:
# Restrict GEX to the kept barcodes; the columns come out in keep_barcodes
# order, i.e. the same order as the df['adt-cat'] columns.
df['gex'] = df['gex'][keep_barcodes]

In [14]:
# Confirm the GEX column count now matches the number of kept barcodes.
df['gex'].shape

(33538, 7864)

# Cell Type in ADT Space

In [15]:
# Visualize the ADT matrix with the cell-type labels carried in the column tuples.
net.load_df(df['adt-cat'])
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "CD3", "ini": 14, "clust": 9, "rank": 4, "rankvar": 6, "group":…

### Transfer Cell Types to GEX

In [16]:
# Copy GEX so df['gex'] keeps its plain barcode columns, then overwrite the
# copy's column labels with the (barcode, cell type) labels from ADT.
# The assignment is positional: it relies on df['gex'] having been subset with
# the barcodes taken from df['adt-cat'], so both share the same column order.
gex_labeled = deepcopy(df['gex'])
gex_labeled.columns = df['adt-cat'].columns
df['gex-cat'] = gex_labeled

### Drop Ribosomal and Mitochondrial Genes

In [29]:
def drop_ribo_mito(df):
    """Return ``df`` without its ribosomal and mitochondrial gene rows.

    Rows are classified by their index label (gene name):

    * ribosomal: the label contains the substring ``'RPL'`` or ``'RPS'``
      anywhere (a substring match, so labels such as ``MRPL*`` / ``MRPS*``
      are dropped as well);
    * mitochondrial: the label starts with ``'MT-'``, or its text before the
      first ``'_'`` is one of the hard-coded ``MTRNR2L*`` / ``MTRF1*`` names.

    Rows are selected with boolean masks rather than lists of labels, so a
    frame whose index contains repeated gene names keeps every surviving row
    exactly once (label-based ``.loc`` returns all matches for each repeated
    label and would multiply those rows).

    Prints the gene count before and after the ribosomal filter, followed by
    the list of mitochondrial gene names that were removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by gene-name strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in their original order.
    """
    all_genes = df.index.tolist()
    print(len(all_genes))

    # Removing Ribosomal Genes
    non_ribo_mask = [
        'RPL' not in gene and 'RPS' not in gene
        for gene in all_genes
    ]
    print(sum(non_ribo_mask))
    df = df.loc[non_ribo_mask]

    # Removing Mitochondrial Genes
    # A set gives constant-time membership checks.
    list_mito_genes = {
        'MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6',
        'MTRNR2L7', 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1',
        'MTRNR2L3', 'MTRNR2L4',
    }

    all_genes = df.index.tolist()
    mito_mask = [
        gene[:3] == 'MT-' or gene.split('_')[0] in list_mito_genes
        for gene in all_genes
    ]
    mito_genes = [gene for gene, is_mito in zip(all_genes, mito_mask) if is_mito]
    print(mito_genes)

    return df.loc[[not is_mito for is_mito in mito_mask]]

In [30]:
# Overwrite df['gex-cat'] with the filtered frame; the call prints the gene
# counts and the list of mitochondrial genes that were removed.
df['gex-cat'] = drop_ribo_mito(df['gex-cat'])

33538
33346
['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L7', 'MTRNR2L5', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']


### Generate GEX DataFrame with Fewer Genes

In [35]:
# Write one parquet file per gene-count cutoff, keeping the top N rows (genes)
# as ranked by rank_type='var'.
for inst_top in [5000, 2500, 1000, 500, 250, 100]:
    # Reload the full matrix on every pass so each filter starts from all genes.
    net.load_df(df['gex-cat'])
    net.filter_N_top(inst_rc='row', N_top=inst_top, rank_type='var')
    inst_df = net.export_df()

    # Stringify the column labels before writing.
    inst_df.columns = list(map(str, inst_df.columns))
    print(inst_top, inst_df.shape)

    out_path = f'../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/processed_data/gex-cat_{inst_top}-var.parquet'
    inst_df.to_parquet(out_path)

5000 (5000, 7864)
2500 (2500, 7864)
1000 (1000, 7864)
500 (500, 7864)
250 (250, 7864)
100 (100, 7864)


### Save ADT Data

In [32]:
# Save the labeled ADT matrix. Work on a copy so df['adt-cat'] keeps its tuple
# column labels; only the saved frame gets stringified labels.
adt_out_path = '../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/processed_data/adt-cat.parquet'
inst_df = deepcopy(df['adt-cat'])
inst_df.columns = list(map(str, inst_df.columns))
inst_df.to_parquet(adt_out_path)