{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.0 10K PBMC CITE-seq Assign Cell Type\n",
    "This notebook performs hierarchical clustering on antibody derived tag (ADT) data, assigns tentative cell types based on ADT clustering, transfers cell type assignment to the gene expression data (GEX), and saves the resulting labeled and filtered data to parquet files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from copy import deepcopy\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">> clustergrammer2 backend version 0.4.2\n"
     ]
    }
   ],
   "source": [
    "from clustergrammer2 import net\n",
    "import helper_functions as hf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 30.8 s, sys: 459 ms, total: 31.2 s\n",
      "Wall time: 30.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "inst_path = '../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/filtered_feature_bc_matrix/'\n",
    "feature_data = hf.load_v3_comp_sparse_feat_matrix(inst_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clean ADT Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "inst_features = feature_data['adt']['features']\n",
    "keep_indexes = []\n",
    "keep_features = []\n",
    "for inst_index in range(len(inst_features)):\n",
    "    inst_feature = inst_features[inst_index]\n",
    "    if '_control' not in inst_feature:\n",
    "        keep_indexes.append(inst_index)\n",
    "        keep_features.append(inst_feature)\n",
    "\n",
    "feature_data['adt']['features'] = keep_features\n",
    "feature_data['adt']['mat'] = feature_data['adt']['mat'][keep_indexes,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gex\n",
      "33538 7865\n",
      "(33538, 7865) \n",
      "\n",
      "adt\n",
      "14 7865\n",
      "(14, 7865) \n",
      "\n"
     ]
    }
   ],
   "source": [
    "hf.check_feature_data_size(feature_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "adt (14, 7865)\n",
      "gex (33538, 7865)\n",
      "CPU times: user 385 ms, sys: 506 ms, total: 892 ms\n",
      "Wall time: 899 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "df = hf.convert_feature_data_to_df_dict(feature_data, make_sparse=False)\n",
    "print('adt', df['adt'].shape)\n",
    "print('gex', df['gex'].shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cluster Cells in ADT Space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['adt-ash'] = np.arcsinh(df['adt']/5)\n",
    "df['adt-ash'].shape\n",
    "df['adt-ash'].columns = [(x,) for x in df['adt-ash'].columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2ab9c36b30744fdfb0272ece9e89c6c2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"CD3\", \"ini\": 14, \"clust\": 9, \"rank\": 11, \"rankvar\": 9, \"group\"…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "net.load_df(df['adt-ash'])\n",
    "net.normalize(axis='row', norm_type='zscore')\n",
    "net.cluster()\n",
    "net.dendro_cats(axis='col', dendro_level=4)\n",
    "net.cluster()\n",
    "net.widget()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell Types\n",
    "We are assigning the following cell type definitions to the clusters identified in ADT space:\n",
    "\n",
    "1: CD8 T Cells\n",
    "\n",
    "3: CD4+ CD45RA+ T Cells \n",
    "\n",
    "4: CD4+ CD45RO+ T Cells\n",
    "\n",
    "6: Myeloid (CD14)\n",
    "\n",
    "6: CD15+ CD16+\n",
    "\n",
    "7: B Cells (CD19)\n",
    "\n",
    "8: NK Cells (CD56, CD16)\n",
    "\n",
    "CD15 granulocyte, mcarophage monocyte"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ct_dict = {}\n",
    "ct_dict['Group 4: cat-1'] = 'Cell Type: CD8 T Cells'\n",
    "ct_dict['Group 4: cat-3'] = 'Cell Type: CD4+ CD45RA+ T Cells '\n",
    "ct_dict['Group 4: cat-4'] = 'Cell Type: CD4+ CD45RO+ T Cells'\n",
    "ct_dict['Group 4: cat-5'] = 'Cell Type: Myeloid CD14'\n",
    "ct_dict['Group 4: cat-6'] = 'Cell Type: CD15+ CD16+'\n",
    "ct_dict['Group 4: cat-7'] = 'Cell Type: B Cells CD19'\n",
    "ct_dict['Group 4: cat-8'] = 'Cell Type: NK Cells CD56, CD16'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Drop outlier cat-2 single cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(14, 7865)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(14, 7864)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['adt-cat'] = net.export_df()\n",
    "print(df['adt-cat'].shape)\n",
    "keep_cols = [x for x in df['adt-cat'].columns.tolist() if x[1] != 'Group 4: cat-2']\n",
    "df['adt-cat'] = df['adt-cat'][keep_cols]\n",
    "df['adt-cat'].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['adt-cat'].columns = [(x[0], ct_dict[x[1]]) for x in df['adt-cat'].columns.tolist()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Drop outlier cell from GEX Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7864\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['AAACCCAAGATTGTGA', 'AAACCCACATCGGTTA', 'AAACCCAGTACCGCGT']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "keep_barcodes = [x[0] for x in df['adt-cat'].columns.tolist()]\n",
    "print(len(keep_barcodes))\n",
    "keep_barcodes[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['gex'] = df['gex'][keep_barcodes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(33538, 7864)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['gex'].shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cell Type in ADT Space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b4c56593b2ba455d89f3eb8ca4ece595",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "ExampleWidget(network='{\"row_nodes\": [{\"name\": \"CD3\", \"ini\": 14, \"clust\": 9, \"rank\": 4, \"rankvar\": 6, \"group\":…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "net.load_df(df['adt-cat'])\n",
    "net.widget()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Transfer Cell Types to GEX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['gex-cat'] = deepcopy(df['gex'])\n",
    "df['gex-cat'].columns = df['adt-cat'].columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Drop Ribosomal and Mitochondrial Genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_ribo_mito(df):\n",
    "    all_genes = df.index.tolist()\n",
    "    print(len(all_genes))\n",
    "    keep_genes = [x for x in all_genes if 'RPL' not in x]\n",
    "    keep_genes = [x for x in keep_genes if 'RPS' not in x]\n",
    "    print(len(keep_genes))\n",
    "\n",
    "    df = df.loc[keep_genes]\n",
    "    df.shape\n",
    "\n",
    "    # Removing Mitochondrial Genes\n",
    "    list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',\n",
    "                    'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']\n",
    "\n",
    "    all_genes = df.index.tolist()\n",
    "    mito_genes = [x for x in all_genes if 'MT-' == x[:3] or \n",
    "                 x.split('_')[0] in list_mito_genes]\n",
    "    print(mito_genes)\n",
    "\n",
    "    keep_genes = [x for x in all_genes if x not in mito_genes]\n",
    "    df = df.loc[keep_genes]\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "33538\n",
      "33346\n",
      "['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L7', 'MTRNR2L5', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']\n"
     ]
    }
   ],
   "source": [
    "df['gex-cat'] = drop_ribo_mito(df['gex-cat'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generate GEX DataFrame with Fewer Genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5000 (5000, 7864)\n",
      "2500 (2500, 7864)\n",
      "1000 (1000, 7864)\n",
      "500 (500, 7864)\n",
      "250 (250, 7864)\n",
      "100 (100, 7864)\n"
     ]
    }
   ],
   "source": [
    "for inst_top in [5000, 2500, 1000, 500, 250, 100]:\n",
    "    net.load_df(df['gex-cat'])\n",
    "    net.filter_N_top(inst_rc='row', N_top=inst_top, rank_type='var')\n",
    "    inst_df = net.export_df()\n",
    "    inst_df.columns = [str(x) for x in inst_df.columns]\n",
    "    print(inst_top, inst_df.shape)\n",
    "    inst_df.to_parquet('../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/processed_data/gex-cat_' + \n",
    "                        str(inst_top) + '-var.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save ADT Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "inst_df = deepcopy(df['adt-cat'])\n",
    "inst_df.columns = [str(x) for x in inst_df.columns]\n",
    "inst_df.to_parquet('../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/processed_data/adt-cat.parquet')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}