{
"cells": [
{
"cell_type": "markdown",
"id": "e3c2f575",
"metadata": {},
"source": [
"# 10X Metadata\n",
"\n",
"We combine some cell metadata with TCR data, and export the results."
]
},
{
"cell_type": "markdown",
"id": "c821828a",
"metadata": {},
"source": [
"# Loading packages and data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a4fa4e5f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"scanpy==1.7.1 anndata==0.7.6 umap==0.5.1 numpy==1.20.1 scipy==1.6.1 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.8.3 louvain==0.7.0 leidenalg==0.8.3\n"
]
}
],
"source": [
"import pandas as pd # Pandas for data analysis.\n",
"import numpy as np\n",
"import scipy.stats as ss\n",
"import matplotlib.pyplot as plt # For basic plotting.\n",
"import seaborn as sns # For pretty visualization in Seaborn. See https://seaborn.pydata.org/\n",
"from IPython.display import display # Pretty display of data frames.\n",
"\n",
"from sklearn import base\n",
"from sklearn.feature_selection import chi2, f_classif\n",
"\n",
"import scanpy as sc\n",
"sc.settings.verbosity = 1 # verbosity: errors (0), warnings (1), info (2), hints (3)\n",
"sc.logging.print_header()\n",
"sc.settings.set_figure_params(dpi=80, facecolor='white')\n",
"\n",
"# Put plots inline rather than in a pop-up.\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0239a4d5",
"metadata": {},
"outputs": [],
"source": [
"def hrule(repchar = '=', length=80):\n",
" '''\n",
" A quick function to print a horizontal line.\n",
" '''\n",
" if len(repchar) == 1:\n",
" print(repchar*length)"
]
},
{
"cell_type": "markdown",
"id": "d04da88a",
"metadata": {},
"source": [
"First we load the TCR data for all three experiments."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1f43eeae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22435 TCR records loaded for exp1\n",
"15139 TCR records loaded for exp2\n",
"28104 TCR records loaded for exp3\n"
]
}
],
"source": [
"experiments = ['exp1', 'exp2', 'exp3']\n",
"annot_df = {}  # Experiment name -> DataFrame of TCR records.\n",
"for exp in experiments:\n",
"    # One CSV per experiment; its first column becomes the index.\n",
"    tcr_path = 'Raw/Final_TCR_xls_AllExps_10x_CJAug_{}.csv'.format(exp)\n",
"    tcr_records = pd.read_csv(tcr_path, index_col=0)\n",
"    annot_df[exp] = tcr_records\n",
"    print('{} TCR records loaded for {}'.format(len(tcr_records), exp))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "733e68f2",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" clone_ID | \n",
" hash_ID | \n",
" nCount_CH | \n",
" nCount_RNA | \n",
" nFeature_CH | \n",
" nFeature_RNA | \n",
" orig_ident | \n",
" percent_mt | \n",
" clone | \n",
" well | \n",
" experiment | \n",
" CD_type | \n",
"
\n",
" \n",
" CellID | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" exp1_AAACCTGAGACAGGCT-1 | \n",
" exp1_2 | \n",
" exp1_Sample2 | \n",
" 1028.0 | \n",
" 6876.0 | \n",
" 6 | \n",
" 2669 | \n",
" exp1 | \n",
" 3.955788 | \n",
" 2 | \n",
" Sample2 | \n",
" exp1 | \n",
" CD4 | \n",
"
\n",
" \n",
" exp1_AAACCTGAGCCCAATT-1 | \n",
" exp1_non | \n",
" exp1_Sample1 | \n",
" 2166.0 | \n",
" 6246.0 | \n",
" 6 | \n",
" 2368 | \n",
" exp1 | \n",
" 2.161383 | \n",
" non | \n",
" Sample1 | \n",
" exp1 | \n",
" unknown | \n",
"
\n",
" \n",
" exp1_AAACCTGAGGACAGAA-1 | \n",
" exp1_non | \n",
" exp1_Sample6 | \n",
" 813.0 | \n",
" 4615.0 | \n",
" 6 | \n",
" 1718 | \n",
" exp1 | \n",
" 2.665222 | \n",
" non | \n",
" Sample6 | \n",
" exp1 | \n",
" unknown | \n",
"
\n",
" \n",
" exp1_AAACCTGCAACAACCT-1 | \n",
" exp1_5 | \n",
" exp1_Sample5 | \n",
" 1092.0 | \n",
" 11565.0 | \n",
" 6 | \n",
" 3613 | \n",
" exp1 | \n",
" 2.075227 | \n",
" 5 | \n",
" Sample5 | \n",
" exp1 | \n",
" CD4 | \n",
"
\n",
" \n",
" exp1_AAACCTGCAGACACTT-1 | \n",
" exp1_7 | \n",
" exp1_Sample4 | \n",
" 847.0 | \n",
" 4440.0 | \n",
" 6 | \n",
" 2005 | \n",
" exp1 | \n",
" 0.990991 | \n",
" 7 | \n",
" Sample4 | \n",
" exp1 | \n",
" CD4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" clone_ID hash_ID nCount_CH nCount_RNA \\\n",
"CellID \n",
"exp1_AAACCTGAGACAGGCT-1 exp1_2 exp1_Sample2 1028.0 6876.0 \n",
"exp1_AAACCTGAGCCCAATT-1 exp1_non exp1_Sample1 2166.0 6246.0 \n",
"exp1_AAACCTGAGGACAGAA-1 exp1_non exp1_Sample6 813.0 4615.0 \n",
"exp1_AAACCTGCAACAACCT-1 exp1_5 exp1_Sample5 1092.0 11565.0 \n",
"exp1_AAACCTGCAGACACTT-1 exp1_7 exp1_Sample4 847.0 4440.0 \n",
"\n",
" nFeature_CH nFeature_RNA orig_ident percent_mt \\\n",
"CellID \n",
"exp1_AAACCTGAGACAGGCT-1 6 2669 exp1 3.955788 \n",
"exp1_AAACCTGAGCCCAATT-1 6 2368 exp1 2.161383 \n",
"exp1_AAACCTGAGGACAGAA-1 6 1718 exp1 2.665222 \n",
"exp1_AAACCTGCAACAACCT-1 6 3613 exp1 2.075227 \n",
"exp1_AAACCTGCAGACACTT-1 6 2005 exp1 0.990991 \n",
"\n",
" clone well experiment CD_type \n",
"CellID \n",
"exp1_AAACCTGAGACAGGCT-1 2 Sample2 exp1 CD4 \n",
"exp1_AAACCTGAGCCCAATT-1 non Sample1 exp1 unknown \n",
"exp1_AAACCTGAGGACAGAA-1 non Sample6 exp1 unknown \n",
"exp1_AAACCTGCAACAACCT-1 5 Sample5 exp1 CD4 \n",
"exp1_AAACCTGCAGACACTT-1 7 Sample4 exp1 CD4 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"meta_all = pd.read_csv('Processed/Allcells_metadata.csv', index_col=0)\n",
"meta_all.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e12e43a8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Metadata loaded for 13493 cells in exp1\n",
"Metadata loaded for 11143 cells in exp2\n",
"Metadata loaded for 22277 cells in exp3\n"
]
}
],
"source": [
"meta_df = {}  # Experiment name -> metadata rows of that experiment.\n",
"for exp in experiments:\n",
"    # Take an explicit copy: later cells replace the index and add TCR columns,\n",
"    # which should never act on (or warn about) a slice of meta_all.\n",
"    meta_df[exp] = meta_all[meta_all.orig_ident == exp].copy()\n",
"    print('Metadata loaded for {} cells in {}'.format(len(meta_df[exp]), exp))"
]
},
{
"cell_type": "markdown",
"id": "2d82a723",
"metadata": {},
"source": [
"Each of these cell counts would be half the number of TCR records if every cell had exactly one TCRA and one TCRB record. That is not the case, so some further cleanup of the data is needed."
]
},
{
"cell_type": "markdown",
"id": "84cac1ec",
"metadata": {},
"source": [
"# Cleaning up data\n",
"\n",
"First, we strip the experiment prefix from each cell ID and remove from the metadata any cells whose barcode is repeated within an experiment."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1588bed2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13493 rows in cell metadata. 13493 with unique barcodes\n",
"Only unique barcodes remain in exp1.\n",
"11143 rows in cell metadata. 11143 with unique barcodes\n",
"Only unique barcodes remain in exp2.\n",
"22277 rows in cell metadata. 22277 with unique barcodes\n",
"Only unique barcodes remain in exp3.\n"
]
}
],
"source": [
"for exp in experiments:\n",
"    exp_meta = meta_df[exp]\n",
"    # Strip the 'expN_' prefix from each cell ID. Taking the last piece of a single split\n",
"    # leaves already-stripped barcodes untouched, so this cell can safely be re-run.\n",
"    barcodes = pd.Index([str(cell_id).split('_', 1)[-1] for cell_id in exp_meta.index])\n",
"    is_unique = ~barcodes.duplicated(keep=False)  # Omit barcodes that appear more than once.\n",
"    print('{} rows in cell metadata. {} with unique barcodes'.format(len(barcodes), is_unique.sum()))\n",
"    # Build a new frame (original row order preserved) instead of mutating the index in place.\n",
"    unique_meta = exp_meta.loc[is_unique].copy()\n",
"    unique_meta.index = barcodes[is_unique]\n",
"    meta_df[exp] = unique_meta\n",
"    print('Only unique barcodes remain in {}.'.format(exp))"
]
},
{
"cell_type": "markdown",
"id": "70b1adf7",
"metadata": {},
"source": [
"This is good. No repeated barcodes within an experiment. There might be some across different experiments."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3cc5bd86",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" clone_ID | \n",
" hash_ID | \n",
" nCount_CH | \n",
" nCount_RNA | \n",
" nFeature_CH | \n",
" nFeature_RNA | \n",
" orig_ident | \n",
" percent_mt | \n",
" clone | \n",
" well | \n",
" experiment | \n",
" CD_type | \n",
"
\n",
" \n",
" \n",
" \n",
" CACAGGCCATCGGAAG-1 | \n",
" exp3_non | \n",
" exp3_Sample2 | \n",
" 3172.0 | \n",
" 4911.0 | \n",
" 6 | \n",
" 2377 | \n",
" exp3 | \n",
" 1.323559 | \n",
" non | \n",
" Sample2 | \n",
" exp3 | \n",
" unknown | \n",
"
\n",
" \n",
" ATCGAGTCACGAAAGC-1 | \n",
" exp3_4 | \n",
" exp3_Sample1 | \n",
" 901.0 | \n",
" 8349.0 | \n",
" 5 | \n",
" 3155 | \n",
" exp3 | \n",
" 1.593005 | \n",
" 4 | \n",
" Sample1 | \n",
" exp3 | \n",
" CD4 | \n",
"
\n",
" \n",
" CTAGTGAAGGGTATCG-1 | \n",
" exp3_8 | \n",
" exp3_Sample4 | \n",
" 308.0 | \n",
" 2927.0 | \n",
" 6 | \n",
" 1496 | \n",
" exp3 | \n",
" 1.947386 | \n",
" 8 | \n",
" Sample4 | \n",
" exp3 | \n",
" CD4 | \n",
"
\n",
" \n",
" TGACTAGTCATCGCTC-1 | \n",
" exp3_1 | \n",
" exp3_Sample5 | \n",
" 308.0 | \n",
" 1591.0 | \n",
" 6 | \n",
" 1100 | \n",
" exp3 | \n",
" 0.817096 | \n",
" 1 | \n",
" Sample5 | \n",
" exp3 | \n",
" CD4 | \n",
"
\n",
" \n",
" CGTCACTTCATCATTC-1 | \n",
" exp3_non | \n",
" exp3_Sample6 | \n",
" 436.0 | \n",
" 2766.0 | \n",
" 6 | \n",
" 1509 | \n",
" exp3 | \n",
" 0.795372 | \n",
" non | \n",
" Sample6 | \n",
" exp3 | \n",
" unknown | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" clone_ID hash_ID nCount_CH nCount_RNA \\\n",
"CACAGGCCATCGGAAG-1 exp3_non exp3_Sample2 3172.0 4911.0 \n",
"ATCGAGTCACGAAAGC-1 exp3_4 exp3_Sample1 901.0 8349.0 \n",
"CTAGTGAAGGGTATCG-1 exp3_8 exp3_Sample4 308.0 2927.0 \n",
"TGACTAGTCATCGCTC-1 exp3_1 exp3_Sample5 308.0 1591.0 \n",
"CGTCACTTCATCATTC-1 exp3_non exp3_Sample6 436.0 2766.0 \n",
"\n",
" nFeature_CH nFeature_RNA orig_ident percent_mt clone \\\n",
"CACAGGCCATCGGAAG-1 6 2377 exp3 1.323559 non \n",
"ATCGAGTCACGAAAGC-1 5 3155 exp3 1.593005 4 \n",
"CTAGTGAAGGGTATCG-1 6 1496 exp3 1.947386 8 \n",
"TGACTAGTCATCGCTC-1 6 1100 exp3 0.817096 1 \n",
"CGTCACTTCATCATTC-1 6 1509 exp3 0.795372 non \n",
"\n",
" well experiment CD_type \n",
"CACAGGCCATCGGAAG-1 Sample2 exp3 unknown \n",
"ATCGAGTCACGAAAGC-1 Sample1 exp3 CD4 \n",
"CTAGTGAAGGGTATCG-1 Sample4 exp3 CD4 \n",
"TGACTAGTCATCGCTC-1 Sample5 exp3 CD4 \n",
"CGTCACTTCATCATTC-1 Sample6 exp3 unknown "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the last experiment explicitly rather than relying on a leftover loop variable.\n",
"meta_df[experiments[-1]].head()"
]
},
{
"cell_type": "markdown",
"id": "615b91ec",
"metadata": {},
"source": [
"# Extract TCRA and TCRB data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e7dcdab2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing exp1, TRA sequences\n",
"10658 cells with at least 1 TRA sequences\n",
"542 cells with at least 2 TRA sequences\n",
"================================================================================\n",
"Processing exp1, TRB sequences\n",
"11235 cells with at least 1 TRB sequences\n",
"================================================================================\n",
"Processing exp2, TRA sequences\n",
"7175 cells with at least 1 TRA sequences\n",
"184 cells with at least 2 TRA sequences\n",
"================================================================================\n",
"Processing exp2, TRB sequences\n",
"7780 cells with at least 1 TRB sequences\n",
"================================================================================\n",
"Processing exp3, TRA sequences\n",
"12903 cells with at least 1 TRA sequences\n",
"912 cells with at least 2 TRA sequences\n",
"================================================================================\n",
"Processing exp3, TRB sequences\n",
"14289 cells with at least 1 TRB sequences\n",
"================================================================================\n"
]
}
],
"source": [
"for exp in experiments:\n",
"    a_df = annot_df[exp]\n",
"    for TR_chain in ['TRA', 'TRB']:\n",
"        print('Processing {}, {} sequences'.format(exp, TR_chain))\n",
"        # CDR3 sequences of this chain; an index label repeats when a cell has several records.\n",
"        TR_cdr3 = a_df.loc[a_df.chain == TR_chain, 'cdr3']\n",
"        # Number the records within each cell (0, 1, ...) and pivot to one row per cell,\n",
"        # one column per record; cells with fewer records get NaN in the extra columns.\n",
"        seq_rank = TR_cdr3.groupby(level=0).cumcount()\n",
"        TR_agg = pd.Series(\n",
"            TR_cdr3.values,\n",
"            index=pd.MultiIndex.from_arrays([TR_cdr3.index, seq_rank.values]),\n",
"        ).unstack()\n",
"        for j in TR_agg.columns:\n",
"            # Assignment aligns on the index, so cells absent from TR_agg get NaN.\n",
"            meta_df[exp][TR_chain+'_seq'+str(j+1)] = TR_agg[j]\n",
"            print('{} cells with at least {} {} sequences'.format(TR_agg[j].notna().sum(), j+1, TR_chain))\n",
"        hrule()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5b0fbee9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" clone_ID | \n",
" hash_ID | \n",
" nCount_CH | \n",
" nCount_RNA | \n",
" nFeature_CH | \n",
" nFeature_RNA | \n",
" orig_ident | \n",
" percent_mt | \n",
" clone | \n",
" well | \n",
" experiment | \n",
" CD_type | \n",
" TRA_seq1 | \n",
" TRA_seq2 | \n",
" TRB_seq1 | \n",
"
\n",
" \n",
" \n",
" \n",
" CACAGGCCATCGGAAG-1 | \n",
" exp3_non | \n",
" exp3_Sample2 | \n",
" 3172.0 | \n",
" 4911.0 | \n",
" 6 | \n",
" 2377 | \n",
" exp3 | \n",
" 1.323559 | \n",
" non | \n",
" Sample2 | \n",
" exp3 | \n",
" unknown | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" ATCGAGTCACGAAAGC-1 | \n",
" exp3_4 | \n",
" exp3_Sample1 | \n",
" 901.0 | \n",
" 8349.0 | \n",
" 5 | \n",
" 3155 | \n",
" exp3 | \n",
" 1.593005 | \n",
" 4 | \n",
" Sample1 | \n",
" exp3 | \n",
" CD4 | \n",
" CAVSDRGAGGFKTIF | \n",
" CAEASLLSGTYKYIF | \n",
" CASSPRDRATGELFF | \n",
"
\n",
" \n",
" CTAGTGAAGGGTATCG-1 | \n",
" exp3_8 | \n",
" exp3_Sample4 | \n",
" 308.0 | \n",
" 2927.0 | \n",
" 6 | \n",
" 1496 | \n",
" exp3 | \n",
" 1.947386 | \n",
" 8 | \n",
" Sample4 | \n",
" exp3 | \n",
" CD4 | \n",
" CATDQAGTALIF | \n",
" NaN | \n",
" CASSLVGVGADQPQHF | \n",
"
\n",
" \n",
" TGACTAGTCATCGCTC-1 | \n",
" exp3_1 | \n",
" exp3_Sample5 | \n",
" 308.0 | \n",
" 1591.0 | \n",
" 6 | \n",
" 1100 | \n",
" exp3 | \n",
" 0.817096 | \n",
" 1 | \n",
" Sample5 | \n",
" exp3 | \n",
" CD4 | \n",
" CALSESLNNNARLMF | \n",
" NaN | \n",
" CASSEGGKSGIVYEQYF | \n",
"
\n",
" \n",
" CGTCACTTCATCATTC-1 | \n",
" exp3_non | \n",
" exp3_Sample6 | \n",
" 436.0 | \n",
" 2766.0 | \n",
" 6 | \n",
" 1509 | \n",
" exp3 | \n",
" 0.795372 | \n",
" non | \n",
" Sample6 | \n",
" exp3 | \n",
" unknown | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" ACGATGTCAGCTGTTA-1 | \n",
" exp3_3 | \n",
" exp3_Sample4 | \n",
" 249.0 | \n",
" 2709.0 | \n",
" 6 | \n",
" 1566 | \n",
" exp3 | \n",
" 1.291990 | \n",
" 3 | \n",
" Sample4 | \n",
" exp3 | \n",
" CD4 | \n",
" CAMSSGGSNYKLTF | \n",
" NaN | \n",
" CSASSGIQPQHF | \n",
"
\n",
" \n",
" TGAGGGAAGTGGGTTG-1 | \n",
" exp3_8 | \n",
" exp3_Sample3 | \n",
" 723.0 | \n",
" 3927.0 | \n",
" 6 | \n",
" 1724 | \n",
" exp3 | \n",
" 0.713012 | \n",
" 8 | \n",
" Sample3 | \n",
" exp3 | \n",
" CD4 | \n",
" CATDQAGTALIF | \n",
" NaN | \n",
" CASSLVGVGADQPQHF | \n",
"
\n",
" \n",
" GAATGAATCTGTCAAG-1 | \n",
" exp3_non | \n",
" exp3_Sample2 | \n",
" 1554.0 | \n",
" 1569.0 | \n",
" 5 | \n",
" 1133 | \n",
" exp3 | \n",
" 5.544933 | \n",
" non | \n",
" Sample2 | \n",
" exp3 | \n",
" unknown | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" GTAACGTTCACAAACC-1 | \n",
" exp3_non | \n",
" exp3_Sample1 | \n",
" 585.0 | \n",
" 873.0 | \n",
" 6 | \n",
" 663 | \n",
" exp3 | \n",
" 8.820160 | \n",
" non | \n",
" Sample1 | \n",
" exp3 | \n",
" unknown | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" CGAATGTTCAGTTGAC-1 | \n",
" exp3_12 | \n",
" exp3_Sample2 | \n",
" 2406.0 | \n",
" 5952.0 | \n",
" 6 | \n",
" 2688 | \n",
" exp3 | \n",
" 1.108871 | \n",
" 12 | \n",
" Sample2 | \n",
" exp3 | \n",
" CD4 | \n",
" CILRGYTGNQFYF | \n",
" NaN | \n",
" CASSPNRDVGSGYTF | \n",
"
\n",
" \n",
"
\n",
"
22277 rows × 15 columns
\n",
"
"
],
"text/plain": [
" clone_ID hash_ID nCount_CH nCount_RNA \\\n",
"CACAGGCCATCGGAAG-1 exp3_non exp3_Sample2 3172.0 4911.0 \n",
"ATCGAGTCACGAAAGC-1 exp3_4 exp3_Sample1 901.0 8349.0 \n",
"CTAGTGAAGGGTATCG-1 exp3_8 exp3_Sample4 308.0 2927.0 \n",
"TGACTAGTCATCGCTC-1 exp3_1 exp3_Sample5 308.0 1591.0 \n",
"CGTCACTTCATCATTC-1 exp3_non exp3_Sample6 436.0 2766.0 \n",
"... ... ... ... ... \n",
"ACGATGTCAGCTGTTA-1 exp3_3 exp3_Sample4 249.0 2709.0 \n",
"TGAGGGAAGTGGGTTG-1 exp3_8 exp3_Sample3 723.0 3927.0 \n",
"GAATGAATCTGTCAAG-1 exp3_non exp3_Sample2 1554.0 1569.0 \n",
"GTAACGTTCACAAACC-1 exp3_non exp3_Sample1 585.0 873.0 \n",
"CGAATGTTCAGTTGAC-1 exp3_12 exp3_Sample2 2406.0 5952.0 \n",
"\n",
" nFeature_CH nFeature_RNA orig_ident percent_mt clone \\\n",
"CACAGGCCATCGGAAG-1 6 2377 exp3 1.323559 non \n",
"ATCGAGTCACGAAAGC-1 5 3155 exp3 1.593005 4 \n",
"CTAGTGAAGGGTATCG-1 6 1496 exp3 1.947386 8 \n",
"TGACTAGTCATCGCTC-1 6 1100 exp3 0.817096 1 \n",
"CGTCACTTCATCATTC-1 6 1509 exp3 0.795372 non \n",
"... ... ... ... ... ... \n",
"ACGATGTCAGCTGTTA-1 6 1566 exp3 1.291990 3 \n",
"TGAGGGAAGTGGGTTG-1 6 1724 exp3 0.713012 8 \n",
"GAATGAATCTGTCAAG-1 5 1133 exp3 5.544933 non \n",
"GTAACGTTCACAAACC-1 6 663 exp3 8.820160 non \n",
"CGAATGTTCAGTTGAC-1 6 2688 exp3 1.108871 12 \n",
"\n",
" well experiment CD_type TRA_seq1 \\\n",
"CACAGGCCATCGGAAG-1 Sample2 exp3 unknown NaN \n",
"ATCGAGTCACGAAAGC-1 Sample1 exp3 CD4 CAVSDRGAGGFKTIF \n",
"CTAGTGAAGGGTATCG-1 Sample4 exp3 CD4 CATDQAGTALIF \n",
"TGACTAGTCATCGCTC-1 Sample5 exp3 CD4 CALSESLNNNARLMF \n",
"CGTCACTTCATCATTC-1 Sample6 exp3 unknown NaN \n",
"... ... ... ... ... \n",
"ACGATGTCAGCTGTTA-1 Sample4 exp3 CD4 CAMSSGGSNYKLTF \n",
"TGAGGGAAGTGGGTTG-1 Sample3 exp3 CD4 CATDQAGTALIF \n",
"GAATGAATCTGTCAAG-1 Sample2 exp3 unknown NaN \n",
"GTAACGTTCACAAACC-1 Sample1 exp3 unknown NaN \n",
"CGAATGTTCAGTTGAC-1 Sample2 exp3 CD4 CILRGYTGNQFYF \n",
"\n",
" TRA_seq2 TRB_seq1 \n",
"CACAGGCCATCGGAAG-1 NaN NaN \n",
"ATCGAGTCACGAAAGC-1 CAEASLLSGTYKYIF CASSPRDRATGELFF \n",
"CTAGTGAAGGGTATCG-1 NaN CASSLVGVGADQPQHF \n",
"TGACTAGTCATCGCTC-1 NaN CASSEGGKSGIVYEQYF \n",
"CGTCACTTCATCATTC-1 NaN NaN \n",
"... ... ... \n",
"ACGATGTCAGCTGTTA-1 NaN CSASSGIQPQHF \n",
"TGAGGGAAGTGGGTTG-1 NaN CASSLVGVGADQPQHF \n",
"GAATGAATCTGTCAAG-1 NaN NaN \n",
"GTAACGTTCACAAACC-1 NaN NaN \n",
"CGAATGTTCAGTTGAC-1 NaN CASSPNRDVGSGYTF \n",
"\n",
"[22277 rows x 15 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show a few rows of the last experiment with the new TCR columns;\n",
"# name the experiment explicitly rather than relying on a leftover loop variable.\n",
"meta_df[experiments[-1]].head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8402263f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking exp1\n",
"3 cells with no clone, having TCRA or TCRB\n",
"122 cells with clone, but no TCRA and no TCRB\n",
"================================================================================\n",
"Checking exp2\n",
"0 cells with no clone, having TCRA or TCRB\n",
"182 cells with clone, but no TCRA and no TCRB\n",
"================================================================================\n",
"Checking exp3\n",
"1 cells with no clone, having TCRA or TCRB\n",
"106 cells with clone, but no TCRA and no TCRB\n",
"================================================================================\n"
]
}
],
"source": [
"for exp in experiments:\n",
"    print('Checking {}'.format(exp))\n",
"    exp_meta = meta_df[exp]\n",
"    lacks_clone = exp_meta['clone_ID'] == exp + '_non'\n",
"    lacks_both_chains = exp_meta['TRA_seq1'].isna() & exp_meta['TRB_seq1'].isna()\n",
"    # No clone assigned, yet at least one of the two chains has a sequence.\n",
"    n_tcr_without_clone = (lacks_clone & ~lacks_both_chains).sum()\n",
"    # Clone assigned, yet neither chain has a sequence.\n",
"    n_clone_without_tcr = (~lacks_clone & lacks_both_chains).sum()\n",
"    print('{} cells with no clone, having TCRA or TCRB'.format(n_tcr_without_clone))\n",
"    print('{} cells with clone, but no TCRA and no TCRB'.format(n_clone_without_tcr))\n",
"    hrule()"
]
},
{
"cell_type": "markdown",
"id": "c2b9a84e",
"metadata": {},
"source": [
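"That's not perfect, but not so bad: in each experiment fewer than 200 cells (out of more than 11,000) show a mismatch between the clone assignment and the TCR data."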
]
},
{
"cell_type": "markdown",
"id": "4fe97e9a",
"metadata": {},
"source": [
"# Saving data!"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f813da6e",
"metadata": {},
"outputs": [],
"source": [
"for exp in experiments:\n",
"    # One output CSV per experiment, holding the metadata with the added TCR columns.\n",
"    out_path = 'Processed/metadata_withTR_{}.csv'.format(exp)\n",
"    meta_df[exp].to_csv(out_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "351039ea",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}