{
"cells": [
{
"cell_type": "markdown",
"id": "8c81ef75-6ba8-4ad7-b023-defcd7b22887",
"metadata": {},
"source": [
"# Pseudobulk data with fraction and median non-zero\n",
"\n",
"For use in plotting, we'll compute \"pseudobulk\" summary statistics for gene expression based on cell metadata grouping.\n",
"\n",
"Note that this approach doesn't use the careful pseudobulk approaches implemented in packages like `scran` - we're instead taking the fraction of cells in which gene expression was detected (`pct_exp`) and the median of expression in those non-zero cells (`median_expression`).\n",
"\n",
"For use in our visualization tools, we'll group by each of our cell type labeling levels (AIFI_L1, _L2, and _L3), as well as by both cell type and the originating, age-related cohort for these cells."
]
},
{
"cell_type": "markdown",
"id": "397d8491-1006-400e-b28a-f78b98f9a632",
"metadata": {},
"source": [
"## Load packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2b94a539-8d6b-483b-93c1-d6e5ac4893f1",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
"warnings.simplefilter(action='ignore', category=RuntimeWarning)\n",
"\n",
"from datetime import date\n",
"import hisepy\n",
"import numpy as np\n",
"import os\n",
"import pandas as pd\n",
"import pickle\n",
"import re\n",
"import scanpy as sc\n",
"import scipy.sparse as scs"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3f700a4a-402c-483f-a13d-7376b31f29a7",
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists('output'):\n",
" os.mkdir('output')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c5fdbbf7-7c03-468e-b443-dde31833164e",
"metadata": {},
"outputs": [],
"source": [
"out_files = []"
]
},
{
"cell_type": "markdown",
"id": "61b7c6d3-fbd8-4e52-b53b-cc06ebb0f175",
"metadata": {},
"source": [
"## Helper functions"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b1863da8-23d2-4b2f-80aa-fa5948276a23",
"metadata": {},
"outputs": [],
"source": [
"def read_adata_uuid(h5ad_uuid):\n",
" h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)\n",
" if not os.path.isdir(h5ad_path):\n",
" hise_res = hisepy.cache_files([h5ad_uuid])\n",
" h5ad_filename = os.listdir(h5ad_path)[0]\n",
" h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)\n",
" adata = sc.read_h5ad(h5ad_file)\n",
" return adata"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "90ce8067-33c3-4ce2-81a3-9bd69acf4566",
"metadata": {},
"outputs": [],
"source": [
"def sparse_nz_median(sparse_mat, feat_names):\n",
" # transpose the matrix so values for each gene are together in memory\n",
" sparse_mat = sparse_mat.transpose().tocsr()\n",
" \n",
" # get dimensions for calculations\n",
" n_cells = sparse_mat.shape[1]\n",
" \n",
" # compute the fraction non-zero per gene (row = axis 1)\n",
" pct_exp = sparse_mat.getnnz(axis = 1) / n_cells\n",
" \n",
" # split non-zero values from the matrix directly\n",
" split_values = np.split(sparse_mat.data, sparse_mat.indptr)\n",
" \n",
" # remove first and last entry in split - indptr starts with 0 and ends with the last value\n",
" # so the first and last split entries are not meaningful\n",
" split_values = split_values[1:-1].copy()\n",
" \n",
" # compute medians for every split\n",
" median_expression = [np.median(x) for x in split_values]\n",
"\n",
" # assemble a DataFrame with results\n",
" res_df = pd.DataFrame({\n",
" 'gene': feat_names,\n",
" 'pct_exp': pct_exp,\n",
" 'median_expression': median_expression\n",
" })\n",
" \n",
" # replace missing values from splits without any values\n",
" # these were not detected so median expression is 0\n",
" res_df['median_expression'] = res_df['median_expression'].fillna(0)\n",
" \n",
" return res_df"
]
},
{
"cell_type": "markdown",
"id": "3e82c8a8-1ea0-404d-af6b-daaadf67a5c2",
"metadata": {},
"source": [
"## Retrieve atlas data from HISE"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ea3fa84a-b9df-412d-895c-aad9023fb147",
"metadata": {},
"outputs": [],
"source": [
"h5ad_uuid = '5b3a0cc8-1811-4126-90c7-e9cdd41459fd'"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "afab8559-2f7d-4757-a1c3-7ae0c57a5589",
"metadata": {},
"outputs": [],
"source": [
"adata = read_adata_uuid(h5ad_uuid)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7d596218-3a23-4075-82c7-d3426d7c7327",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1823666, 33538)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adata.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2e62ac5a-0bc9-4dee-a6f3-e006418e54e7",
"metadata": {},
"outputs": [],
"source": [
"obs = adata.obs"
]
},
{
"cell_type": "markdown",
"id": "fd3a8bc6-2f61-4362-9ef7-7f2aee408b59",
"metadata": {},
"source": [
"### Summarized expression by cell type\n",
"\n",
"For our UMAP viewer, we plot plot expression at each level of cell type resolution"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3e7d51f6-a36e-49e4-85bf-f49ae89b5f29",
"metadata": {},
"outputs": [],
"source": [
"levels = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b8948d6b-c3fb-406c-b492-d34698d9a6c9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
">>> AIFI_L1\n",
">>> AIFI_L2\n",
">>> AIFI_L3\n",
"CPU times: user 7min 2s, sys: 1min 29s, total: 8min 31s\n",
"Wall time: 8min 31s\n"
]
}
],
"source": [
"%%time\n",
"level_results = {}\n",
"\n",
"for level in levels:\n",
" print('>>> {l}'.format(l = level))\n",
" \n",
" level_obs = obs.groupby(level)\n",
" \n",
" level_dict = {}\n",
" for cell_type, type_obs in level_obs:\n",
" type_bc = type_obs['barcodes']\n",
" type_adata = adata[adata.obs['barcodes'].isin(type_bc)]\n",
" \n",
" type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())\n",
" \n",
" level_dict[cell_type] = type_res\n",
" \n",
" level_results[level] = level_dict"
]
},
{
"cell_type": "markdown",
"id": "fc4f03d7-c118-41e3-bc33-f5dd9e3913e6",
"metadata": {},
"source": [
"### Save assembled results for each level"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "10e79d57-408b-4fc4-b704-bca00e342c23",
"metadata": {},
"outputs": [],
"source": [
"level_combined = {}\n",
"for level in levels:\n",
" all_level = pd.concat(level_results[level])\n",
" all_level = all_level.reset_index(drop = False, names = [level, 'row_num'])\n",
" all_level = all_level.drop('row_num', axis = 1)\n",
" level_combined[level] = all_level"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "719f8a2c-4a65-4aa5-a29a-8534ea025405",
"metadata": {},
"outputs": [],
"source": [
"for level, df in level_combined.items():\n",
" out_csv = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}.csv'.format(l = level, d = date.today())\n",
" df.to_csv(out_csv)\n",
" out_files.append(out_csv)\n",
" out_parquet = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}.parquet'.format(l = level, d = date.today())\n",
" df.to_parquet(out_parquet)\n",
" out_files.append(out_parquet)"
]
},
{
"cell_type": "markdown",
"id": "51da7b67-9f3e-45c8-b2de-eb0368452ec6",
"metadata": {},
"source": [
"### Restructure for use in visualization apps"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b6231894-f906-4a06-a059-e3022bdca9cd",
"metadata": {},
"outputs": [],
"source": [
"gene_results = {}\n",
"for level in levels:\n",
" all_res = pd.concat(level_results[level])\n",
" split_res = all_res.groupby('gene')\n",
" split_dict = {}\n",
" for gene, df in split_res:\n",
" df = df.reset_index(drop = False)\n",
" df = df.drop('level_1', axis = 1)\n",
" df = df.rename({'level_0': 'obs_group'}, axis = 1)\n",
" split_dict[gene] = df\n",
" \n",
" gene_results[level] = split_dict"
]
},
{
"cell_type": "markdown",
"id": "b2b6f5a3-754b-49fc-8ce5-3764482f5cba",
"metadata": {},
"source": [
"Check that a gene or two look accurate"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "52457d5c-cd46-4903-b399-64dfbe73868b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" obs_group | \n",
" gene | \n",
" pct_exp | \n",
" median_expression | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" B cell | \n",
" CD79A | \n",
" 0.988365 | \n",
" 2.726919 | \n",
"
\n",
" \n",
" 1 | \n",
" DC | \n",
" CD79A | \n",
" 0.079959 | \n",
" 0.725910 | \n",
"
\n",
" \n",
" 2 | \n",
" Erythrocyte | \n",
" CD79A | \n",
" 0.013926 | \n",
" 1.230366 | \n",
"
\n",
" \n",
" 3 | \n",
" ILC | \n",
" CD79A | \n",
" 0.120853 | \n",
" 1.526564 | \n",
"
\n",
" \n",
" 4 | \n",
" Monocyte | \n",
" CD79A | \n",
" 0.017343 | \n",
" 0.818575 | \n",
"
\n",
" \n",
" 5 | \n",
" NK cell | \n",
" CD79A | \n",
" 0.019809 | \n",
" 1.260133 | \n",
"
\n",
" \n",
" 6 | \n",
" Platelet | \n",
" CD79A | \n",
" 0.011135 | \n",
" 2.423751 | \n",
"
\n",
" \n",
" 7 | \n",
" Progenitor cell | \n",
" CD79A | \n",
" 0.150721 | \n",
" 0.677979 | \n",
"
\n",
" \n",
" 8 | \n",
" T cell | \n",
" CD79A | \n",
" 0.034470 | \n",
" 1.093967 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" obs_group gene pct_exp median_expression\n",
"0 B cell CD79A 0.988365 2.726919\n",
"1 DC CD79A 0.079959 0.725910\n",
"2 Erythrocyte CD79A 0.013926 1.230366\n",
"3 ILC CD79A 0.120853 1.526564\n",
"4 Monocyte CD79A 0.017343 0.818575\n",
"5 NK cell CD79A 0.019809 1.260133\n",
"6 Platelet CD79A 0.011135 2.423751\n",
"7 Progenitor cell CD79A 0.150721 0.677979\n",
"8 T cell CD79A 0.034470 1.093967"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gene_results['AIFI_L1']['CD79A']"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "90062555-5530-415a-9027-13a7b251246f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" obs_group | \n",
" gene | \n",
" pct_exp | \n",
" median_expression | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ASDC | \n",
" FCN1 | \n",
" 0.055556 | \n",
" 0.595097 | \n",
"
\n",
" \n",
" 1 | \n",
" CD8aa | \n",
" FCN1 | \n",
" 0.035210 | \n",
" 1.298121 | \n",
"
\n",
" \n",
" 2 | \n",
" CD14 monocyte | \n",
" FCN1 | \n",
" 0.996254 | \n",
" 3.177890 | \n",
"
\n",
" \n",
" 3 | \n",
" CD16 monocyte | \n",
" FCN1 | \n",
" 0.933907 | \n",
" 2.601019 | \n",
"
\n",
" \n",
" 4 | \n",
" CD56bright NK cell | \n",
" FCN1 | \n",
" 0.048847 | \n",
" 1.143188 | \n",
"
\n",
" \n",
" 5 | \n",
" CD56dim NK cell | \n",
" FCN1 | \n",
" 0.049148 | \n",
" 1.298217 | \n",
"
\n",
" \n",
" 6 | \n",
" DN T cell | \n",
" FCN1 | \n",
" 0.054066 | \n",
" 1.043778 | \n",
"
\n",
" \n",
" 7 | \n",
" Effector B cell | \n",
" FCN1 | \n",
" 0.052255 | \n",
" 0.973340 | \n",
"
\n",
" \n",
" 8 | \n",
" Erythrocyte | \n",
" FCN1 | \n",
" 0.070955 | \n",
" 1.265223 | \n",
"
\n",
" \n",
" 9 | \n",
" ILC | \n",
" FCN1 | \n",
" 0.046209 | \n",
" 1.412349 | \n",
"
\n",
" \n",
" 10 | \n",
" Intermediate monocyte | \n",
" FCN1 | \n",
" 0.997159 | \n",
" 3.025013 | \n",
"
\n",
" \n",
" 11 | \n",
" MAIT | \n",
" FCN1 | \n",
" 0.048519 | \n",
" 1.132813 | \n",
"
\n",
" \n",
" 12 | \n",
" Memory B cell | \n",
" FCN1 | \n",
" 0.048281 | \n",
" 1.045450 | \n",
"
\n",
" \n",
" 13 | \n",
" Memory CD4 T cell | \n",
" FCN1 | \n",
" 0.049716 | \n",
" 1.050017 | \n",
"
\n",
" \n",
" 14 | \n",
" Memory CD8 T cell | \n",
" FCN1 | \n",
" 0.048406 | \n",
" 1.210995 | \n",
"
\n",
" \n",
" 15 | \n",
" Naive B cell | \n",
" FCN1 | \n",
" 0.041079 | \n",
" 1.271097 | \n",
"
\n",
" \n",
" 16 | \n",
" Naive CD4 T cell | \n",
" FCN1 | \n",
" 0.047838 | \n",
" 1.111647 | \n",
"
\n",
" \n",
" 17 | \n",
" Naive CD8 T cell | \n",
" FCN1 | \n",
" 0.044030 | \n",
" 1.023475 | \n",
"
\n",
" \n",
" 18 | \n",
" Plasma cell | \n",
" FCN1 | \n",
" 0.103208 | \n",
" 0.351336 | \n",
"
\n",
" \n",
" 19 | \n",
" Platelet | \n",
" FCN1 | \n",
" 0.053651 | \n",
" 2.532171 | \n",
"
\n",
" \n",
" 20 | \n",
" Progenitor cell | \n",
" FCN1 | \n",
" 0.053735 | \n",
" 0.564411 | \n",
"
\n",
" \n",
" 21 | \n",
" Proliferating NK cell | \n",
" FCN1 | \n",
" 0.054159 | \n",
" 0.986173 | \n",
"
\n",
" \n",
" 22 | \n",
" Proliferating T cell | \n",
" FCN1 | \n",
" 0.064655 | \n",
" 0.571081 | \n",
"
\n",
" \n",
" 23 | \n",
" Transitional B cell | \n",
" FCN1 | \n",
" 0.042692 | \n",
" 1.196304 | \n",
"
\n",
" \n",
" 24 | \n",
" Treg | \n",
" FCN1 | \n",
" 0.048686 | \n",
" 1.184835 | \n",
"
\n",
" \n",
" 25 | \n",
" cDC1 | \n",
" FCN1 | \n",
" 0.083775 | \n",
" 0.453340 | \n",
"
\n",
" \n",
" 26 | \n",
" cDC2 | \n",
" FCN1 | \n",
" 0.796488 | \n",
" 2.183593 | \n",
"
\n",
" \n",
" 27 | \n",
" gdT | \n",
" FCN1 | \n",
" 0.046415 | \n",
" 1.194536 | \n",
"
\n",
" \n",
" 28 | \n",
" pDC | \n",
" FCN1 | \n",
" 0.076183 | \n",
" 0.794052 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" obs_group gene pct_exp median_expression\n",
"0 ASDC FCN1 0.055556 0.595097\n",
"1 CD8aa FCN1 0.035210 1.298121\n",
"2 CD14 monocyte FCN1 0.996254 3.177890\n",
"3 CD16 monocyte FCN1 0.933907 2.601019\n",
"4 CD56bright NK cell FCN1 0.048847 1.143188\n",
"5 CD56dim NK cell FCN1 0.049148 1.298217\n",
"6 DN T cell FCN1 0.054066 1.043778\n",
"7 Effector B cell FCN1 0.052255 0.973340\n",
"8 Erythrocyte FCN1 0.070955 1.265223\n",
"9 ILC FCN1 0.046209 1.412349\n",
"10 Intermediate monocyte FCN1 0.997159 3.025013\n",
"11 MAIT FCN1 0.048519 1.132813\n",
"12 Memory B cell FCN1 0.048281 1.045450\n",
"13 Memory CD4 T cell FCN1 0.049716 1.050017\n",
"14 Memory CD8 T cell FCN1 0.048406 1.210995\n",
"15 Naive B cell FCN1 0.041079 1.271097\n",
"16 Naive CD4 T cell FCN1 0.047838 1.111647\n",
"17 Naive CD8 T cell FCN1 0.044030 1.023475\n",
"18 Plasma cell FCN1 0.103208 0.351336\n",
"19 Platelet FCN1 0.053651 2.532171\n",
"20 Progenitor cell FCN1 0.053735 0.564411\n",
"21 Proliferating NK cell FCN1 0.054159 0.986173\n",
"22 Proliferating T cell FCN1 0.064655 0.571081\n",
"23 Transitional B cell FCN1 0.042692 1.196304\n",
"24 Treg FCN1 0.048686 1.184835\n",
"25 cDC1 FCN1 0.083775 0.453340\n",
"26 cDC2 FCN1 0.796488 2.183593\n",
"27 gdT FCN1 0.046415 1.194536\n",
"28 pDC FCN1 0.076183 0.794052"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gene_results['AIFI_L2']['FCN1']"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e7922203-d2b9-4cda-973d-19b784ec82db",
"metadata": {},
"outputs": [],
"source": [
"type_pkl = 'output/pbmc-ref_type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())\n",
"with open(type_pkl, 'wb') as out_file:\n",
" pickle.dump(gene_results, out_file)\n",
"out_files.append(type_pkl)"
]
},
{
"cell_type": "markdown",
"id": "966f7334-61fd-44f5-a4ea-0a1c8b00dda5",
"metadata": {},
"source": [
"### Summarized expression by cell type and cohort\n",
"\n",
"For our Gene Expression Summary viewer, we plot plot expression at each level of cell type resolution partitioned by each cohort in the reference."
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "d75604e4-1e6e-4cc3-8cb6-87d10ba420e7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
">>> AIFI_L1\n",
">>> AIFI_L2\n",
">>> AIFI_L3\n",
"CPU times: user 10min 21s, sys: 1min 33s, total: 11min 55s\n",
"Wall time: 11min 55s\n"
]
}
],
"source": [
"%%time\n",
"level_results = {}\n",
"\n",
"for level in levels:\n",
" print('>>> {l}'.format(l = level))\n",
" \n",
" level_obs = obs.groupby(['cohort.cohortGuid', level])\n",
" \n",
" level_dict = {}\n",
" for group_tuple, type_obs in level_obs:\n",
" cohort = group_tuple[0]\n",
" cell_type = group_tuple[1]\n",
" c_t = '{c}_{t}'.format(c = cohort, t = cell_type)\n",
" \n",
" type_bc = type_obs['barcodes']\n",
" type_adata = adata[adata.obs['barcodes'].isin(type_bc)]\n",
" \n",
" type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())\n",
" \n",
" type_res['cohort'] = [cohort] * type_res.shape[0]\n",
" level_dict[c_t] = type_res\n",
" \n",
" level_results[level] = level_dict"
]
},
{
"cell_type": "markdown",
"id": "51c7c28a-edac-4156-b9c5-91c113f7785f",
"metadata": {},
"source": [
"### Save assembled results for each level"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "95e62b72-bf5c-45d9-9050-8e6a583d775f",
"metadata": {},
"outputs": [],
"source": [
"level_combined = {}\n",
"for level in levels:\n",
" all_level = pd.concat(level_results[level])\n",
" all_level = all_level.reset_index(drop = False, names = ['group', 'row_num'])\n",
" all_level[level] = [re.sub('.+_', '', x) for x in all_level['group']]\n",
" all_level = all_level.drop(['group','row_num'], axis = 1)\n",
" all_level = all_level[['cohort', level, 'gene', 'pct_exp', 'median_expression']]\n",
" level_combined[level] = all_level"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "7a667426-fdf0-4491-b29b-30857455606b",
"metadata": {},
"outputs": [],
"source": [
"for level, df in level_combined.items():\n",
" out_csv = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}.csv'.format(l = level, d = date.today())\n",
" df.to_csv(out_csv)\n",
" out_files.append(out_csv)\n",
" out_parquet = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}.parquet'.format(l = level, d = date.today())\n",
" df.to_parquet(out_parquet)\n",
" out_files.append(out_parquet)"
]
},
{
"cell_type": "markdown",
"id": "4ffa5da8-6c12-40d9-86e8-99483e2af242",
"metadata": {},
"source": [
"### Restructure for use in visualization apps"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "63db080d-2695-454b-a814-b4c348c3e657",
"metadata": {},
"outputs": [],
"source": [
"gene_results = {}\n",
"for level in levels:\n",
" all_res = pd.concat(level_results[level])\n",
" split_res = all_res.groupby('gene')\n",
" \n",
" split_dict = {}\n",
" for gene, df in split_res:\n",
" df = df.reset_index(drop = False)\n",
" df = df.drop('level_1', axis = 1)\n",
" df['obs_group'] = [re.sub('.+_', '', x) for x in df['level_0']]\n",
" df = df.drop('level_0', axis = 1)\n",
" df = df[['obs_group', 'gene', 'pct_exp', 'median_expression', 'cohort']]\n",
" split_dict[gene] = df\n",
" gene_results[level] = split_dict"
]
},
{
"cell_type": "markdown",
"id": "5780d759-12e8-476b-8663-2ece3ef0de69",
"metadata": {},
"source": [
"Check that a gene or two look accurate"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "f5837cd7-20c3-4faa-ac69-8783c9811f80",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" obs_group | \n",
" gene | \n",
" pct_exp | \n",
" median_expression | \n",
" cohort | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" B cell | \n",
" CD79A | \n",
" 0.990822 | \n",
" 2.746202 | \n",
" BR1 | \n",
"
\n",
" \n",
" 1 | \n",
" DC | \n",
" CD79A | \n",
" 0.087709 | \n",
" 0.714458 | \n",
" BR1 | \n",
"
\n",
" \n",
" 2 | \n",
" Erythrocyte | \n",
" CD79A | \n",
" 0.020000 | \n",
" 2.958432 | \n",
" BR1 | \n",
"
\n",
" \n",
" 3 | \n",
" ILC | \n",
" CD79A | \n",
" 0.119444 | \n",
" 1.399105 | \n",
" BR1 | \n",
"
\n",
" \n",
" 4 | \n",
" Monocyte | \n",
" CD79A | \n",
" 0.018308 | \n",
" 0.798966 | \n",
" BR1 | \n",
"
\n",
" \n",
" 5 | \n",
" NK cell | \n",
" CD79A | \n",
" 0.020623 | \n",
" 1.232686 | \n",
" BR1 | \n",
"
\n",
" \n",
" 6 | \n",
" Platelet | \n",
" CD79A | \n",
" 0.011520 | \n",
" 2.419073 | \n",
" BR1 | \n",
"
\n",
" \n",
" 7 | \n",
" Progenitor cell | \n",
" CD79A | \n",
" 0.157895 | \n",
" 0.659168 | \n",
" BR1 | \n",
"
\n",
" \n",
" 8 | \n",
" T cell | \n",
" CD79A | \n",
" 0.041634 | \n",
" 1.068730 | \n",
" BR1 | \n",
"
\n",
" \n",
" 9 | \n",
" B cell | \n",
" CD79A | \n",
" 0.988087 | \n",
" 2.716038 | \n",
" BR2 | \n",
"
\n",
" \n",
" 10 | \n",
" DC | \n",
" CD79A | \n",
" 0.072828 | \n",
" 0.710909 | \n",
" BR2 | \n",
"
\n",
" \n",
" 11 | \n",
" Erythrocyte | \n",
" CD79A | \n",
" 0.007937 | \n",
" 1.125242 | \n",
" BR2 | \n",
"
\n",
" \n",
" 12 | \n",
" ILC | \n",
" CD79A | \n",
" 0.113139 | \n",
" 1.515654 | \n",
" BR2 | \n",
"
\n",
" \n",
" 13 | \n",
" Monocyte | \n",
" CD79A | \n",
" 0.016689 | \n",
" 0.844489 | \n",
" BR2 | \n",
"
\n",
" \n",
" 14 | \n",
" NK cell | \n",
" CD79A | \n",
" 0.019250 | \n",
" 1.271189 | \n",
" BR2 | \n",
"
\n",
" \n",
" 15 | \n",
" Platelet | \n",
" CD79A | \n",
" 0.009580 | \n",
" 2.481542 | \n",
" BR2 | \n",
"
\n",
" \n",
" 16 | \n",
" Progenitor cell | \n",
" CD79A | \n",
" 0.128664 | \n",
" 0.651074 | \n",
" BR2 | \n",
"
\n",
" \n",
" 17 | \n",
" T cell | \n",
" CD79A | \n",
" 0.023565 | \n",
" 1.097746 | \n",
" BR2 | \n",
"
\n",
" \n",
" 18 | \n",
" B cell | \n",
" CD79A | \n",
" 0.983492 | \n",
" 2.706248 | \n",
" UP1 | \n",
"
\n",
" \n",
" 19 | \n",
" DC | \n",
" CD79A | \n",
" 0.077181 | \n",
" 0.821561 | \n",
" UP1 | \n",
"
\n",
" \n",
" 20 | \n",
" Erythrocyte | \n",
" CD79A | \n",
" 0.015625 | \n",
" 1.007309 | \n",
" UP1 | \n",
"
\n",
" \n",
" 21 | \n",
" ILC | \n",
" CD79A | \n",
" 0.133333 | \n",
" 1.595389 | \n",
" UP1 | \n",
"
\n",
" \n",
" 22 | \n",
" Monocyte | \n",
" CD79A | \n",
" 0.015871 | \n",
" 0.794617 | \n",
" UP1 | \n",
"
\n",
" \n",
" 23 | \n",
" NK cell | \n",
" CD79A | \n",
" 0.018834 | \n",
" 1.324052 | \n",
" UP1 | \n",
"
\n",
" \n",
" 24 | \n",
" Platelet | \n",
" CD79A | \n",
" 0.015686 | \n",
" 2.211980 | \n",
" UP1 | \n",
"
\n",
" \n",
" 25 | \n",
" Progenitor cell | \n",
" CD79A | \n",
" 0.198830 | \n",
" 0.790927 | \n",
" UP1 | \n",
"
\n",
" \n",
" 26 | \n",
" T cell | \n",
" CD79A | \n",
" 0.042297 | \n",
" 1.163522 | \n",
" UP1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" obs_group gene pct_exp median_expression cohort\n",
"0 B cell CD79A 0.990822 2.746202 BR1\n",
"1 DC CD79A 0.087709 0.714458 BR1\n",
"2 Erythrocyte CD79A 0.020000 2.958432 BR1\n",
"3 ILC CD79A 0.119444 1.399105 BR1\n",
"4 Monocyte CD79A 0.018308 0.798966 BR1\n",
"5 NK cell CD79A 0.020623 1.232686 BR1\n",
"6 Platelet CD79A 0.011520 2.419073 BR1\n",
"7 Progenitor cell CD79A 0.157895 0.659168 BR1\n",
"8 T cell CD79A 0.041634 1.068730 BR1\n",
"9 B cell CD79A 0.988087 2.716038 BR2\n",
"10 DC CD79A 0.072828 0.710909 BR2\n",
"11 Erythrocyte CD79A 0.007937 1.125242 BR2\n",
"12 ILC CD79A 0.113139 1.515654 BR2\n",
"13 Monocyte CD79A 0.016689 0.844489 BR2\n",
"14 NK cell CD79A 0.019250 1.271189 BR2\n",
"15 Platelet CD79A 0.009580 2.481542 BR2\n",
"16 Progenitor cell CD79A 0.128664 0.651074 BR2\n",
"17 T cell CD79A 0.023565 1.097746 BR2\n",
"18 B cell CD79A 0.983492 2.706248 UP1\n",
"19 DC CD79A 0.077181 0.821561 UP1\n",
"20 Erythrocyte CD79A 0.015625 1.007309 UP1\n",
"21 ILC CD79A 0.133333 1.595389 UP1\n",
"22 Monocyte CD79A 0.015871 0.794617 UP1\n",
"23 NK cell CD79A 0.018834 1.324052 UP1\n",
"24 Platelet CD79A 0.015686 2.211980 UP1\n",
"25 Progenitor cell CD79A 0.198830 0.790927 UP1\n",
"26 T cell CD79A 0.042297 1.163522 UP1"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gene_results['AIFI_L1']['CD79A']"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "3940006b-a4f2-4649-8473-796df0869020",
"metadata": {},
"outputs": [],
"source": [
"cohort_type_pkl = 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())\n",
"with open(cohort_type_pkl, 'wb') as out_file:\n",
" pickle.dump(gene_results, out_file)\n",
"out_files.append(cohort_type_pkl)"
]
},
{
"cell_type": "markdown",
"id": "b8f709bf-9afb-41e2-8b92-a329673b17ea",
"metadata": {},
"source": [
"## Upload Results to HISE\n",
"\n",
"Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps."
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "061fa34e-ac82-4057-bf67-9e0e88e73b34",
"metadata": {},
"outputs": [],
"source": [
"study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'\n",
"title = 'PBMC Reference Pseudobulk Frac Medians {d}'.format(d = date.today())"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "3dc6573c-64e4-4db4-82f1-465e540c35ed",
"metadata": {},
"outputs": [],
"source": [
"in_files = [h5ad_uuid]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "42e769f2-9eec-44e4-8f14-9cd46f10d6a8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['5b3a0cc8-1811-4126-90c7-e9cdd41459fd']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"in_files"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "b57ec44f-0376-49d4-80ae-be37f9900195",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl',\n",
" 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out_files"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "a0034b02-9445-424e-9d68-997d1405aeae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv\n",
"output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
"output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv\n",
"output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
"output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv\n",
"output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
"output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl\n",
"output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv\n",
"output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
"output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv\n",
"output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
"output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv\n",
"output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
"output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl\n",
"you are trying to upload file_ids... ['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']. Do you truly want to proceed?\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"(y/n) y\n"
]
},
{
"data": {
"text/plain": [
"{'trace_id': '78f8b150-b1a4-48af-a87c-648dcd2d7ad5',\n",
" 'files': ['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl',\n",
" 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
" 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
" 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']}"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hisepy.upload.upload_files(\n",
" files = out_files,\n",
" study_space_id = study_space_uuid,\n",
" title = title,\n",
" input_file_ids = in_files\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "ed42395a-e9b8-4987-b0fa-b953afbb3b5e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"Click to view session information
\n",
"\n",
"-----\n",
"anndata 0.10.3\n",
"hisepy 0.3.0\n",
"numpy 1.24.0\n",
"pandas 2.1.4\n",
"scanpy 1.9.6\n",
"scipy 1.11.4\n",
"session_info 1.0.0\n",
"-----\n",
"
\n",
"\n",
"Click to view modules imported as dependencies
\n",
"\n",
"PIL 10.0.1\n",
"anyio NA\n",
"arrow 1.3.0\n",
"asttokens NA\n",
"attr 23.2.0\n",
"attrs 23.2.0\n",
"babel 2.14.0\n",
"beatrix_jupyterlab NA\n",
"brotli NA\n",
"cachetools 5.3.1\n",
"certifi 2024.02.02\n",
"cffi 1.16.0\n",
"charset_normalizer 3.3.2\n",
"cloudpickle 2.2.1\n",
"colorama 0.4.6\n",
"comm 0.1.4\n",
"cryptography 41.0.7\n",
"cycler 0.10.0\n",
"cython_runtime NA\n",
"dateutil 2.8.2\n",
"db_dtypes 1.1.1\n",
"debugpy 1.8.0\n",
"decorator 5.1.1\n",
"defusedxml 0.7.1\n",
"deprecated 1.2.14\n",
"exceptiongroup 1.2.0\n",
"executing 2.0.1\n",
"fastjsonschema NA\n",
"fqdn NA\n",
"google NA\n",
"greenlet 2.0.2\n",
"grpc 1.58.0\n",
"grpc_status NA\n",
"h5py 3.10.0\n",
"idna 3.6\n",
"igraph 0.10.8\n",
"importlib_metadata NA\n",
"ipykernel 6.28.0\n",
"ipython_genutils 0.2.0\n",
"ipywidgets 8.1.1\n",
"isoduration NA\n",
"jedi 0.19.1\n",
"jinja2 3.1.2\n",
"joblib 1.3.2\n",
"json5 NA\n",
"jsonpointer 2.4\n",
"jsonschema 4.20.0\n",
"jsonschema_specifications NA\n",
"jupyter_events 0.9.0\n",
"jupyter_server 2.12.1\n",
"jupyterlab_server 2.25.2\n",
"jwt 2.8.0\n",
"kiwisolver 1.4.5\n",
"leidenalg 0.10.1\n",
"llvmlite 0.41.0\n",
"lz4 4.3.2\n",
"markupsafe 2.1.3\n",
"matplotlib 3.8.0\n",
"matplotlib_inline 0.1.6\n",
"mpl_toolkits NA\n",
"mpmath 1.3.0\n",
"natsort 8.4.0\n",
"nbformat 5.9.2\n",
"numba 0.58.0\n",
"opentelemetry NA\n",
"overrides NA\n",
"packaging 23.2\n",
"parso 0.8.3\n",
"pexpect 4.8.0\n",
"pickleshare 0.7.5\n",
"pkg_resources NA\n",
"platformdirs 4.1.0\n",
"plotly 5.18.0\n",
"prettytable 3.9.0\n",
"prometheus_client NA\n",
"prompt_toolkit 3.0.42\n",
"proto NA\n",
"psutil NA\n",
"ptyprocess 0.7.0\n",
"pure_eval 0.2.2\n",
"pyarrow 13.0.0\n",
"pydev_ipython NA\n",
"pydevconsole NA\n",
"pydevd 2.9.5\n",
"pydevd_file_utils NA\n",
"pydevd_plugins NA\n",
"pydevd_tracing NA\n",
"pygments 2.17.2\n",
"pynvml NA\n",
"pyparsing 3.1.1\n",
"pyreadr 0.5.0\n",
"pythonjsonlogger NA\n",
"pytz 2023.3.post1\n",
"referencing NA\n",
"requests 2.31.0\n",
"rfc3339_validator 0.1.4\n",
"rfc3986_validator 0.1.1\n",
"rpds NA\n",
"send2trash NA\n",
"shapely 1.8.5.post1\n",
"six 1.16.0\n",
"sklearn 1.3.2\n",
"sniffio 1.3.0\n",
"socks 1.7.1\n",
"sql NA\n",
"sqlalchemy 2.0.21\n",
"sqlparse 0.4.4\n",
"stack_data 0.6.2\n",
"sympy 1.12\n",
"termcolor NA\n",
"texttable 1.7.0\n",
"threadpoolctl 3.2.0\n",
"torch 2.1.2+cu121\n",
"torchgen NA\n",
"tornado 6.3.3\n",
"tqdm 4.66.1\n",
"traitlets 5.9.0\n",
"typing_extensions NA\n",
"uri_template NA\n",
"urllib3 1.26.18\n",
"wcwidth 0.2.12\n",
"webcolors 1.13\n",
"websocket 1.7.0\n",
"wrapt 1.15.0\n",
"xarray 2023.12.0\n",
"yaml 6.0.1\n",
"zipp NA\n",
"zmq 25.1.2\n",
"zoneinfo NA\n",
"zstandard 0.22.0\n",
"
\n",
" \n",
"\n",
"-----\n",
"IPython 8.19.0\n",
"jupyter_client 8.6.0\n",
"jupyter_core 5.6.1\n",
"jupyterlab 4.1.2\n",
"notebook 6.5.4\n",
"-----\n",
"Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]\n",
"Linux-5.15.0-1054-gcp-x86_64-with-glibc2.31\n",
"-----\n",
"Session information updated at 2024-03-27 23:45\n",
"
\n",
" "
],
"text/plain": [
""
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import session_info\n",
"session_info.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "738c6b8e-2739-42aa-8eed-131f090f06f0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}