{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8c81ef75-6ba8-4ad7-b023-defcd7b22887",
   "metadata": {},
   "source": [
    "# Pseudobulk data with fraction and median non-zero\n",
    "\n",
    "For use in plotting, we'll compute \"pseudobulk\" summary statistics for gene expression based on cell metadata grouping.\n",
    "\n",
    "Note that this approach doesn't use the careful pseudobulk approaches implemented in packages like `scran` - we're instead taking the fraction of cells in which gene expression was detected (`pct_exp`) and the median of expression in those non-zero cells (`median_expression`).\n",
    "\n",
    "For use in our visualization tools, we'll group by each of our cell type labeling levels (AIFI_L1, _L2, and _L3), as well as by both cell type and the originating, age-related cohort for these cells."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "397d8491-1006-400e-b28a-f78b98f9a632",
   "metadata": {},
   "source": [
    "## Load packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2b94a539-8d6b-483b-93c1-d6e5ac4893f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
    "warnings.simplefilter(action='ignore', category=RuntimeWarning)\n",
    "\n",
    "from datetime import date\n",
    "import hisepy\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "import pickle\n",
    "import re\n",
    "import scanpy as sc\n",
    "import scipy.sparse as scs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3f700a4a-402c-483f-a13d-7376b31f29a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create the output directory; exist_ok makes this cell safe to re-run\n",
    "# and removes the gap between checking for the directory and creating it\n",
    "os.makedirs('output', exist_ok = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c5fdbbf7-7c03-468e-b443-dde31833164e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# paths of the files written by this notebook; the save cells below append to it\n",
    "out_files = []"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "61b7c6d3-fbd8-4e52-b53b-cc06ebb0f175",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b1863da8-23d2-4b2f-80aa-fa5948276a23",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_adata_uuid(h5ad_uuid):\n",
    "    \"\"\"Read the cached file for a HISE UUID with sc.read_h5ad().\n",
    "\n",
    "    If the cache directory for the UUID does not exist yet, the file is\n",
    "    first cached with hisepy.cache_files().\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    h5ad_uuid : str\n",
    "        HISE file UUID; also the name of the cache sub-directory.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The object returned by sc.read_h5ad() for the cached file.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    FileNotFoundError\n",
    "        If the cache directory is missing or contains no files.\n",
    "    \"\"\"\n",
    "    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)\n",
    "\n",
    "    # only request the file when this UUID has not been cached yet\n",
    "    if not os.path.isdir(h5ad_path):\n",
    "        hisepy.cache_files([h5ad_uuid])\n",
    "\n",
    "    # os.listdir() returns entries in arbitrary order; sorting makes the\n",
    "    # selected file stable if the directory ever holds more than one entry\n",
    "    cached_files = sorted(os.listdir(h5ad_path))\n",
    "    if len(cached_files) == 0:\n",
    "        raise FileNotFoundError(\n",
    "            'No cached file found in {p}'.format(p = h5ad_path)\n",
    "        )\n",
    "\n",
    "    h5ad_file = os.path.join(h5ad_path, cached_files[0])\n",
    "    return sc.read_h5ad(h5ad_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "90ce8067-33c3-4ce2-81a3-9bd69acf4566",
   "metadata": {},
   "outputs": [],
   "source": [
    "def sparse_nz_median(sparse_mat, feat_names):\n",
    "    \"\"\"Summarize per-gene detection fraction and non-zero median expression.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sparse_mat : scipy.sparse matrix\n",
    "        Expression values with one row per cell and one column per gene.\n",
    "        The caller's matrix is not modified.\n",
    "    feat_names : list of str\n",
    "        Gene names, one per column of sparse_mat, in column order.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per gene with columns 'gene', 'pct_exp' (fraction of cells\n",
    "        with a non-zero value) and 'median_expression' (median of the\n",
    "        non-zero values; 0 when the gene has no non-zero value).\n",
    "    \"\"\"\n",
    "    # transpose to gene-major CSR so values for each gene are contiguous in .data;\n",
    "    # copy = True guarantees we never edit arrays shared with the caller's matrix\n",
    "    gene_mat = sparse_mat.transpose().tocsr(copy = True)\n",
    "\n",
    "    # explicitly stored zeros would otherwise be counted as detected\n",
    "    # and would be included in the non-zero median\n",
    "    gene_mat.eliminate_zeros()\n",
    "\n",
    "    n_genes, n_cells = gene_mat.shape\n",
    "    gene_bounds = gene_mat.indptr\n",
    "\n",
    "    # the width of each indptr interval is the number of stored (now strictly\n",
    "    # non-zero) values for that gene\n",
    "    n_detected = np.diff(gene_bounds)\n",
    "\n",
    "    # with no cells there is nothing to detect; report 0 instead of 0 / 0\n",
    "    if n_cells > 0:\n",
    "        pct_exp = n_detected / n_cells\n",
    "    else:\n",
    "        pct_exp = np.zeros(n_genes, dtype = np.float64)\n",
    "\n",
    "    # undetected genes keep a median of 0; only visiting detected genes avoids\n",
    "    # np.median() on an empty slice, which returns NaN and emits a RuntimeWarning\n",
    "    median_expression = np.zeros(n_genes, dtype = np.float64)\n",
    "    for gene_idx in np.flatnonzero(n_detected):\n",
    "        gene_values = gene_mat.data[gene_bounds[gene_idx]:gene_bounds[gene_idx + 1]]\n",
    "        median_expression[gene_idx] = np.median(gene_values)\n",
    "\n",
    "    # assemble a DataFrame with results\n",
    "    return pd.DataFrame({\n",
    "        'gene': feat_names,\n",
    "        'pct_exp': pct_exp,\n",
    "        'median_expression': median_expression\n",
    "    })"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e82c8a8-1ea0-404d-af6b-daaadf67a5c2",
   "metadata": {},
   "source": [
    "## Retrieve atlas data from HISE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ea3fa84a-b9df-412d-895c-aad9023fb147",
   "metadata": {},
   "outputs": [],
   "source": [
    "# HISE UUID of the file to load; passed to read_adata_uuid() in the next cell\n",
    "h5ad_uuid = '5b3a0cc8-1811-4126-90c7-e9cdd41459fd'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "afab8559-2f7d-4757-a1c3-7ae0c57a5589",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the file from the local cache, caching it from HISE first if needed\n",
    "adata = read_adata_uuid(h5ad_uuid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7d596218-3a23-4075-82c7-d3426d7c7327",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1823666, 33538)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# display the dimensions of the loaded object as a quick sanity check\n",
    "adata.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2e62ac5a-0bc9-4dee-a6f3-e006418e54e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep a handle on the obs table; the cells below group it by cell type and cohort\n",
    "obs = adata.obs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd3a8bc6-2f61-4362-9ef7-7f2aee408b59",
   "metadata": {},
   "source": [
    "### Summarized expression by cell type\n",
    "\n",
    "For our UMAP viewer, we plot expression at each level of cell type resolution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3e7d51f6-a36e-49e4-85bf-f49ae89b5f29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# obs columns holding the cell type labels; each one is summarized separately below\n",
    "levels = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b8948d6b-c3fb-406c-b492-d34698d9a6c9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">>> AIFI_L1\n",
      ">>> AIFI_L2\n",
      ">>> AIFI_L3\n",
      "CPU times: user 7min 2s, sys: 1min 29s, total: 8min 31s\n",
      "Wall time: 8min 31s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "level_results = {}\n",
    "\n",
    "for level in levels:\n",
    "    print('>>> {l}'.format(l = level))\n",
    "\n",
    "    # one summary table per cell type label found at this level\n",
    "    per_type = {}\n",
    "    for cell_type, type_obs in obs.groupby(level):\n",
    "        # subset the data to the cells whose barcodes carry this label\n",
    "        in_type = adata.obs['barcodes'].isin(type_obs['barcodes'])\n",
    "        type_adata = adata[in_type]\n",
    "\n",
    "        per_type[cell_type] = sparse_nz_median(\n",
    "            type_adata.X,\n",
    "            type_adata.var_names.tolist()\n",
    "        )\n",
    "\n",
    "    level_results[level] = per_type"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc4f03d7-c118-41e3-bc33-f5dd9e3913e6",
   "metadata": {},
   "source": [
    "### Save assembled results for each level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "10e79d57-408b-4fc4-b704-bca00e342c23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# stack the per-cell-type tables of each level into one long table; the concat\n",
    "# key (cell type) becomes a column named after the level, the row counter is dropped\n",
    "level_combined = {\n",
    "    level: (\n",
    "        pd.concat(level_results[level])\n",
    "        .reset_index(drop = False, names = [level, 'row_num'])\n",
    "        .drop(columns = 'row_num')\n",
    "    )\n",
    "    for level in levels\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "719f8a2c-4a65-4aa5-a29a-8534ea025405",
   "metadata": {},
   "outputs": [],
   "source": [
    "# take the date once so the CSV and Parquet files of a level always share\n",
    "# the same name stem, even if the run crosses midnight\n",
    "run_date = date.today()\n",
    "\n",
    "for level, df in level_combined.items():\n",
    "    out_stem = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}'.format(l = level, d = run_date)\n",
    "\n",
    "    out_csv = out_stem + '.csv'\n",
    "    df.to_csv(out_csv)\n",
    "    # register each path only once so re-running this cell does not duplicate entries\n",
    "    if out_csv not in out_files:\n",
    "        out_files.append(out_csv)\n",
    "\n",
    "    out_parquet = out_stem + '.parquet'\n",
    "    df.to_parquet(out_parquet)\n",
    "    if out_parquet not in out_files:\n",
    "        out_files.append(out_parquet)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51da7b67-9f3e-45c8-b2de-eb0368452ec6",
   "metadata": {},
   "source": [
    "### Restructure for use in visualization apps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b6231894-f906-4a06-a059-e3022bdca9cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "gene_results = {}\n",
    "for level in levels:\n",
    "    # after concatenation rows are indexed by (cell type, row number)\n",
    "    stacked = pd.concat(level_results[level])\n",
    "\n",
    "    per_gene = {}\n",
    "    for gene, gene_rows in stacked.groupby('gene'):\n",
    "        # expose the cell type as 'obs_group' and discard the row number\n",
    "        per_gene[gene] = (\n",
    "            gene_rows\n",
    "            .reset_index(drop = False)\n",
    "            .drop(columns = 'level_1')\n",
    "            .rename(columns = {'level_0': 'obs_group'})\n",
    "        )\n",
    "\n",
    "    gene_results[level] = per_gene"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2b6f5a3-754b-49fc-8ce5-3764482f5cba",
   "metadata": {},
   "source": [
    "Check that a gene or two look accurate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "52457d5c-cd46-4903-b399-64dfbe73868b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>obs_group</th>\n",
       "      <th>gene</th>\n",
       "      <th>pct_exp</th>\n",
       "      <th>median_expression</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>B cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.988365</td>\n",
       "      <td>2.726919</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DC</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.079959</td>\n",
       "      <td>0.725910</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Erythrocyte</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.013926</td>\n",
       "      <td>1.230366</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ILC</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.120853</td>\n",
       "      <td>1.526564</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Monocyte</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.017343</td>\n",
       "      <td>0.818575</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>NK cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.019809</td>\n",
       "      <td>1.260133</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Platelet</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.011135</td>\n",
       "      <td>2.423751</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Progenitor cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.150721</td>\n",
       "      <td>0.677979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>T cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.034470</td>\n",
       "      <td>1.093967</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         obs_group   gene   pct_exp  median_expression\n",
       "0           B cell  CD79A  0.988365           2.726919\n",
       "1               DC  CD79A  0.079959           0.725910\n",
       "2      Erythrocyte  CD79A  0.013926           1.230366\n",
       "3              ILC  CD79A  0.120853           1.526564\n",
       "4         Monocyte  CD79A  0.017343           0.818575\n",
       "5          NK cell  CD79A  0.019809           1.260133\n",
       "6         Platelet  CD79A  0.011135           2.423751\n",
       "7  Progenitor cell  CD79A  0.150721           0.677979\n",
       "8           T cell  CD79A  0.034470           1.093967"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gene_results['AIFI_L1']['CD79A']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "90062555-5530-415a-9027-13a7b251246f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>obs_group</th>\n",
       "      <th>gene</th>\n",
       "      <th>pct_exp</th>\n",
       "      <th>median_expression</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ASDC</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.055556</td>\n",
       "      <td>0.595097</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CD8aa</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.035210</td>\n",
       "      <td>1.298121</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CD14 monocyte</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.996254</td>\n",
       "      <td>3.177890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CD16 monocyte</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.933907</td>\n",
       "      <td>2.601019</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CD56bright NK cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.048847</td>\n",
       "      <td>1.143188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>CD56dim NK cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.049148</td>\n",
       "      <td>1.298217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DN T cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.054066</td>\n",
       "      <td>1.043778</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Effector B cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.052255</td>\n",
       "      <td>0.973340</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Erythrocyte</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.070955</td>\n",
       "      <td>1.265223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>ILC</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.046209</td>\n",
       "      <td>1.412349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Intermediate monocyte</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.997159</td>\n",
       "      <td>3.025013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>MAIT</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.048519</td>\n",
       "      <td>1.132813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Memory B cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.048281</td>\n",
       "      <td>1.045450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Memory CD4 T cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.049716</td>\n",
       "      <td>1.050017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Memory CD8 T cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.048406</td>\n",
       "      <td>1.210995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Naive B cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.041079</td>\n",
       "      <td>1.271097</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Naive CD4 T cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.047838</td>\n",
       "      <td>1.111647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Naive CD8 T cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.044030</td>\n",
       "      <td>1.023475</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Plasma cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.103208</td>\n",
       "      <td>0.351336</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Platelet</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.053651</td>\n",
       "      <td>2.532171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Progenitor cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.053735</td>\n",
       "      <td>0.564411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Proliferating NK cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.054159</td>\n",
       "      <td>0.986173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Proliferating T cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.064655</td>\n",
       "      <td>0.571081</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>Transitional B cell</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.042692</td>\n",
       "      <td>1.196304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Treg</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.048686</td>\n",
       "      <td>1.184835</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>cDC1</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.083775</td>\n",
       "      <td>0.453340</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>cDC2</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.796488</td>\n",
       "      <td>2.183593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>gdT</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.046415</td>\n",
       "      <td>1.194536</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>pDC</td>\n",
       "      <td>FCN1</td>\n",
       "      <td>0.076183</td>\n",
       "      <td>0.794052</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                obs_group  gene   pct_exp  median_expression\n",
       "0                    ASDC  FCN1  0.055556           0.595097\n",
       "1                   CD8aa  FCN1  0.035210           1.298121\n",
       "2           CD14 monocyte  FCN1  0.996254           3.177890\n",
       "3           CD16 monocyte  FCN1  0.933907           2.601019\n",
       "4      CD56bright NK cell  FCN1  0.048847           1.143188\n",
       "5         CD56dim NK cell  FCN1  0.049148           1.298217\n",
       "6               DN T cell  FCN1  0.054066           1.043778\n",
       "7         Effector B cell  FCN1  0.052255           0.973340\n",
       "8             Erythrocyte  FCN1  0.070955           1.265223\n",
       "9                     ILC  FCN1  0.046209           1.412349\n",
       "10  Intermediate monocyte  FCN1  0.997159           3.025013\n",
       "11                   MAIT  FCN1  0.048519           1.132813\n",
       "12          Memory B cell  FCN1  0.048281           1.045450\n",
       "13      Memory CD4 T cell  FCN1  0.049716           1.050017\n",
       "14      Memory CD8 T cell  FCN1  0.048406           1.210995\n",
       "15           Naive B cell  FCN1  0.041079           1.271097\n",
       "16       Naive CD4 T cell  FCN1  0.047838           1.111647\n",
       "17       Naive CD8 T cell  FCN1  0.044030           1.023475\n",
       "18            Plasma cell  FCN1  0.103208           0.351336\n",
       "19               Platelet  FCN1  0.053651           2.532171\n",
       "20        Progenitor cell  FCN1  0.053735           0.564411\n",
       "21  Proliferating NK cell  FCN1  0.054159           0.986173\n",
       "22   Proliferating T cell  FCN1  0.064655           0.571081\n",
       "23    Transitional B cell  FCN1  0.042692           1.196304\n",
       "24                   Treg  FCN1  0.048686           1.184835\n",
       "25                   cDC1  FCN1  0.083775           0.453340\n",
       "26                   cDC2  FCN1  0.796488           2.183593\n",
       "27                    gdT  FCN1  0.046415           1.194536\n",
       "28                    pDC  FCN1  0.076183           0.794052"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gene_results['AIFI_L2']['FCN1']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e7922203-d2b9-4cda-973d-19b784ec82db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pickle the nested {level: {gene: DataFrame}} dictionary and record the file\n",
    "type_pkl = f'output/pbmc-ref_type_nz-pct-median_pseudobulk_{date.today()}.pkl'\n",
    "with open(type_pkl, mode = 'wb') as pkl_handle:\n",
    "    pickle.dump(gene_results, pkl_handle)\n",
    "out_files.append(type_pkl)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "966f7334-61fd-44f5-a4ea-0a1c8b00dda5",
   "metadata": {},
   "source": [
    "### Summarized expression by cell type and cohort\n",
    "\n",
    "For our Gene Expression Summary viewer, we plot expression at each level of cell type resolution partitioned by each cohort in the reference."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d75604e4-1e6e-4cc3-8cb6-87d10ba420e7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">>> AIFI_L1\n",
      ">>> AIFI_L2\n",
      ">>> AIFI_L3\n",
      "CPU times: user 10min 21s, sys: 1min 33s, total: 11min 55s\n",
      "Wall time: 11min 55s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "level_results = {}\n",
    "\n",
    "for level in levels:\n",
    "    print('>>> {l}'.format(l = level))\n",
    "\n",
    "    # group cells by cohort first, then by the cell type label at this level\n",
    "    level_obs = obs.groupby(['cohort.cohortGuid', level])\n",
    "\n",
    "    level_dict = {}\n",
    "    for group_tuple, type_obs in level_obs:\n",
    "        cohort, cell_type = group_tuple\n",
    "        # results are keyed as '<cohort>_<cell type>'\n",
    "        c_t = f'{cohort}_{cell_type}'\n",
    "\n",
    "        # subset the data to the barcodes in this cohort / cell type group\n",
    "        type_bc = type_obs['barcodes']\n",
    "        type_adata = adata[adata.obs['barcodes'].isin(type_bc)]\n",
    "\n",
    "        type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())\n",
    "\n",
    "        # tag every gene row with the cohort it was computed for\n",
    "        type_res['cohort'] = cohort\n",
    "        level_dict[c_t] = type_res\n",
    "\n",
    "    level_results[level] = level_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51c7c28a-edac-4156-b9c5-91c113f7785f",
   "metadata": {},
   "source": [
    "### Save assembled results for each level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "95e62b72-bf5c-45d9-9050-8e6a583d775f",
   "metadata": {},
   "outputs": [],
   "source": [
    "level_combined = {}\n",
    "for level in levels:\n",
    "    all_level = pd.concat(level_results[level])\n",
    "    all_level = all_level.reset_index(drop = False, names = ['group', 'row_num'])\n",
    "\n",
    "    # group keys are '<cohort>_<cell type>'. Strip exactly the known cohort prefix:\n",
    "    # a greedy regex such as '.+_' would also cut into cell type labels containing '_'\n",
    "    all_level[level] = [\n",
    "        group_key[len(str(cohort_id)) + 1:]\n",
    "        for group_key, cohort_id in zip(all_level['group'], all_level['cohort'])\n",
    "    ]\n",
    "\n",
    "    all_level = all_level.drop(['group','row_num'], axis = 1)\n",
    "    all_level = all_level[['cohort', level, 'gene', 'pct_exp', 'median_expression']]\n",
    "    level_combined[level] = all_level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "7a667426-fdf0-4491-b29b-30857455606b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# take the date once so the CSV and Parquet files of a level always share\n",
    "# the same name stem, even if the run crosses midnight\n",
    "run_date = date.today()\n",
    "\n",
    "for level, df in level_combined.items():\n",
    "    out_stem = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}'.format(l = level, d = run_date)\n",
    "\n",
    "    out_csv = out_stem + '.csv'\n",
    "    df.to_csv(out_csv)\n",
    "    # register each path only once so re-running this cell does not duplicate entries\n",
    "    if out_csv not in out_files:\n",
    "        out_files.append(out_csv)\n",
    "\n",
    "    out_parquet = out_stem + '.parquet'\n",
    "    df.to_parquet(out_parquet)\n",
    "    if out_parquet not in out_files:\n",
    "        out_files.append(out_parquet)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ffa5da8-6c12-40d9-86e8-99483e2af242",
   "metadata": {},
   "source": [
    "### Restructure for use in visualization apps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "63db080d-2695-454b-a814-b4c348c3e657",
   "metadata": {},
   "outputs": [],
   "source": [
    "gene_results = {}\n",
    "for level in levels:\n",
    "    all_res = pd.concat(level_results[level])\n",
    "    split_res = all_res.groupby('gene')\n",
    "\n",
    "    split_dict = {}\n",
    "    for gene, df in split_res:\n",
    "        df = df.reset_index(drop = False)\n",
    "        df = df.drop('level_1', axis = 1)\n",
    "        # level_0 holds '<cohort>_<cell type>' keys. Strip exactly the known cohort\n",
    "        # prefix so cell type labels that contain '_' are not truncated\n",
    "        df['obs_group'] = [\n",
    "            group_key[len(str(cohort_id)) + 1:]\n",
    "            for group_key, cohort_id in zip(df['level_0'], df['cohort'])\n",
    "        ]\n",
    "        df = df.drop('level_0', axis = 1)\n",
    "        df = df[['obs_group', 'gene', 'pct_exp', 'median_expression', 'cohort']]\n",
    "        split_dict[gene] = df\n",
    "    gene_results[level] = split_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5780d759-12e8-476b-8663-2ece3ef0de69",
   "metadata": {},
   "source": [
    "Check that a gene or two look accurate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "f5837cd7-20c3-4faa-ac69-8783c9811f80",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>obs_group</th>\n",
       "      <th>gene</th>\n",
       "      <th>pct_exp</th>\n",
       "      <th>median_expression</th>\n",
       "      <th>cohort</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>B cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.990822</td>\n",
       "      <td>2.746202</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DC</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.087709</td>\n",
       "      <td>0.714458</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Erythrocyte</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.020000</td>\n",
       "      <td>2.958432</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ILC</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.119444</td>\n",
       "      <td>1.399105</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Monocyte</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.018308</td>\n",
       "      <td>0.798966</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>NK cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.020623</td>\n",
       "      <td>1.232686</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Platelet</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.011520</td>\n",
       "      <td>2.419073</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Progenitor cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.157895</td>\n",
       "      <td>0.659168</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>T cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.041634</td>\n",
       "      <td>1.068730</td>\n",
       "      <td>BR1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>B cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.988087</td>\n",
       "      <td>2.716038</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>DC</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.072828</td>\n",
       "      <td>0.710909</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Erythrocyte</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.007937</td>\n",
       "      <td>1.125242</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>ILC</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.113139</td>\n",
       "      <td>1.515654</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Monocyte</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.016689</td>\n",
       "      <td>0.844489</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>NK cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.019250</td>\n",
       "      <td>1.271189</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Platelet</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.009580</td>\n",
       "      <td>2.481542</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Progenitor cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.128664</td>\n",
       "      <td>0.651074</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>T cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.023565</td>\n",
       "      <td>1.097746</td>\n",
       "      <td>BR2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>B cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.983492</td>\n",
       "      <td>2.706248</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>DC</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.077181</td>\n",
       "      <td>0.821561</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Erythrocyte</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.015625</td>\n",
       "      <td>1.007309</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>ILC</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>1.595389</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Monocyte</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.015871</td>\n",
       "      <td>0.794617</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>NK cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.018834</td>\n",
       "      <td>1.324052</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Platelet</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.015686</td>\n",
       "      <td>2.211980</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>Progenitor cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.198830</td>\n",
       "      <td>0.790927</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>T cell</td>\n",
       "      <td>CD79A</td>\n",
       "      <td>0.042297</td>\n",
       "      <td>1.163522</td>\n",
       "      <td>UP1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          obs_group   gene   pct_exp  median_expression cohort\n",
       "0            B cell  CD79A  0.990822           2.746202    BR1\n",
       "1                DC  CD79A  0.087709           0.714458    BR1\n",
       "2       Erythrocyte  CD79A  0.020000           2.958432    BR1\n",
       "3               ILC  CD79A  0.119444           1.399105    BR1\n",
       "4          Monocyte  CD79A  0.018308           0.798966    BR1\n",
       "5           NK cell  CD79A  0.020623           1.232686    BR1\n",
       "6          Platelet  CD79A  0.011520           2.419073    BR1\n",
       "7   Progenitor cell  CD79A  0.157895           0.659168    BR1\n",
       "8            T cell  CD79A  0.041634           1.068730    BR1\n",
       "9            B cell  CD79A  0.988087           2.716038    BR2\n",
       "10               DC  CD79A  0.072828           0.710909    BR2\n",
       "11      Erythrocyte  CD79A  0.007937           1.125242    BR2\n",
       "12              ILC  CD79A  0.113139           1.515654    BR2\n",
       "13         Monocyte  CD79A  0.016689           0.844489    BR2\n",
       "14          NK cell  CD79A  0.019250           1.271189    BR2\n",
       "15         Platelet  CD79A  0.009580           2.481542    BR2\n",
       "16  Progenitor cell  CD79A  0.128664           0.651074    BR2\n",
       "17           T cell  CD79A  0.023565           1.097746    BR2\n",
       "18           B cell  CD79A  0.983492           2.706248    UP1\n",
       "19               DC  CD79A  0.077181           0.821561    UP1\n",
       "20      Erythrocyte  CD79A  0.015625           1.007309    UP1\n",
       "21              ILC  CD79A  0.133333           1.595389    UP1\n",
       "22         Monocyte  CD79A  0.015871           0.794617    UP1\n",
       "23          NK cell  CD79A  0.018834           1.324052    UP1\n",
       "24         Platelet  CD79A  0.015686           2.211980    UP1\n",
       "25  Progenitor cell  CD79A  0.198830           0.790927    UP1\n",
       "26           T cell  CD79A  0.042297           1.163522    UP1"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gene_results['AIFI_L1']['CD79A']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "3940006b-a4f2-4649-8473-796df0869020",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the nested gene_results dictionary (indexed above as gene_results[label][gene])\n",
    "# to a single pickle file stamped with today's date.\n",
    "cohort_type_pkl = 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())\n",
    "with open(cohort_type_pkl, 'wb') as out_file:\n",
    "    pickle.dump(gene_results, out_file)\n",
    "# Register the file for upload only once, so re-running this cell does not\n",
    "# add a duplicate entry to out_files.\n",
    "if cohort_type_pkl not in out_files:\n",
    "    out_files.append(cohort_type_pkl)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8f709bf-9afb-41e2-8b92-a329673b17ea",
   "metadata": {},
   "source": [
    "## Upload Results to HISE\n",
    "\n",
    "Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "061fa34e-ac82-4057-bf67-9e0e88e73b34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ID of the study space, passed as `study_space_id` to the upload call below.\n",
    "study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'\n",
    "# Title for the upload, stamped with today's date.\n",
    "title = 'PBMC Reference Pseudobulk Frac Medians {d}'.format(d = date.today())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "3dc6573c-64e4-4db4-82f1-465e540c35ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input file ID(s) passed as `input_file_ids` to the upload call below.\n",
    "# h5ad_uuid is defined earlier in the notebook.\n",
    "in_files = [h5ad_uuid]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "42e769f2-9eec-44e4-8f14-9cd46f10d6a8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['5b3a0cc8-1811-4126-90c7-e9cdd41459fd']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the input file IDs to check them before uploading.\n",
    "in_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "b57ec44f-0376-49d4-80ae-be37f9900195",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       " 'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       " 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       " 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       " 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       " 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       " 'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl',\n",
       " 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       " 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       " 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       " 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       " 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       " 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       " 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the output files queued for upload to check them before uploading.\n",
    "out_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a0034b02-9445-424e-9d68-997d1405aeae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv\n",
      "output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
      "output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv\n",
      "output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
      "output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv\n",
      "output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
      "output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl\n",
      "output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv\n",
      "output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
      "output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv\n",
      "output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
      "output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv\n",
      "output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet\n",
      "output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl\n",
      "you are trying to upload file_ids... ['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv', 'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet', 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']. Do you truly want to proceed?\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "(y/n) y\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'trace_id': '78f8b150-b1a4-48af-a87c-648dcd2d7ad5',\n",
       " 'files': ['output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       "  'output/pbmc-ref_AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       "  'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       "  'output/pbmc-ref_AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       "  'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       "  'output/pbmc-ref_AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       "  'output/pbmc-ref_type_nz-pct-median_pseudobulk_2024-03-27.pkl',\n",
       "  'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       "  'output/pbmc-ref_cohort-AIFI_L1_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       "  'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       "  'output/pbmc-ref_cohort-AIFI_L2_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       "  'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.csv',\n",
       "  'output/pbmc-ref_cohort-AIFI_L3_nz-pct-median_pseudobulk_2024-03-27.parquet',\n",
       "  'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_2024-03-27.pkl']}"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Upload every path in out_files to the study space, passing in_files as the input file IDs.\n",
    "# Note: per the saved output of this cell, the call lists the files and then waits for a\n",
    "# (y/n) confirmation on stdin, so it needs an interactive session to complete.\n",
    "hisepy.upload.upload_files(\n",
    "    files = out_files,\n",
    "    study_space_id = study_space_uuid,\n",
    "    title = title,\n",
    "    input_file_ids = in_files\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "ed42395a-e9b8-4987-b0fa-b953afbb3b5e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<details>\n",
       "<summary>Click to view session information</summary>\n",
       "<pre>\n",
       "-----\n",
       "anndata             0.10.3\n",
       "hisepy              0.3.0\n",
       "numpy               1.24.0\n",
       "pandas              2.1.4\n",
       "scanpy              1.9.6\n",
       "scipy               1.11.4\n",
       "session_info        1.0.0\n",
       "-----\n",
       "</pre>\n",
       "<details>\n",
       "<summary>Click to view modules imported as dependencies</summary>\n",
       "<pre>\n",
       "PIL                         10.0.1\n",
       "anyio                       NA\n",
       "arrow                       1.3.0\n",
       "asttokens                   NA\n",
       "attr                        23.2.0\n",
       "attrs                       23.2.0\n",
       "babel                       2.14.0\n",
       "beatrix_jupyterlab          NA\n",
       "brotli                      NA\n",
       "cachetools                  5.3.1\n",
       "certifi                     2024.02.02\n",
       "cffi                        1.16.0\n",
       "charset_normalizer          3.3.2\n",
       "cloudpickle                 2.2.1\n",
       "colorama                    0.4.6\n",
       "comm                        0.1.4\n",
       "cryptography                41.0.7\n",
       "cycler                      0.10.0\n",
       "cython_runtime              NA\n",
       "dateutil                    2.8.2\n",
       "db_dtypes                   1.1.1\n",
       "debugpy                     1.8.0\n",
       "decorator                   5.1.1\n",
       "defusedxml                  0.7.1\n",
       "deprecated                  1.2.14\n",
       "exceptiongroup              1.2.0\n",
       "executing                   2.0.1\n",
       "fastjsonschema              NA\n",
       "fqdn                        NA\n",
       "google                      NA\n",
       "greenlet                    2.0.2\n",
       "grpc                        1.58.0\n",
       "grpc_status                 NA\n",
       "h5py                        3.10.0\n",
       "idna                        3.6\n",
       "igraph                      0.10.8\n",
       "importlib_metadata          NA\n",
       "ipykernel                   6.28.0\n",
       "ipython_genutils            0.2.0\n",
       "ipywidgets                  8.1.1\n",
       "isoduration                 NA\n",
       "jedi                        0.19.1\n",
       "jinja2                      3.1.2\n",
       "joblib                      1.3.2\n",
       "json5                       NA\n",
       "jsonpointer                 2.4\n",
       "jsonschema                  4.20.0\n",
       "jsonschema_specifications   NA\n",
       "jupyter_events              0.9.0\n",
       "jupyter_server              2.12.1\n",
       "jupyterlab_server           2.25.2\n",
       "jwt                         2.8.0\n",
       "kiwisolver                  1.4.5\n",
       "leidenalg                   0.10.1\n",
       "llvmlite                    0.41.0\n",
       "lz4                         4.3.2\n",
       "markupsafe                  2.1.3\n",
       "matplotlib                  3.8.0\n",
       "matplotlib_inline           0.1.6\n",
       "mpl_toolkits                NA\n",
       "mpmath                      1.3.0\n",
       "natsort                     8.4.0\n",
       "nbformat                    5.9.2\n",
       "numba                       0.58.0\n",
       "opentelemetry               NA\n",
       "overrides                   NA\n",
       "packaging                   23.2\n",
       "parso                       0.8.3\n",
       "pexpect                     4.8.0\n",
       "pickleshare                 0.7.5\n",
       "pkg_resources               NA\n",
       "platformdirs                4.1.0\n",
       "plotly                      5.18.0\n",
       "prettytable                 3.9.0\n",
       "prometheus_client           NA\n",
       "prompt_toolkit              3.0.42\n",
       "proto                       NA\n",
       "psutil                      NA\n",
       "ptyprocess                  0.7.0\n",
       "pure_eval                   0.2.2\n",
       "pyarrow                     13.0.0\n",
       "pydev_ipython               NA\n",
       "pydevconsole                NA\n",
       "pydevd                      2.9.5\n",
       "pydevd_file_utils           NA\n",
       "pydevd_plugins              NA\n",
       "pydevd_tracing              NA\n",
       "pygments                    2.17.2\n",
       "pynvml                      NA\n",
       "pyparsing                   3.1.1\n",
       "pyreadr                     0.5.0\n",
       "pythonjsonlogger            NA\n",
       "pytz                        2023.3.post1\n",
       "referencing                 NA\n",
       "requests                    2.31.0\n",
       "rfc3339_validator           0.1.4\n",
       "rfc3986_validator           0.1.1\n",
       "rpds                        NA\n",
       "send2trash                  NA\n",
       "shapely                     1.8.5.post1\n",
       "six                         1.16.0\n",
       "sklearn                     1.3.2\n",
       "sniffio                     1.3.0\n",
       "socks                       1.7.1\n",
       "sql                         NA\n",
       "sqlalchemy                  2.0.21\n",
       "sqlparse                    0.4.4\n",
       "stack_data                  0.6.2\n",
       "sympy                       1.12\n",
       "termcolor                   NA\n",
       "texttable                   1.7.0\n",
       "threadpoolctl               3.2.0\n",
       "torch                       2.1.2+cu121\n",
       "torchgen                    NA\n",
       "tornado                     6.3.3\n",
       "tqdm                        4.66.1\n",
       "traitlets                   5.9.0\n",
       "typing_extensions           NA\n",
       "uri_template                NA\n",
       "urllib3                     1.26.18\n",
       "wcwidth                     0.2.12\n",
       "webcolors                   1.13\n",
       "websocket                   1.7.0\n",
       "wrapt                       1.15.0\n",
       "xarray                      2023.12.0\n",
       "yaml                        6.0.1\n",
       "zipp                        NA\n",
       "zmq                         25.1.2\n",
       "zoneinfo                    NA\n",
       "zstandard                   0.22.0\n",
       "</pre>\n",
       "</details> <!-- seems like this ends pre, so might as well be explicit -->\n",
       "<pre>\n",
       "-----\n",
       "IPython             8.19.0\n",
       "jupyter_client      8.6.0\n",
       "jupyter_core        5.6.1\n",
       "jupyterlab          4.1.2\n",
       "notebook            6.5.4\n",
       "-----\n",
       "Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]\n",
       "Linux-5.15.0-1054-gcp-x86_64-with-glibc2.31\n",
       "-----\n",
       "Session information updated at 2024-03-27 23:45\n",
       "</pre>\n",
       "</details>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Report the versions of the loaded packages, Python, and the OS for this session.\n",
    "import session_info\n",
    "session_info.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "738c6b8e-2739-42aa-8eed-131f090f06f0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}