{ "cells": [ { "cell_type": "raw", "id": "7f0c5dc2-90b4-4a57-bc2c-687eb90c576f", "metadata": {}, "source": [ "!pip install -q https://github.com/aifimmunology/multicelltypist/archive/main.zip" ] }, { "cell_type": "code", "execution_count": 1, "id": "660c6563-9bc8-47e0-a39a-5caea64b2b18", "metadata": {}, "outputs": [], "source": [ "import multicelltypist\n", "from datetime import date\n", "import hisepy\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "import scanpy as sc" ] }, { "cell_type": "code", "execution_count": 2, "id": "2611fcb9-1cc9-4b22-a3b2-c4a6c4d7c4ae", "metadata": {}, "outputs": [], "source": [ "def read_adata_uuid(h5ad_uuid):\n", " h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)\n", " if not os.path.isdir(h5ad_path):\n", " hise_res = hisepy.reader.cache_files([h5ad_uuid])\n", " h5ad_filename = os.listdir(h5ad_path)[0]\n", " h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)\n", " adata = sc.read_h5ad(h5ad_file)\n", " return adata" ] }, { "cell_type": "code", "execution_count": 3, "id": "b26381f4-75ce-4776-9859-9f7775c198d1", "metadata": {}, "outputs": [], "source": [ "def resample_anndata_min_max(adata, label_column, max_cells=None, min_cells=None, random_state = 3030):\n", " \"\"\"\n", " Resamples an AnnData object based on the cell labels, with the option to resample with \n", " replacement for classes below a specified threshold.\n", "\n", " Parameters:\n", " ad (AnnData): The AnnData object to be resampled.\n", " label_column (str): The column in ad.obs where the labels are stored.\n", " max_cells (int, optional): The maximum number of cells to keep per label. If None, no upper limit is applied.\n", " min_cells (int, optional): The minimum number of cells below which resampling with replacement occurs. If None, no lower limit is applied.\n", " random_state (int, default = 3030): An integer used to set the state of the numpy.random.Generator\n", " \n", " Returns:\n", " AnnData: The resampled AnnData object.\n", " \"\"\"\n", " \n", " labels = adata.obs[label_column].unique()\n", "\n", " subsets = []\n", "\n", " rng = np.random.default_rng(random_state)\n", " \n", " for label in labels:\n", " # Subset AnnData object for the current label\n", " subset = adata.obs[adata.obs[label_column] == label]\n", " \n", " # Resample with replacement if the number of cells is below min_cells and min_cells is defined\n", " if min_cells is not None and subset.shape[0] < min_cells:\n", " subset = subset.sample(min_cells, replace = True, random_state = rng)\n", " # Resample without replacement if the number of cells is greater than max_cells and max_cells is defined\n", " elif max_cells is not None and subset.shape[0] > max_cells:\n", " subset = subset.sample(max_cells, replace = False, random_state = rng)\n", " \n", " subsets.append(subset)\n", "\n", " # Concatenate all subsets\n", " resampled_obs = pd.concat(subsets)\n", " \n", " resampled_adata = adata[resampled_obs.index]\n", " resampled_adata.obs_names_make_unique()\n", "\n", " return resampled_adata" ] }, { "cell_type": "code", "execution_count": 4, "id": "025a1a8c-9a27-425d-856f-d57d06085150", "metadata": {}, "outputs": [], "source": [ "label_column = 'AIFI_L2'\n", "max_cell_number = 80000" ] }, { "cell_type": "markdown", "id": "9f5d2a81-4c8b-41a6-887a-903305be8e41", "metadata": {}, "source": [ "## Read clean, annotated dataset" ] }, { "cell_type": "code", "execution_count": 5, "id": "402ba1b7-74b9-45d3-b840-f4e5ea967fdb", "metadata": {}, "outputs": [], "source": [ "h5ad_uuid = '6e8972a5-9463-4230-84b4-a20de055b9c3'" ] }, { "cell_type": "code", "execution_count": 6, "id": "d19d07d9-c602-4c16-ada7-16091f43ef4c", "metadata": {}, "outputs": [], "source": [ "adata = read_adata_uuid(h5ad_uuid)" ] }, { "cell_type": "code", "execution_count": 7, "id": "0e7cc125-7921-4fb0-a3de-c8d5ca493d97", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1823666, 1261)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata.shape" ] }, { "cell_type": "code", "execution_count": 8, "id": "02304406-4c05-459e-aad3-aae00f909516", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AIFI_L2\n", "Naive CD4 T cell 378071\n", "Memory CD4 T cell 321788\n", "CD14 monocyte 269328\n", "Memory CD8 T cell 183096\n", "CD56dim NK cell 133881\n", "Naive CD8 T cell 121167\n", "Naive B cell 86711\n", "gdT 50587\n", "MAIT 48084\n", "Memory B cell 47886\n", "CD16 monocyte 45920\n", "Treg 39087\n", "cDC2 14235\n", "Intermediate monocyte 12671\n", "Transitional B cell 12555\n", "Effector B cell 11329\n", "CD56bright NK cell 11055\n", "Platelet 7903\n", "pDC 7587\n", "CD8aa 5737\n", "Proliferating NK cell 2825\n", "DN T cell 2349\n", "Proliferating T cell 2320\n", "Plasma cell 2151\n", "Progenitor cell 1526\n", "Erythrocyte 1508\n", "cDC1 943\n", "ILC 844\n", "ASDC 522\n", "Name: count, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata.obs[label_column].value_counts()" ] }, { "cell_type": "markdown", "id": "afd7084d-0f4e-4aa8-b538-af3b316e4992", "metadata": {}, "source": [ "## Sample and prepare reference data" ] }, { "cell_type": "code", "execution_count": 9, "id": "139d3a45-9492-4ffb-af7a-79b28722a034", "metadata": {}, "outputs": [], "source": [ "adata_subset = resample_anndata_min_max(\n", " adata, \n", " label_column, \n", " max_cells = max_cell_number,\n", " random_state = 3030\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "id": "bc027836-a9ce-44d2-95ec-50b90f4a74e3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(889624, 1261)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_subset.shape" ] }, { "cell_type": "code", "execution_count": 11, "id": "bb48435e-91ec-4e20-9ff6-1459c785d784", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AIFI_L2\n", "CD14 monocyte 80000\n", "CD56dim NK cell 80000\n", "Naive B cell 80000\n", "Memory CD8 T cell 80000\n", "Memory CD4 T cell 80000\n", "Naive CD4 T cell 80000\n", "Naive CD8 T cell 80000\n", "gdT 50587\n", "MAIT 48084\n", "Memory B cell 47886\n", "CD16 monocyte 45920\n", "Treg 39087\n", "cDC2 14235\n", "Intermediate monocyte 12671\n", "Transitional B cell 12555\n", "Effector B cell 11329\n", "CD56bright NK cell 11055\n", "Platelet 7903\n", "pDC 7587\n", "CD8aa 5737\n", "Proliferating NK cell 2825\n", "DN T cell 2349\n", "Proliferating T cell 2320\n", "Plasma cell 2151\n", "Progenitor cell 1526\n", "Erythrocyte 1508\n", "cDC1 943\n", "ILC 844\n", "ASDC 522\n", "Name: count, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_subset.obs[label_column].value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "id": "8d8c478d-2cf3-425e-9327-61af4db59d2c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(889624, 33538)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_subset = adata_subset.raw.to_adata()\n", "adata_subset.shape" ] }, { "cell_type": "code", "execution_count": 13, "id": "2596500e-62c7-4fa5-bd16-63cacc8331b0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING: adata.X seems to be already log-transformed.\n" ] } ], "source": [ "sc.pp.normalize_total(adata_subset, target_sum=1e4)\n", "sc.pp.log1p(adata_subset)" ] }, { "cell_type": "markdown", "id": "1b5fccae-79fd-4eb9-a4a8-f68536db9dda", "metadata": {}, "source": [ "## Generate Initial model" ] }, { "cell_type": "code", "execution_count": 14, "id": "7ed68f41-5230-477e-b57f-3bcf1ad6bba8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "🍳 Preparing data before training\n", "✂️ 4486 non-expressed genes are filtered out\n", "🔬 Input data has 889624 cells and 29052 genes\n", "⚖️ Scaling input data\n", "🏋️ Training data using SGD logistic regression\n", "⚠️ Warning: it may take a long time to train this dataset with 889624 cells and 29052 genes, try to downsample cells and/or restrict genes to a subset (e.g., hvgs)\n", "✅ Model training done!\n" ] } ], "source": [ "model_fs = multicelltypist.train(\n", " adata_subset, \n", " label_column, \n", " n_jobs = 60, \n", " max_iter = 10, \n", " multi_class = 'ovr', \n", " use_SGD = True\n", ")" ] }, { "cell_type": "markdown", "id": "c0b4b82a-787e-41bd-a2a9-b847b0e84770", "metadata": {}, "source": [ "## Identify top features used for the model" ] }, { "cell_type": "markdown", "id": "265dd0e4-14f4-4114-98fc-09af92018bf5", "metadata": {}, "source": [ "Detected genes:" ] }, { "cell_type": "code", "execution_count": 15, "id": "f98c3839-7e99-4643-82c8-254a2bd89ce1", "metadata": {}, "outputs": [], "source": [ "df = adata_subset.X.toarray()\n", "flag = df.sum(axis = 0) == 0\n", "gene = adata_subset.var_names[ ~flag]" ] }, { "cell_type": "markdown", "id": "ce3de9a5-dcf9-4d02-85f4-034143633098", "metadata": {}, "source": [ "Features with high absolute classifier coefficients for each cell type class\n", "\n", "`np.argpartition` will take the coefficient scores for each class, and retrieve the positions of the highest absolute coefficient scores to the end of an array of positions. We then select the `top_n` positions from the end of our array of positions, which allow us to retrieve genes with the highest absolute coefficients for each class.\n", "\n", "We can then combine these to get a unique list of genes that are important for our model." ] }, { "cell_type": "code", "execution_count": 16, "id": "bcb18ca4-360c-40de-a3c8-233deff3d7ee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of genes selected: 1931\n" ] } ], "source": [ "top_n = 200\n", "\n", "gene_index = np.argpartition(\n", " np.abs(model_fs.classifier.coef_),\n", " -top_n,\n", " axis = 1\n", ")\n", "gene_index = gene_index[:, -top_n:]\n", "gene_index = np.unique(gene_index)\n", "\n", "print('Number of genes selected: {n}'.format(n = len(gene_index)))" ] }, { "cell_type": "code", "execution_count": 17, "id": "5e6a0dc7-418e-4400-8cc2-4ca286b393fe", "metadata": {}, "outputs": [], "source": [ "selected_genes = gene[gene_index.tolist()]\n", "selected_df = pd.DataFrame({'gene': selected_genes})" ] }, { "cell_type": "code", "execution_count": 18, "id": "76116481-6d8c-4539-9cc8-095cc31d390e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | gene | \n", "
---|---|
0 | \n", "HES4 | \n", "
1 | \n", "ISG15 | \n", "
2 | \n", "TTLL10 | \n", "
3 | \n", "TNFRSF18 | \n", "
4 | \n", "TNFRSF4 | \n", "
\n", "-----\n", "anndata 0.10.3\n", "hisepy 0.3.0\n", "multicelltypist 1.6.2\n", "numpy 1.25.2\n", "pandas 2.1.4\n", "scanpy 1.9.6\n", "session_info 1.0.0\n", "-----\n", "\n", "
\n", "PIL 10.0.1\n", "anyio NA\n", "arrow 1.3.0\n", "asttokens NA\n", "attr 23.2.0\n", "attrs 23.2.0\n", "babel 2.14.0\n", "beatrix_jupyterlab NA\n", "brotli NA\n", "cachetools 5.3.1\n", "certifi 2024.02.02\n", "cffi 1.16.0\n", "charset_normalizer 3.3.2\n", "cloudpickle 2.2.1\n", "colorama 0.4.6\n", "comm 0.1.4\n", "cryptography 41.0.7\n", "cycler 0.10.0\n", "cython_runtime NA\n", "dateutil 2.8.2\n", "db_dtypes 1.1.1\n", "debugpy 1.8.0\n", "decorator 5.1.1\n", "defusedxml 0.7.1\n", "deprecated 1.2.14\n", "exceptiongroup 1.2.0\n", "executing 2.0.1\n", "fastjsonschema NA\n", "fqdn NA\n", "google NA\n", "greenlet 2.0.2\n", "grpc 1.58.0\n", "grpc_status NA\n", "h5py 3.10.0\n", "idna 3.6\n", "igraph 0.10.8\n", "importlib_metadata NA\n", "ipykernel 6.28.0\n", "ipython_genutils 0.2.0\n", "ipywidgets 8.1.1\n", "isoduration NA\n", "jedi 0.19.1\n", "jinja2 3.1.2\n", "joblib 1.3.2\n", "json5 NA\n", "jsonpointer 2.4\n", "jsonschema 4.20.0\n", "jsonschema_specifications NA\n", "jupyter_events 0.9.0\n", "jupyter_server 2.12.1\n", "jupyterlab_server 2.25.2\n", "jwt 2.8.0\n", "kiwisolver 1.4.5\n", "leidenalg 0.10.1\n", "llvmlite 0.41.0\n", "lz4 4.3.2\n", "markupsafe 2.1.3\n", "matplotlib 3.8.0\n", "matplotlib_inline 0.1.6\n", "mpl_toolkits NA\n", "mpmath 1.3.0\n", "natsort 8.4.0\n", "nbformat 5.9.2\n", "numba 0.58.0\n", "opentelemetry NA\n", "overrides NA\n", "packaging 23.2\n", "parso 0.8.3\n", "pexpect 4.8.0\n", "pickleshare 0.7.5\n", "pkg_resources NA\n", "platformdirs 4.1.0\n", "plotly 5.18.0\n", "prettytable 3.9.0\n", "prometheus_client NA\n", "prompt_toolkit 3.0.42\n", "proto NA\n", "psutil NA\n", "ptyprocess 0.7.0\n", "pure_eval 0.2.2\n", "pyarrow 13.0.0\n", "pydev_ipython NA\n", "pydevconsole NA\n", "pydevd 2.9.5\n", "pydevd_file_utils NA\n", "pydevd_plugins NA\n", "pydevd_tracing NA\n", "pygments 2.17.2\n", "pynvml NA\n", "pyparsing 3.1.1\n", "pyreadr 0.5.0\n", "pythonjsonlogger NA\n", "pytz 2023.3.post1\n", "referencing NA\n", "requests 2.31.0\n", "rfc3339_validator 0.1.4\n", "rfc3986_validator 0.1.1\n", "rpds NA\n", "scipy 1.11.4\n", "send2trash NA\n", "shapely 1.8.5.post1\n", "six 1.16.0\n", "sklearn 1.3.2\n", "sniffio 1.3.0\n", "socks 1.7.1\n", "sql NA\n", "sqlalchemy 2.0.21\n", "sqlparse 0.4.4\n", "stack_data 0.6.2\n", "sympy 1.12\n", "termcolor NA\n", "texttable 1.7.0\n", "threadpoolctl 3.2.0\n", "torch 2.1.2+cu121\n", "torchgen NA\n", "tornado 6.3.3\n", "tqdm 4.66.1\n", "traitlets 5.9.0\n", "typing_extensions NA\n", "uri_template NA\n", "urllib3 1.26.18\n", "wcwidth 0.2.12\n", "webcolors 1.13\n", "websocket 1.7.0\n", "wrapt 1.15.0\n", "xarray 2023.12.0\n", "yaml 6.0.1\n", "zipp NA\n", "zmq 25.1.2\n", "zoneinfo NA\n", "zstandard 0.22.0\n", "\n", "
\n", "-----\n", "IPython 8.19.0\n", "jupyter_client 8.6.0\n", "jupyter_core 5.6.1\n", "jupyterlab 4.1.2\n", "notebook 6.5.4\n", "-----\n", "Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]\n", "Linux-5.15.0-1042-gcp-x86_64-with-glibc2.31\n", "-----\n", "Session information updated at 2024-03-10 03:22\n", "\n", "