{ "cells": [ { "cell_type": "raw", "id": "85555ba6-7a27-4bc3-9dc4-8f75a6cc2b21", "metadata": {}, "source": [ "!pip install -q https://github.com/aifimmunology/multicelltypist/archive/main.zip" ] }, { "cell_type": "code", "execution_count": 1, "id": "660c6563-9bc8-47e0-a39a-5caea64b2b18", "metadata": {}, "outputs": [], "source": [ "import multicelltypist\n", "from datetime import date\n", "import hisepy\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "import scanpy as sc" ] }, { "cell_type": "markdown", "id": "2ceef7cd-6533-4e9e-a459-9e5f7ee53f3e", "metadata": {}, "source": [ "Retrieve 10x Genomics Flex Probe gene list" ] }, { "cell_type": "code", "execution_count": 2, "id": "ec900744-16fb-41c1-885a-775b0419135b", "metadata": {}, "outputs": [], "source": [ "cache_res = hisepy.download_from_project_store(\n", " store_name = 'rds', # Reference Data Sets Project Store\n", " file_name = 'AIFI-2024-03-11T02:09:16.856602896Z/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.probe_metadata.tsv', # File from 10x Genomics\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "id": "e77b5a30-41bd-441c-93db-ec00143822bf", "metadata": {}, "outputs": [], "source": [ "probe_file = 'rds/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.probe_metadata.tsv'\n", "flex_probes = pd.read_csv(probe_file, sep = '\\t')" ] }, { "cell_type": "code", "execution_count": 4, "id": "8c9d7492-55a9-464f-b33d-1241d8cdd3ef", "metadata": {}, "outputs": [], "source": [ "flex_genes = flex_probes['gene_name'].unique().tolist()" ] }, { "cell_type": "code", "execution_count": 5, "id": "60e60a02-8cdf-4c20-8c34-381a6c5bf264", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18529" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(flex_genes)" ] }, { "cell_type": "code", "execution_count": 6, "id": "2611fcb9-1cc9-4b22-a3b2-c4a6c4d7c4ae", "metadata": {}, "outputs": [], "source": [ "def read_adata_uuid(h5ad_uuid):\n", " h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)\n", " if not os.path.isdir(h5ad_path):\n", " hise_res = hisepy.reader.cache_files([h5ad_uuid])\n", " h5ad_filename = os.listdir(h5ad_path)[0]\n", " h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)\n", " adata = sc.read_h5ad(h5ad_file)\n", " return adata" ] }, { "cell_type": "code", "execution_count": 7, "id": "b26381f4-75ce-4776-9859-9f7775c198d1", "metadata": {}, "outputs": [], "source": [ "def resample_anndata_min_max(adata, label_column, max_cells=None, min_cells=None, random_state = 3030):\n", " \"\"\"\n", " Resamples an AnnData object based on the cell labels, with the option to resample with \n", " replacement for classes below a specified threshold.\n", "\n", " Parameters:\n", " ad (AnnData): The AnnData object to be resampled.\n", " label_column (str): The column in ad.obs where the labels are stored.\n", " max_cells (int, optional): The maximum number of cells to keep per label. If None, no upper limit is applied.\n", " min_cells (int, optional): The minimum number of cells below which resampling with replacement occurs. If None, no lower limit is applied.\n", " random_state (int, default = 3030): An integer used to set the state of the numpy.random.Generator\n", " \n", " Returns:\n", " AnnData: The resampled AnnData object.\n", " \"\"\"\n", " \n", " labels = adata.obs[label_column].unique()\n", "\n", " subsets = []\n", "\n", " rng = np.random.default_rng(random_state)\n", " \n", " for label in labels:\n", " # Subset AnnData object for the current label\n", " subset = adata.obs[adata.obs[label_column] == label]\n", " \n", " # Resample with replacement if the number of cells is below min_cells and min_cells is defined\n", " if min_cells is not None and subset.shape[0] < min_cells:\n", " subset = subset.sample(min_cells, replace = True, random_state = rng)\n", " # Resample without replacement if the number of cells is greater than max_cells and max_cells is defined\n", " elif max_cells is not None and subset.shape[0] > max_cells:\n", " subset = subset.sample(max_cells, replace = False, random_state = rng)\n", " \n", " subsets.append(subset)\n", "\n", " # Concatenate all subsets\n", " resampled_obs = pd.concat(subsets)\n", " \n", " resampled_adata = adata[resampled_obs.index]\n", " resampled_adata.obs_names_make_unique()\n", "\n", " return resampled_adata" ] }, { "cell_type": "code", "execution_count": 8, "id": "025a1a8c-9a27-425d-856f-d57d06085150", "metadata": {}, "outputs": [], "source": [ "label_column = 'AIFI_L3'\n", "max_cell_number = 20000" ] }, { "cell_type": "markdown", "id": "9f5d2a81-4c8b-41a6-887a-903305be8e41", "metadata": {}, "source": [ "## Read clean, annotated dataset" ] }, { "cell_type": "code", "execution_count": 9, "id": "402ba1b7-74b9-45d3-b840-f4e5ea967fdb", "metadata": {}, "outputs": [], "source": [ "h5ad_uuid = '6e8972a5-9463-4230-84b4-a20de055b9c3'" ] }, { "cell_type": "code", "execution_count": 10, "id": "d19d07d9-c602-4c16-ada7-16091f43ef4c", "metadata": {}, "outputs": [], "source": [ "adata = read_adata_uuid(h5ad_uuid)" ] }, { "cell_type": "code", "execution_count": 11, "id": "0e7cc125-7921-4fb0-a3de-c8d5ca493d97", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1823666, 1261)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata.shape" ] }, { "cell_type": "code", "execution_count": 12, "id": "02304406-4c05-459e-aad3-aae00f909516", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AIFI_L3\n", "Core naive CD4 T cell 341521\n", "Core CD14 monocyte 217576\n", "CM CD4 T cell 161769\n", "Core naive CD8 T cell 115126\n", "GZMK- CD56dim NK cell 102908\n", " ... \n", "ASDC 522\n", "GZMK+ memory CD4 Treg 467\n", "Activated memory B cell 433\n", "CLP cell 373\n", "BaEoMaP cell 78\n", "Name: count, Length: 72, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata.obs[label_column].value_counts()" ] }, { "cell_type": "markdown", "id": "afd7084d-0f4e-4aa8-b538-af3b316e4992", "metadata": {}, "source": [ "## Sample and prepare reference data" ] }, { "cell_type": "code", "execution_count": 13, "id": "139d3a45-9492-4ffb-af7a-79b28722a034", "metadata": {}, "outputs": [], "source": [ "adata_subset = resample_anndata_min_max(\n", " adata, \n", " label_column, \n", " max_cells = max_cell_number,\n", " random_state = 3030\n", ")" ] }, { "cell_type": "code", "execution_count": 14, "id": "bc027836-a9ce-44d2-95ec-50b90f4a74e3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(636082, 1261)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_subset.shape" ] }, { "cell_type": "code", "execution_count": 15, "id": "bb48435e-91ec-4e20-9ff6-1459c785d784", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AIFI_L3\n", "CM CD4 T cell 20000\n", "CM CD8 T cell 20000\n", "KLRF1+ GZMB+ CD27- EM CD8 T cell 20000\n", "KLRF1- GZMB+ CD27- EM CD8 T cell 20000\n", "Core naive CD4 T cell 20000\n", " ... \n", "ASDC 522\n", "GZMK+ memory CD4 Treg 467\n", "Activated memory B cell 433\n", "CLP cell 373\n", "BaEoMaP cell 78\n", "Name: count, Length: 72, dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_subset.obs[label_column].value_counts()" ] }, { "cell_type": "code", "execution_count": 16, "id": "8d8c478d-2cf3-425e-9327-61af4db59d2c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(636082, 33538)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_subset = adata_subset.raw.to_adata()\n", "adata_subset.shape" ] }, { "cell_type": "markdown", "id": "97f82c68-0aa5-4cc1-a9c2-c3d26d5248fd", "metadata": {}, "source": [ "### Limit to genes available in 10x Flex Probes" ] }, { "cell_type": "code", "execution_count": 17, "id": "81b69db8-f4e2-4c54-a56c-375a10c357c2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "18329" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keep_var = adata_subset.var.index.isin(flex_genes)\n", "sum(keep_var)" ] }, { "cell_type": "code", "execution_count": 18, "id": "c33e4f12-48ed-4343-a4af-19c8a930e2d1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(636082, 18329)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_subset = adata_subset[:,keep_var]\n", "adata_subset.shape" ] }, { "cell_type": "code", "execution_count": 19, "id": "2596500e-62c7-4fa5-bd16-63cacc8331b0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING: adata.X seems to be already log-transformed.\n" ] } ], "source": [ "sc.pp.normalize_total(adata_subset, target_sum=1e4)\n", "sc.pp.log1p(adata_subset)" ] }, { "cell_type": "markdown", "id": "1b5fccae-79fd-4eb9-a4a8-f68536db9dda", "metadata": {}, "source": [ "## Generate Initial model" ] }, { "cell_type": "code", "execution_count": 20, "id": "7ed68f41-5230-477e-b57f-3bcf1ad6bba8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "🍳 Preparing data before training\n", "✂️ 1096 non-expressed genes are filtered out\n", "🔬 Input data has 636082 cells and 17233 genes\n", "⚖️ Scaling input data\n", "🏋️ Training data using SGD logistic regression\n", "⚠️ Warning: it may take a long time to train this dataset with 636082 cells and 17233 genes, try to downsample cells and/or restrict genes to a subset (e.g., hvgs)\n", "✅ Model training done!\n" ] } ], "source": [ "model_fs = multicelltypist.train(\n", " adata_subset, \n", " label_column, \n", " n_jobs = 60, \n", " max_iter = 10, \n", " multi_class = 'ovr', \n", " use_SGD = True\n", ")" ] }, { "cell_type": "markdown", "id": "c0b4b82a-787e-41bd-a2a9-b847b0e84770", "metadata": {}, "source": [ "## Identify top features used for the model" ] }, { "cell_type": "markdown", "id": "265dd0e4-14f4-4114-98fc-09af92018bf5", "metadata": {}, "source": [ "Detected genes:" ] }, { "cell_type": "code", "execution_count": 21, "id": "f98c3839-7e99-4643-82c8-254a2bd89ce1", "metadata": {}, "outputs": [], "source": [ "df = adata_subset.X.toarray()\n", "flag = df.sum(axis = 0) == 0\n", "gene = adata_subset.var_names[ ~flag]" ] }, { "cell_type": "markdown", "id": "ce3de9a5-dcf9-4d02-85f4-034143633098", "metadata": {}, "source": [ "Features with high absolute classifier coefficients for each cell type class\n", "\n", "`np.argpartition` will take the coefficient scores for each class, and retrieve the positions of the highest absolute coefficient scores to the end of an array of positions. We then select the `top_n` positions from the end of our array of positions, which allow us to retrieve genes with the highest absolute coefficients for each class.\n", "\n", "We can then combine these to get a unique list of genes that are important for our model." ] }, { "cell_type": "code", "execution_count": 22, "id": "bcb18ca4-360c-40de-a3c8-233deff3d7ee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of genes selected: 2452\n" ] } ], "source": [ "top_n = 200\n", "\n", "gene_index = np.argpartition(\n", " np.abs(model_fs.classifier.coef_),\n", " -top_n,\n", " axis = 1\n", ")\n", "gene_index = gene_index[:, -top_n:]\n", "gene_index = np.unique(gene_index)\n", "\n", "print('Number of genes selected: {n}'.format(n = len(gene_index)))" ] }, { "cell_type": "code", "execution_count": 23, "id": "5e6a0dc7-418e-4400-8cc2-4ca286b393fe", "metadata": {}, "outputs": [], "source": [ "selected_genes = gene[gene_index.tolist()]\n", "selected_df = pd.DataFrame({'gene': selected_genes})" ] }, { "cell_type": "code", "execution_count": 24, "id": "76116481-6d8c-4539-9cc8-095cc31d390e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | gene | \n", "
---|---|
0 | \n", "HES4 | \n", "
1 | \n", "ISG15 | \n", "
2 | \n", "TTLL10 | \n", "
3 | \n", "TNFRSF18 | \n", "
4 | \n", "TNFRSF4 | \n", "
\n", "-----\n", "anndata 0.10.3\n", "hisepy 0.3.0\n", "multicelltypist 1.6.2\n", "numpy 1.25.2\n", "pandas 2.1.4\n", "scanpy 1.9.6\n", "session_info 1.0.0\n", "-----\n", "\n", "
\n", "PIL 10.0.1\n", "anyio NA\n", "arrow 1.3.0\n", "asttokens NA\n", "attr 23.2.0\n", "attrs 23.2.0\n", "babel 2.14.0\n", "beatrix_jupyterlab NA\n", "brotli NA\n", "cachetools 5.3.1\n", "certifi 2024.02.02\n", "cffi 1.16.0\n", "charset_normalizer 3.3.2\n", "cloudpickle 2.2.1\n", "colorama 0.4.6\n", "comm 0.1.4\n", "cryptography 41.0.7\n", "cycler 0.10.0\n", "cython_runtime NA\n", "dateutil 2.8.2\n", "db_dtypes 1.1.1\n", "debugpy 1.8.0\n", "decorator 5.1.1\n", "defusedxml 0.7.1\n", "deprecated 1.2.14\n", "exceptiongroup 1.2.0\n", "executing 2.0.1\n", "fastjsonschema NA\n", "fqdn NA\n", "google NA\n", "greenlet 2.0.2\n", "grpc 1.58.0\n", "grpc_status NA\n", "h5py 3.10.0\n", "idna 3.6\n", "igraph 0.10.8\n", "importlib_metadata NA\n", "ipykernel 6.28.0\n", "ipython_genutils 0.2.0\n", "ipywidgets 8.1.1\n", "isoduration NA\n", "jedi 0.19.1\n", "jinja2 3.1.2\n", "joblib 1.3.2\n", "json5 NA\n", "jsonpointer 2.4\n", "jsonschema 4.20.0\n", "jsonschema_specifications NA\n", "jupyter_events 0.9.0\n", "jupyter_server 2.12.1\n", "jupyterlab_server 2.25.2\n", "jwt 2.8.0\n", "kiwisolver 1.4.5\n", "leidenalg 0.10.1\n", "llvmlite 0.41.0\n", "lz4 4.3.2\n", "markupsafe 2.1.3\n", "matplotlib 3.8.0\n", "matplotlib_inline 0.1.6\n", "mpl_toolkits NA\n", "mpmath 1.3.0\n", "natsort 8.4.0\n", "nbformat 5.9.2\n", "numba 0.58.0\n", "opentelemetry NA\n", "overrides NA\n", "packaging 23.2\n", "parso 0.8.3\n", "pexpect 4.8.0\n", "pickleshare 0.7.5\n", "pkg_resources NA\n", "platformdirs 4.1.0\n", "plotly 5.18.0\n", "prettytable 3.9.0\n", "prometheus_client NA\n", "prompt_toolkit 3.0.42\n", "proto NA\n", "psutil NA\n", "ptyprocess 0.7.0\n", "pure_eval 0.2.2\n", "pyarrow 13.0.0\n", "pydev_ipython NA\n", "pydevconsole NA\n", "pydevd 2.9.5\n", "pydevd_file_utils NA\n", "pydevd_plugins NA\n", "pydevd_tracing NA\n", "pygments 2.17.2\n", "pynvml NA\n", "pyparsing 3.1.1\n", "pyreadr 0.5.0\n", "pythonjsonlogger NA\n", "pytz 2023.3.post1\n", "referencing NA\n", "requests 2.31.0\n", "rfc3339_validator 0.1.4\n", "rfc3986_validator 0.1.1\n", "rpds NA\n", "scipy 1.11.4\n", "send2trash NA\n", "shapely 1.8.5.post1\n", "six 1.16.0\n", "sklearn 1.3.2\n", "sniffio 1.3.0\n", "socks 1.7.1\n", "sql NA\n", "sqlalchemy 2.0.21\n", "sqlparse 0.4.4\n", "stack_data 0.6.2\n", "sympy 1.12\n", "termcolor NA\n", "texttable 1.7.0\n", "threadpoolctl 3.2.0\n", "torch 2.1.2+cu121\n", "torchgen NA\n", "tornado 6.3.3\n", "tqdm 4.66.1\n", "traitlets 5.9.0\n", "typing_extensions NA\n", "uri_template NA\n", "urllib3 1.26.18\n", "wcwidth 0.2.12\n", "webcolors 1.13\n", "websocket 1.7.0\n", "wrapt 1.15.0\n", "xarray 2023.12.0\n", "yaml 6.0.1\n", "zipp NA\n", "zmq 25.1.2\n", "zoneinfo NA\n", "zstandard 0.22.0\n", "\n", "
\n", "-----\n", "IPython 8.19.0\n", "jupyter_client 8.6.0\n", "jupyter_core 5.6.1\n", "jupyterlab 4.1.2\n", "notebook 6.5.4\n", "-----\n", "Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]\n", "Linux-5.15.0-1053-gcp-x86_64-with-glibc2.31\n", "-----\n", "Session information updated at 2024-03-12 15:52\n", "\n", "