{ "cells": [ { "cell_type": "code", "execution_count": 22, "id": "f1712a90-fa7c-4736-82f9-e66e1c055204", "metadata": { "tags": [] }, "outputs": [], "source": [ "import celltypist\n", "from celltypist import models\n", "import scanpy as sc\n", "import pandas as pd \n", "import numpy as np\n", "import anndata\n", "import re\n", "import h5py\n", "import scipy.sparse as scs\n", "import concurrent.futures\n", "import scanpy.external as sce\n", "import gc\n", "from concurrent.futures import ThreadPoolExecutor\n", "from concurrent.futures import ProcessPoolExecutor\n", "import copy\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "56603415-baae-45ee-ba35-0caa123b85d9", "metadata": { "tags": [] }, "outputs": [], "source": [ "def run_leiden(adata, resolution, key_added):\n", " # Make a copy of adata for thread safety\n", " adata_copy = copy.deepcopy(adata)\n", " adata_clustering = sc.tl.leiden(adata_copy, resolution=resolution, key_added=key_added, copy=True)\n", " return adata_clustering.obs\n", "\n", "def run_leiden_parallel(adata, tasks):\n", " with ProcessPoolExecutor(max_workers=5) as executor:\n", " # Make deep copies of adata for each task to ensure thread safety\n", " futures = [executor.submit(run_leiden, copy.deepcopy(adata), resolution, key_added) for resolution, key_added in tasks]\n", " \n", " results = [future.result() for future in futures]\n", "\n", " # Assign the results back to the original AnnData object\n", " for result, (_, key_added) in zip(results, tasks):\n", " adata.obs[key_added] = result[key_added]\n", "\n", " return adata" ] }, { "cell_type": "code", "execution_count": 3, "id": "bdddc48c-0234-42a5-85d6-f31bc938553a", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata=sc.read_h5ad('adata_processed.h5ad')" ] }, { "cell_type": "code", "execution_count": 4, "id": "596f66cd-11a8-4c1a-898e-758128203993", "metadata": { "tags": [] }, "outputs": [], "source": [ "Myeloid_cell_population=[\"0\",\"12\",\"14\",\"15\",\"16\",\"18\",\"21\",\"23\",\"24\"]" ] }, { "cell_type": "code", "execution_count": 5, "id": "1f90d9cf-52d6-4bd8-9705-dcf0c12c3445", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset=adata[adata.obs['leiden'].isin(Myeloid_cell_population)]" ] }, { "cell_type": "code", "execution_count": 6, "id": "33d70fd1-0558-4740-be95-832d45c8fc70", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "12" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "del adata \n", "gc.collect()" ] }, { "cell_type": "code", "execution_count": 7, "id": "92039b6a-8857-4cd0-b1bd-ab403d870c9b", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset=adata_subset.raw.to_adata()" ] }, { "cell_type": "code", "execution_count": 8, "id": "c31c38ec-fd2f-4249-937c-154f36187c1d", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset.write_h5ad('Myeloid/Myeloidcells_raw.h5ad')" ] }, { "cell_type": "code", "execution_count": 9, "id": "63625200-390e-4755-ad01-52e4d5f1510d", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset=sc.read_h5ad('Myeloid/Myeloidcells_raw.h5ad')" ] }, { "cell_type": "code", "execution_count": 10, "id": "1caf26f6-d7d1-44e9-9c29-ce6017808b49", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset.raw= adata_subset" ] }, { "cell_type": "code", "execution_count": 11, "id": "982cbf10-4642-4975-a64f-acbc45786894", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pp.normalize_total(adata_subset, target_sum=1e4)" ] }, { "cell_type": "code", "execution_count": 12, "id": "a58df468-2c79-4130-a312-a609818a0fca", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING: adata.X seems to be already log-transformed.\n" ] } ], "source": [ "sc.pp.log1p(adata_subset)\n", "sc.pp.highly_variable_genes(adata_subset)\n", "adata_subset = adata_subset[:, adata_subset.var_names[adata_subset.var['highly_variable']]]" ] }, { "cell_type": "code", "execution_count": 13, "id": "21c0b089-e1f7-455d-9a22-f83fc6012b35", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pp.scale(adata_subset)" ] }, { "cell_type": "code", "execution_count": 14, "id": "66ef0c55-4a33-41e2-bdf9-0df07e0449d9", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.tl.pca(adata_subset, svd_solver='arpack')" ] }, { "cell_type": "code", "execution_count": 15, "id": "23d33666-1351-4be1-9cc8-27bd306bd14b", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-11-27 15:32:55,968 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...\n", "Computing initial centroids with sklearn.KMeans...\n", "2023-11-27 15:35:45,952 - harmonypy - INFO - sklearn.KMeans initialization complete.\n", "sklearn.KMeans initialization complete.\n", "2023-11-27 15:35:47,803 - harmonypy - INFO - Iteration 1 of 30\n", "Iteration 1 of 30\n", "2023-11-27 15:39:17,228 - harmonypy - INFO - Iteration 2 of 30\n", "Iteration 2 of 30\n", "2023-11-27 15:42:49,699 - harmonypy - INFO - Converged after 2 iterations\n", "Converged after 2 iterations\n" ] } ], "source": [ "sce.pp.harmony_integrate(adata_subset, 'cohort.cohortGuid',max_iter_harmony = 30)" ] }, { "cell_type": "code", "execution_count": null, "id": "699af738-90a1-4226-b62d-a06db232679c", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "IOStream.flush timed out\n", "IOStream.flush timed out\n", "IOStream.flush timed out\n" ] } ], "source": [ "sc.pp.neighbors(adata_subset, n_neighbors=50,use_rep='X_pca_harmony', n_pcs=30)\n", "sc.tl.umap(adata_subset,min_dist=0.05)" ] }, { "cell_type": "code", "execution_count": 19, "id": "2c4dc81b-4340-4c56-8c95-f7aae4e942fc", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset.write_h5ad('Myeloid/Myeloidcells_processed_pre_leiden.h5ad')" ] }, { "cell_type": "code", "execution_count": null, "id": "7371a4ec-314a-46dd-8eeb-df2a2279f3fb", "metadata": { "tags": [] }, "outputs": [], "source": [ "tasks = [(1, \"leiden_resolution_1\"),(1.5, \"leiden_resolution_1.5\"),(2, \"leiden_resolution_2\"),(2.5, \"leiden_resolution_2.5\"),(3, \"leiden_resolution_3\")]\n", "adata_subset = run_leiden_parallel(adata_subset, tasks)" ] }, { "cell_type": "code", "execution_count": null, "id": "459482ec-dbc6-4617-afde-e1dd0f851615", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset.write_h5ad('Myeloid/Myeloidcells_processed.h5ad')" ] }, { "cell_type": "code", "execution_count": null, "id": "d23a67a9-fb53-467d-87f6-486d2cccd8b9", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset=adata_subset.raw.to_adata()\n", "sc.pp.normalize_total(adata_subset, target_sum=1e4)\n", "sc.pp.log1p(adata_subset)\n", "sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_1', method='wilcoxon')\n", "\n", "df_resolution_1=sc.get.rank_genes_groups_df(adata_subset,group=None)\n", "df_resolution_1.to_csv('Myeloid/Myeloid_res1.csv')\n", "\n", "sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_1.5', method='wilcoxon')\n", "df_resolution_1_5=sc.get.rank_genes_groups_df(adata_subset,group=None)\n", "df_resolution_1_5.to_csv('Myeloid/Myeloid_res1.5.csv')\n", "\n", "sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_2', method='wilcoxon')\n", "df_resolution_2=sc.get.rank_genes_groups_df(adata_subset,group=None)\n", "df_resolution_2.to_csv('Myeloid/Myeloid_res2.csv')\n", "\n", "sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_2.5', method='wilcoxon')\n", "df_resolution_2_5=sc.get.rank_genes_groups_df(adata_subset,group=None)\n", "df_resolution_2_5.to_csv('Myeloid/Myeloid_res2.5.csv')\n", "\n", "sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_3', method='wilcoxon')\n", "df_resolution_3=sc.get.rank_genes_groups_df(adata_subset,group=None)\n", "df_resolution_3.to_csv('Myeloid/Myeloid_res3.csv')\n" ] }, { "cell_type": "code", "execution_count": 18, "id": "29a743b5-773c-4164-9033-8d429f22ae7a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 397526 × 1109\n", " obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'seurat_pbmc_type', 'seurat_pbmc_type_score', 'umap_1', 'umap_2', 'well_id', 'subject.biologicalSex', 'subject.ethnicity', 'subject.partnerCode', 'subject.race', 'subject.subjectGuid', 'cohort.cohortGuid', 'sample.visitName', 'sample.visitDetails', 'subject.birthYear', 'CMV.IgG.Serology.Result.Interpretation', 'BMI', 'predicted.celltype.l1.score', 'predicted.celltype.l1', 'predicted.celltype.l2.score', 'predicted.celltype.l2', 'predicted.celltype.l3.score', 'predicted.celltype.l3', 'predicted.celltype.l2.5.score', 'predicted.celltype.l2.5', 'predicted_labels_celltypist', 'majority_voting_celltypist', 'predicted_doublet', 'doublet_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden'\n", " var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'\n", " uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'\n", " obsm: 'X_pca', 'X_pca_harmony', 'X_umap'\n", " varm: 'PCs'\n", " obsp: 'connectivities', 'distances'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata_subset" ] }, { "cell_type": "code", "execution_count": null, "id": "857f0dcc-38d7-4a1a-8b17-35fca7ee16c0", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset=sc.read_h5ad('Myeloid/Myeloidcells_processed.h5ad')" ] }, { "cell_type": "code", "execution_count": null, "id": "4d87ea08-2627-46fb-956c-35fcbb48659b", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.tl.umap(adata_subset,min_dist=0.05)" ] }, { "cell_type": "code", "execution_count": null, "id": "fb48bd80-bb8d-4b15-baf6-299d53997954", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset.write_h5ad('Myeloid/Myeloidcells_processed_20231107.h5ad')" ] }, { "cell_type": "code", "execution_count": null, "id": "860176f5-e91b-4417-a598-0271f72da830", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pl.umap(adata_subset, color=['doublet_score'], size=2,show=False,ncols=1 ,frameon=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "01502481-1936-4a05-ab3f-596974f63215", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pl.umap(adata_subset, color=['predicted.celltype.l2.5'], size=2,show=False,ncols=1 ,frameon=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "6c34a4e7-f77b-4cc6-8c45-46813a0b5ecb", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pl.umap(adata_subset, color=['predicted_labels_celltypist'], size=2,show=False,ncols=1 ,frameon=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "659d9810-6234-44e9-8507-300a3230e58a", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pl.umap(adata_subset, color=['majority_voting_celltypist'], size=2,show=False,ncols=1 ,frameon=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "583023d9-94eb-46d5-85b9-1ac35323d09a", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pl.umap(adata_subset, color=['leiden'], \n", " size=0.5,show=False,ncols=1 ,frameon=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "62bf0880-d1e6-4d29-9d7c-c61ba7a4f1be", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pl.umap(adata_subset, color=['leiden_resolution_1','leiden_resolution_1.5','leiden_resolution_2'], \n", " size=0.5,show=False,ncols=1 ,frameon=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "32e2864c-d7e7-47a9-99dc-1c04e7208352", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pl.umap(adata_subset, color=['CMV.IgG.Serology.Result.Interpretation'], size=2,show=False,ncols=1 ,frameon=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "c8e52a53-0647-4fb2-9cd8-2aeba0e866ee", "metadata": { "tags": [] }, "outputs": [], "source": [ "df=pd.read_csv('Myeloid/Myeloid_res2.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "cfb118f6-7a17-4e44-af4e-0683c33ff40f", "metadata": { "tags": [] }, "outputs": [], "source": [ "df=df.groupby('group').head(20).reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "521ad270-4b44-48d5-9216-d087780f2723", "metadata": { "tags": [] }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "groups = df.groupby('group')\n", "\n", "fig, axs = plt.subplots(9, 6, figsize=(10, 30), squeeze=False)\n", "\n", "# Loop through each group and create a scatter plot in the corresponding subplot\n", "for i, (name, group) in enumerate(groups):\n", " row, col = i // 6, i % 6\n", " axs[row, col].scatter(group['scores'], group['names'])\n", " axs[row, col].invert_yaxis()\n", " axs[row, col].set_title(str(name)+\" vs Rest\")\n", " axs[row, col].set_xlabel('U scores')\n", " axs[row, col].set_ylabel('Gene')\n", "fig.tight_layout()\n", "plt.savefig('scatter_plot.png')" ] }, { "cell_type": "code", "execution_count": null, "id": "b32fc5c5-19ab-4714-8ee3-2dd5702e7e9a", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset=sc.read_h5ad('Myeloid/Myeloidcells_processed_20231107.h5ad')" ] }, { "cell_type": "code", "execution_count": null, "id": "2e18cf93-bd40-40e5-8298-f77a2ba9c069", "metadata": { "tags": [] }, "outputs": [], "source": [ "adata_subset=adata_subset.raw.to_adata()" ] }, { "cell_type": "code", "execution_count": null, "id": "23cba44e-1f4b-404f-879c-6cbdc612784d", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pp.normalize_total(adata_subset, target_sum=1e4)" ] }, { "cell_type": "code", "execution_count": null, "id": "e104d4e6-d0b7-4aad-9080-933d6cf24506", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pp.log1p(adata_subset)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e78fa1fa-db86-4154-a0c2-2f22f4f5760d", "metadata": { "tags": [] }, "outputs": [], "source": [ "sc.pl.umap(adata_subset, color=['ITGAX','MS4A1','CD22'],use_raw=False, size=2,show=False,ncols=1 ,frameon=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "a9cff280-a2ac-4fbb-9f96-ea004450523a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:root] *", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }