{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5b824523-4e6c-45a1-8b94-4f88f9b85b2e",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting celltypist==1.6.1\n",
      "  Obtaining dependency information for celltypist==1.6.1 from https://files.pythonhosted.org/packages/8f/d8/f60e2df761a08bff1260a83dc9543a2f72364d7cda909eab174d69e216be/celltypist-1.6.1-py3-none-any.whl.metadata\n",
      "  Downloading celltypist-1.6.1-py3-none-any.whl.metadata (43 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: numpy>=1.19.0 in /opt/conda/lib/python3.10/site-packages (from celltypist==1.6.1) (1.23.5)\n",
      "Requirement already satisfied: pandas>=1.0.5 in /opt/conda/lib/python3.10/site-packages (from celltypist==1.6.1) (2.0.3)\n",
      "Requirement already satisfied: scikit-learn>=0.24.1 in /opt/conda/lib/python3.10/site-packages (from celltypist==1.6.1) (1.3.0)\n",
      "Collecting openpyxl>=3.0.4 (from celltypist==1.6.1)\n",
      "  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.0/250.0 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: click>=7.1.2 in /opt/conda/lib/python3.10/site-packages (from celltypist==1.6.1) (8.1.6)\n",
      "Requirement already satisfied: requests>=2.23.0 in /opt/conda/lib/python3.10/site-packages (from celltypist==1.6.1) (2.31.0)\n",
      "Requirement already satisfied: scanpy>=1.7.0 in /opt/conda/lib/python3.10/site-packages (from celltypist==1.6.1) (1.9.5)\n",
      "Requirement already satisfied: leidenalg>=0.9.0 in /opt/conda/lib/python3.10/site-packages (from celltypist==1.6.1) (0.10.1)\n",
      "Requirement already satisfied: igraph<0.11,>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from leidenalg>=0.9.0->celltypist==1.6.1) (0.10.8)\n",
      "Collecting et-xmlfile (from openpyxl>=3.0.4->celltypist==1.6.1)\n",
      "  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.5->celltypist==1.6.1) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.5->celltypist==1.6.1) (2023.3.post1)\n",
      "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.5->celltypist==1.6.1) (2023.3)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->celltypist==1.6.1) (3.2.0)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->celltypist==1.6.1) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->celltypist==1.6.1) (1.26.16)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.23.0->celltypist==1.6.1) (2023.7.22)\n",
      "Requirement already satisfied: anndata>=0.7.4 in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (0.9.2)\n",
      "Requirement already satisfied: matplotlib>=3.4 in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (3.7.2)\n",
      "Requirement already satisfied: scipy>=1.4 in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (1.11.1)\n",
      "Requirement already satisfied: seaborn in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (0.12.2)\n",
      "Requirement already satisfied: h5py>=3 in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (3.9.0)\n",
      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (4.66.1)\n",
      "Requirement already satisfied: statsmodels>=0.10.0rc2 in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (0.14.0)\n",
      "Requirement already satisfied: patsy in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (0.5.3)\n",
      "Requirement already satisfied: networkx>=2.3 in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (3.1)\n",
      "Requirement already satisfied: natsort in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (8.4.0)\n",
      "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (1.3.1)\n",
      "Requirement already satisfied: numba>=0.41.0 in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (0.57.1)\n",
      "Requirement already satisfied: umap-learn>=0.3.10 in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (0.5.4)\n",
      "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (23.1)\n",
      "Requirement already satisfied: session-info in /opt/conda/lib/python3.10/site-packages (from scanpy>=1.7.0->celltypist==1.6.1) (1.0.0)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.1->celltypist==1.6.1) (3.2.0)\n",
      "Requirement already satisfied: texttable>=1.6.2 in /opt/conda/lib/python3.10/site-packages (from igraph<0.11,>=0.10.0->leidenalg>=0.9.0->celltypist==1.6.1) (1.6.7)\n",
      "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy>=1.7.0->celltypist==1.6.1) (1.1.0)\n",
      "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy>=1.7.0->celltypist==1.6.1) (0.11.0)\n",
      "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy>=1.7.0->celltypist==1.6.1) (4.42.0)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy>=1.7.0->celltypist==1.6.1) (1.4.4)\n",
      "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy>=1.7.0->celltypist==1.6.1) (10.0.0)\n",
      "Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy>=1.7.0->celltypist==1.6.1) (3.0.9)\n",
      "Requirement already satisfied: llvmlite<0.41,>=0.40.0dev0 in /opt/conda/lib/python3.10/site-packages (from numba>=0.41.0->scanpy>=1.7.0->celltypist==1.6.1) (0.40.1)\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=1.0.5->celltypist==1.6.1) (1.16.0)\n",
      "Requirement already satisfied: pynndescent>=0.5 in /opt/conda/lib/python3.10/site-packages (from umap-learn>=0.3.10->scanpy>=1.7.0->celltypist==1.6.1) (0.5.10)\n",
      "Requirement already satisfied: tbb>=2019.0 in /opt/conda/lib/python3.10/site-packages (from umap-learn>=0.3.10->scanpy>=1.7.0->celltypist==1.6.1) (2021.10.0)\n",
      "Requirement already satisfied: stdlib-list in /opt/conda/lib/python3.10/site-packages (from session-info->scanpy>=1.7.0->celltypist==1.6.1) (0.9.0)\n",
      "Downloading celltypist-1.6.1-py3-none-any.whl (7.3 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.3/7.3 MB\u001b[0m \u001b[31m98.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: et-xmlfile, openpyxl, celltypist\n",
      "Successfully installed celltypist-1.6.1 et-xmlfile-1.1.0 openpyxl-3.1.2\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# Pinned CellTypist release; %pip installs into the running kernel's environment.\n",
    "%pip install celltypist==1.6.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e4312d11-9dc4-43f6-8e03-72fc9f557ab1",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: scanpy in /opt/conda/lib/python3.10/site-packages (1.9.5)\n",
      "Requirement already satisfied: anndata>=0.7.4 in /opt/conda/lib/python3.10/site-packages (from scanpy) (0.9.2)\n",
      "Requirement already satisfied: numpy>=1.17.0 in /opt/conda/lib/python3.10/site-packages (from scanpy) (1.23.5)\n",
      "Requirement already satisfied: matplotlib>=3.4 in /opt/conda/lib/python3.10/site-packages (from scanpy) (3.7.2)\n",
      "Requirement already satisfied: pandas>=1.0 in /opt/conda/lib/python3.10/site-packages (from scanpy) (2.0.3)\n",
      "Requirement already satisfied: scipy>=1.4 in /opt/conda/lib/python3.10/site-packages (from scanpy) (1.11.1)\n",
      "Requirement already satisfied: seaborn in /opt/conda/lib/python3.10/site-packages (from scanpy) (0.12.2)\n",
      "Requirement already satisfied: h5py>=3 in /opt/conda/lib/python3.10/site-packages (from scanpy) (3.9.0)\n",
      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from scanpy) (4.66.1)\n",
      "Requirement already satisfied: scikit-learn>=0.24 in /opt/conda/lib/python3.10/site-packages (from scanpy) (1.3.0)\n",
      "Requirement already satisfied: statsmodels>=0.10.0rc2 in /opt/conda/lib/python3.10/site-packages (from scanpy) (0.14.0)\n",
      "Requirement already satisfied: patsy in /opt/conda/lib/python3.10/site-packages (from scanpy) (0.5.3)\n",
      "Requirement already satisfied: networkx>=2.3 in /opt/conda/lib/python3.10/site-packages (from scanpy) (3.1)\n",
      "Requirement already satisfied: natsort in /opt/conda/lib/python3.10/site-packages (from scanpy) (8.4.0)\n",
      "Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from scanpy) (1.3.1)\n",
      "Requirement already satisfied: numba>=0.41.0 in /opt/conda/lib/python3.10/site-packages (from scanpy) (0.57.1)\n",
      "Requirement already satisfied: umap-learn>=0.3.10 in /opt/conda/lib/python3.10/site-packages (from scanpy) (0.5.4)\n",
      "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from scanpy) (23.1)\n",
      "Requirement already satisfied: session-info in /opt/conda/lib/python3.10/site-packages (from scanpy) (1.0.0)\n",
      "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (1.1.0)\n",
      "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (0.11.0)\n",
      "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (4.42.0)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (1.4.4)\n",
      "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (10.0.0)\n",
      "Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (3.0.9)\n",
      "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (2.8.2)\n",
      "Requirement already satisfied: llvmlite<0.41,>=0.40.0dev0 in /opt/conda/lib/python3.10/site-packages (from numba>=0.41.0->scanpy) (0.40.1)\n",
      "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0->scanpy) (2023.3.post1)\n",
      "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0->scanpy) (2023.3)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24->scanpy) (3.2.0)\n",
      "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from patsy->scanpy) (1.16.0)\n",
      "Requirement already satisfied: pynndescent>=0.5 in /opt/conda/lib/python3.10/site-packages (from umap-learn>=0.3.10->scanpy) (0.5.10)\n",
      "Requirement already satisfied: tbb>=2019.0 in /opt/conda/lib/python3.10/site-packages (from umap-learn>=0.3.10->scanpy) (2021.10.0)\n",
      "Requirement already satisfied: stdlib-list in /opt/conda/lib/python3.10/site-packages (from session-info->scanpy) (0.9.0)\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# Pinned to the scanpy version the recorded run reports as already installed (1.9.5).\n",
    "%pip install scanpy==1.9.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ea583980-52bd-46a0-b3f2-24cdff3f435c",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting numpy==1.24.0\n",
      "  Downloading numpy-1.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m86.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: numpy\n",
      "  Attempting uninstall: numpy\n",
      "    Found existing installation: numpy 1.23.5\n",
      "    Uninstalling numpy-1.23.5:\n",
      "      Successfully uninstalled numpy-1.23.5\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "seaborn 0.12.2 requires numpy!=1.24.0,>=1.17, but you have numpy 1.24.0 which is incompatible.\n",
      "ydata-profiling 4.4.0 requires numpy<1.24,>=1.16.0, but you have numpy 1.24.0 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0mSuccessfully installed numpy-1.24.0\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): the recorded output shows pip flagging this pin as conflicting with\n",
    "# seaborn 0.12.2 (numpy!=1.24.0) and ydata-profiling 4.4.0 (numpy<1.24) - confirm it is intended.\n",
    "# Restart the kernel after this cell so the replaced numpy is the one that gets imported.\n",
    "%pip install numpy==1.24.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5bf3c687-3777-495f-8a9a-39e1e65fd449",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import celltypist\n",
    "from celltypist import models\n",
    "import scanpy as sc\n",
    "import pandas as pd \n",
    "import numpy as np\n",
    "import anndata\n",
    "import re\n",
    "import h5py\n",
    "import scipy.sparse as scs\n",
    "import concurrent.futures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da78c078-fef1-4c52-9bb4-a7e5fcfd2bf4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0ee84e59-5bbf-454b-9413-d2697de81e3e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def read_mat(h5_con):\n",
    "    \"\"\"Assemble the sparse matrix stored under h5_con['matrix'].\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    h5_con\n",
    "        Mapping-like handle (an open h5py.File in this notebook) exposing\n",
    "        'data', 'indices', 'indptr' and 'shape' under its 'matrix' group.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    scipy.sparse.csc_matrix\n",
    "        Matrix built from the three component arrays with the stored shape.\n",
    "    \"\"\"\n",
    "    grp = h5_con['matrix']\n",
    "    counts = grp['data'][:]      # count values\n",
    "    row_idx = grp['indices'][:]  # row index of each stored value\n",
    "    col_ptr = grp['indptr'][:]   # per-column offsets into counts / row_idx\n",
    "    dims = tuple(grp['shape'][:])\n",
    "    return scs.csc_matrix((counts, row_idx, col_ptr), shape=dims)\n",
    "\n",
    "# define a function to read the observations (i.e. per-cell metadata)\n",
    "\n",
    "def read_obs(h5con):\n",
    "    \"\"\"Read the per-barcode metadata under h5con['matrix'] into a DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    h5con\n",
    "        Mapping-like handle (an open h5py.File in this notebook) exposing\n",
    "        'barcodes' and an 'observations' group under its 'matrix' group.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per barcode: a 'barcodes' column followed by one column per\n",
    "        key found under 'observations'. Byte-string values are decoded as UTF-8.\n",
    "    \"\"\"\n",
    "    matrix_grp = h5con['matrix']\n",
    "\n",
    "    # Barcodes are decoded from bytes to str before use.\n",
    "    bc = [x.decode('UTF-8') for x in matrix_grp['barcodes'][:]]\n",
    "\n",
    "    # Initialize the DataFrame with cell barcodes\n",
    "    obs_df = pd.DataFrame({ 'barcodes' : bc })\n",
    "\n",
    "    observations = matrix_grp['observations']\n",
    "    for col in observations.keys():\n",
    "        # Read the whole column into memory\n",
    "        values = observations[col][:]\n",
    "        # Decode byte-string columns. The length check keeps an empty column\n",
    "        # (a file with zero cells) from raising IndexError on values[0].\n",
    "        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):\n",
    "            values = [x.decode('UTF-8') for x in values]\n",
    "        # Add column to the DataFrame\n",
    "        obs_df[col] = values\n",
    "\n",
    "    return obs_df\n",
    "# define a function to construct anndata object from a h5 file\n",
    "def read_h5_anndata(h5_file):\n",
    "    \"\"\"Load an h5 count file into an AnnData object.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    h5_file\n",
    "        Path of the h5 file to open read-only with h5py.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    anndata.AnnData\n",
    "        The stored matrix transposed so its rows line up with the rows of\n",
    "        the observation table, with gene names as (de-duplicated) var_names.\n",
    "    \"\"\"\n",
    "    # Every dataset is copied into memory inside the with-block, so the file\n",
    "    # handle is released as soon as reading finishes (and on any read error).\n",
    "    with h5py.File(h5_file, mode = 'r') as h5_con:\n",
    "        # extract the expression matrix\n",
    "        mat = read_mat(h5_con)\n",
    "        # extract gene names\n",
    "        genes = [x.decode('UTF-8') for x in h5_con['matrix']['features']['name'][:]]\n",
    "        # extract metadata\n",
    "        obs_df = read_obs(h5_con)\n",
    "\n",
    "    # construct anndata\n",
    "    adata = anndata.AnnData(mat.T,\n",
    "                             obs = obs_df)\n",
    "    # make sure the gene names aligned\n",
    "    adata.var_names = genes\n",
    "\n",
    "    adata.var_names_make_unique()\n",
    "    return adata\n",
    "\n",
    "def get_last_pattern(inputstr):\n",
    "    \"\"\"Return the text that follows the final '/' in inputstr.\n",
    "\n",
    "    A string without any '/' is returned whole; an empty string is returned\n",
    "    when nothing follows the last '/' or when inputstr itself is empty.\n",
    "    \"\"\"\n",
    "    found = re.search(r\"[^/]+(?=$)\", inputstr)\n",
    "    return found.group(0) if found else \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "09612625-f115-46c3-9e75-04540b43967e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Metadata table, read from a CSV in the working directory.\n",
    "meta_data = pd.read_csv(\"hise_meta_data_2023-11-19.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2de69a63-45c8-40f4-aef7-738830d82e8f",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "📜 Retrieving model list from server https://celltypist.cog.sanger.ac.uk/models/models.json\n",
      "📚 Total models in list: 44\n",
      "📂 Storing models in /root/.celltypist/data/models\n",
      "💾 Downloading model [1/44]: Immune_All_Low.pkl\n",
      "💾 Downloading model [2/44]: Immune_All_High.pkl\n",
      "💾 Downloading model [3/44]: Adult_CynomolgusMacaque_Hippocampus.pkl\n",
      "💾 Downloading model [4/44]: Adult_Human_PancreaticIslet.pkl\n",
      "💾 Downloading model [5/44]: Adult_Human_Skin.pkl\n",
      "💾 Downloading model [6/44]: Adult_Mouse_Gut.pkl\n",
      "💾 Downloading model [7/44]: Adult_Mouse_OlfactoryBulb.pkl\n",
      "💾 Downloading model [8/44]: Adult_Pig_Hippocampus.pkl\n",
      "💾 Downloading model [9/44]: Adult_RhesusMacaque_Hippocampus.pkl\n",
      "💾 Downloading model [10/44]: Autopsy_COVID19_Lung.pkl\n",
      "💾 Downloading model [11/44]: COVID19_HumanChallenge_Blood.pkl\n",
      "💾 Downloading model [12/44]: COVID19_Immune_Landscape.pkl\n",
      "💾 Downloading model [13/44]: Cells_Fetal_Lung.pkl\n",
      "💾 Downloading model [14/44]: Cells_Intestinal_Tract.pkl\n",
      "💾 Downloading model [15/44]: Cells_Lung_Airway.pkl\n",
      "💾 Downloading model [16/44]: Developing_Human_Brain.pkl\n",
      "💾 Downloading model [17/44]: Developing_Human_Gonads.pkl\n",
      "💾 Downloading model [18/44]: Developing_Human_Hippocampus.pkl\n",
      "💾 Downloading model [19/44]: Developing_Human_Organs.pkl\n",
      "💾 Downloading model [20/44]: Developing_Human_Thymus.pkl\n",
      "💾 Downloading model [21/44]: Developing_Mouse_Brain.pkl\n",
      "💾 Downloading model [22/44]: Developing_Mouse_Hippocampus.pkl\n",
      "💾 Downloading model [23/44]: Fetal_Human_AdrenalGlands.pkl\n",
      "💾 Downloading model [24/44]: Fetal_Human_Pancreas.pkl\n",
      "💾 Downloading model [25/44]: Fetal_Human_Pituitary.pkl\n",
      "💾 Downloading model [26/44]: Fetal_Human_Retina.pkl\n",
      "💾 Downloading model [27/44]: Fetal_Human_Skin.pkl\n",
      "💾 Downloading model [28/44]: Healthy_Adult_Heart.pkl\n",
      "💾 Downloading model [29/44]: Healthy_COVID19_PBMC.pkl\n",
      "💾 Downloading model [30/44]: Healthy_Human_Liver.pkl\n",
      "💾 Downloading model [31/44]: Healthy_Mouse_Liver.pkl\n",
      "💾 Downloading model [32/44]: Human_AdultAged_Hippocampus.pkl\n",
      "💾 Downloading model [33/44]: Human_Developmental_Retina.pkl\n",
      "💾 Downloading model [34/44]: Human_Embryonic_YolkSac.pkl\n",
      "💾 Downloading model [35/44]: Human_IPF_Lung.pkl\n",
      "💾 Downloading model [36/44]: Human_Longitudinal_Hippocampus.pkl\n",
      "💾 Downloading model [37/44]: Human_Lung_Atlas.pkl\n",
      "💾 Downloading model [38/44]: Human_PF_Lung.pkl\n",
      "💾 Downloading model [39/44]: Lethal_COVID19_Lung.pkl\n",
      "💾 Downloading model [40/44]: Mouse_Dentate_Gyrus.pkl\n",
      "💾 Downloading model [41/44]: Mouse_Isocortex_Hippocampus.pkl\n",
      "💾 Downloading model [42/44]: Mouse_Postnatal_DentateGyrus.pkl\n",
      "💾 Downloading model [43/44]: Nuclei_Lung_Airway.pkl\n",
      "💾 Downloading model [44/44]: Pan_Fetal_Human.pkl\n"
     ]
    }
   ],
   "source": [
    "# force_update=True refreshes the CellTypist model index before downloading; with no\n",
    "# `model` argument the recorded run fetched every model on the list (44).\n",
    "models.download_models(force_update=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d349f244-27a8-453a-815f-f3422fcdc5d4",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🔬 Input data has 156449 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🔬 Input data has 193957 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🔬 Input data has 187700 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🔬 Input data has 179276 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🔬 Input data has 173666 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🔬 Input data has 207788 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🔬 Input data has 199483 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🔬 Input data has 200104 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🔬 Input data has 198214 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🔬 Input data has 187235 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "🔬 Input data has 209206 cells and 33538 genes\n",
      "⚖️ Scaling input data\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "🖋️ Predicting labels\n",
      "🖋️ Predicting labels\n",
      "🖋️ Predicting labels\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "🖋️ Predicting labels\n",
      "🖋️ Predicting labels\n",
      "🖋️ Predicting labels\n",
      "🖋️ Predicting labels\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "✅ Prediction done!\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Can not detect a neighborhood graph, will construct one before the over-clustering\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 156449 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 187235 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🖋️ Predicting labels\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 187700 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 198214 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 209206 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 200104 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 179276 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 173666 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 156449 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 187235 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 207788 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 199483 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 193957 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 5967 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 187700 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 198214 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 209206 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 200104 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🖋️ Predicting labels\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 179276 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 173666 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 207788 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 30\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 199483 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🔬 Input data has 193957 cells and 33538 genes\n",
      "🔗 Matching reference genes in the model\n",
      "🧬 3443 features used for prediction\n",
      "⚖️ Scaling input data\n",
      "🖋️ Predicting labels\n",
      "✅ Prediction done!\n",
      "👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it\n",
      "⛓️ Over-clustering input data with resolution set to 25\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n",
      "🗳️ Majority voting the predictions\n",
      "✅ Majority voting done!\n"
     ]
    }
   ],
   "source": [
    "def process_data(meta_data_sub):\n",
    "    # Process the subset of meta_data\n",
    "    results = []\n",
    "    for file_name in meta_data_sub:\n",
    "        result = read_h5_anndata(file_name)\n",
    "        results.append(result)\n",
    "    adata = anndata.concat(results)\n",
    "    del results\n",
    "    sc.pp.normalize_total(adata, target_sum=1e4)\n",
    "    sc.pp.log1p(adata)\n",
    "    adata.obs.index = adata.obs['barcodes']\n",
    "    predictions = celltypist.annotate(adata, model='Immune_All_Low.pkl', majority_voting=True)\n",
    "    for i in adata.obs['pbmc_sample_id'].unique():\n",
    "        barcodes=adata.obs[adata.obs['pbmc_sample_id'] == i].index.tolist()\n",
    "        predictions.predicted_labels.loc[barcodes,:].to_csv(\"Labels_Celltypist/\" + i + \"_Low.csv\")\n",
    "    \n",
    "    predictions = celltypist.annotate(adata, model='Immune_All_High.pkl', majority_voting=True)\n",
    "    for i in adata.obs['pbmc_sample_id'].unique():\n",
    "        barcodes=adata.obs[adata.obs['pbmc_sample_id'] == i].index.tolist()\n",
    "        predictions.predicted_labels.loc[barcodes,:].to_csv(\"Labels_Celltypist/\" + i + \"_High.csv\")\n",
    "    \n",
    "    predictions = celltypist.annotate(adata, model='Healthy_COVID19_PBMC.pkl', majority_voting=True)\n",
    "    for i in adata.obs['pbmc_sample_id'].unique():\n",
    "        barcodes=adata.obs[adata.obs['pbmc_sample_id'] == i].index.tolist()\n",
    "        predictions.predicted_labels.loc[barcodes,:].to_csv(\"Labels_Celltypist/\" + i + \"_Covid_Healthy.csv\")\n",
    "    del adata\n",
    "if __name__ == '__main__':\n",
    "    # Number of files annotated together by one worker, and number of worker processes.\n",
    "    batch_size = 10\n",
    "    max_workers = 11\n",
    "\n",
    "    # Divide the meta_data file paths into consecutive batches\n",
    "    meta_data_subsets = [meta_data['file.path'][i:i + batch_size] for i in range(0, len(meta_data), batch_size)]\n",
    "\n",
    "    # Process each batch in a separate worker process\n",
    "    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:\n",
    "        futures = [executor.submit(process_data, meta_data_sub) for meta_data_sub in meta_data_subsets]\n",
    "        # Remember which batch each future belongs to so a failure can be traced and rerun.\n",
    "        batch_of_future = {future: batch_index for batch_index, future in enumerate(futures)}\n",
    "        failed_batches = []\n",
    "        for future in concurrent.futures.as_completed(futures):\n",
    "            try:\n",
    "                future.result()\n",
    "            except Exception as e:\n",
    "                # Keep draining the remaining futures so one bad batch does not hide\n",
    "                # the outcome of the others, but say which batch failed and how.\n",
    "                batch_index = batch_of_future[future]\n",
    "                failed_batches.append(batch_index)\n",
    "                print(f'Error in batch {batch_index}: {type(e).__name__}: {e}')\n",
    "\n",
    "    # Final summary so failures are not lost in the interleaved worker output.\n",
    "    if failed_batches:\n",
    "        print(f'{len(failed_batches)} of {len(meta_data_subsets)} batches failed: {sorted(failed_batches)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1df57b01-3914-48ae-9cc2-f03faec7473c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}