{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "4ebc2cd4-cc2c-41ac-ab73-e72142c094fd", "metadata": { "tags": [] }, "outputs": [], "source": [ "import celltypist\n", "from celltypist import models\n", "import scanpy as sc\n", "import pandas as pd \n", "import numpy as np\n", "import anndata\n", "import re\n", "import h5py\n", "import scipy.sparse as scs\n", "import concurrent.futures\n", "import scanpy.external as sce" ] }, { "cell_type": "code", "execution_count": 1, "id": "2484675a-0c6f-4655-bb1a-f0379f04e143", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: scrublet in /opt/conda/lib/python3.10/site-packages (0.2.3)\n", "Requirement already satisfied: cython in /opt/conda/lib/python3.10/site-packages (from scrublet) (0.29.36)\n", "Requirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from scrublet) (1.24.4)\n", "Requirement already satisfied: scipy in /opt/conda/lib/python3.10/site-packages (from scrublet) (1.11.1)\n", "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (from scrublet) (1.3.0)\n", "Requirement already satisfied: scikit-image in /opt/conda/lib/python3.10/site-packages (from scrublet) (0.21.0)\n", "Requirement already satisfied: matplotlib in /opt/conda/lib/python3.10/site-packages (from scrublet) (3.7.2)\n", "Requirement already satisfied: annoy in /opt/conda/lib/python3.10/site-packages (from scrublet) (1.17.3)\n", "Requirement already satisfied: numba in /opt/conda/lib/python3.10/site-packages (from scrublet) (0.57.1)\n", "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from scrublet) (2.0.3)\n", "Requirement already satisfied: umap-learn in /opt/conda/lib/python3.10/site-packages (from scrublet) (0.5.4)\n", "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib->scrublet) (1.1.0)\n", "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.10/site-packages (from matplotlib->scrublet) (0.11.0)\n", "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib->scrublet) (4.42.0)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib->scrublet) (1.4.4)\n", "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib->scrublet) (23.1)\n", "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib->scrublet) (10.0.0)\n", "Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib->scrublet) (3.0.9)\n", "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->scrublet) (2.8.2)\n", "Requirement already satisfied: llvmlite<0.41,>=0.40.0dev0 in /opt/conda/lib/python3.10/site-packages (from numba->scrublet) (0.40.1)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->scrublet) (2023.3.post1)\n", "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.10/site-packages (from pandas->scrublet) (2023.3)\n", "Requirement already satisfied: networkx>=2.8 in /opt/conda/lib/python3.10/site-packages (from scikit-image->scrublet) (3.1)\n", "Requirement already satisfied: imageio>=2.27 in /opt/conda/lib/python3.10/site-packages (from scikit-image->scrublet) (2.31.1)\n", "Requirement already satisfied: tifffile>=2022.8.12 in /opt/conda/lib/python3.10/site-packages (from scikit-image->scrublet) (2023.7.18)\n", "Requirement already satisfied: PyWavelets>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-image->scrublet) (1.4.1)\n", "Requirement already satisfied: lazy_loader>=0.2 in /opt/conda/lib/python3.10/site-packages (from scikit-image->scrublet) (0.3)\n", "Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->scrublet) (1.3.1)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->scrublet) (3.2.0)\n", "Requirement already satisfied: pynndescent>=0.5 in /opt/conda/lib/python3.10/site-packages (from umap-learn->scrublet) (0.5.10)\n", "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from umap-learn->scrublet) (4.66.1)\n", "Requirement already satisfied: tbb>=2019.0 in /opt/conda/lib/python3.10/site-packages (from umap-learn->scrublet) (2021.10.0)\n", "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib->scrublet) (1.16.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install scrublet" ] }, { "cell_type": "code", "execution_count": 3, "id": "bf053f82-df10-47bb-924d-186399401450", "metadata": { "tags": [] }, "outputs": [], "source": [ "def read_mat(h5_con):\n", " mat = scs.csc_matrix(\n", " (h5_con['matrix']['data'][:], # Count values\n", " h5_con['matrix']['indices'][:], # Row indices\n", " h5_con['matrix']['indptr'][:]), # Pointers for column positions\n", " shape = tuple(h5_con['matrix']['shape'][:]) # Matrix dimensions\n", " )\n", " return mat\n", "\n", "\n", "def read_obs(h5con):\n", " bc = h5con['matrix']['barcodes'][:]\n", " bc = [x.decode('UTF-8') for x in bc]\n", "\n", " # Initialized the DataFrame with cell barcodes\n", " obs_df = pd.DataFrame({ 'barcodes' : bc })\n", "\n", " # Get the list of available metadata columns\n", " obs_columns = h5con['matrix']['observations'].keys()\n", "\n", " # For each column\n", " for col in obs_columns:\n", " # Read the values\n", " values = h5con['matrix']['observations'][col][:]\n", " # Check for byte storage\n", " if(isinstance(values[0], (bytes, bytearray))):\n", " # Decode byte strings\n", " values = [x.decode('UTF-8') for x in values]\n", " # Add column to the DataFrame\n", " obs_df[col] = values\n", " \n", " return obs_df\n", "# define a function to construct anndata object from a h5 file\n", "def read_h5_anndata(h5_file):\n", " h5_con = h5py.File(h5_file, mode = 'r')\n", " # extract the expression matrix\n", " mat = read_mat(h5_con)\n", " # extract gene names\n", " genes = h5_con['matrix']['features']['name'][:]\n", " genes = [x.decode('UTF-8') for x in genes]\n", " # extract metadata\n", " obs_df = read_obs(h5_con)\n", " # construct anndata\n", " adata = anndata.AnnData(mat.T,\n", " obs = obs_df)\n", " # make sure the gene names aligned\n", " adata.var_names = genes\n", "\n", " adata.var_names_make_unique()\n", " return adata\n", "def get_last_pattern(inputstr):\n", " pattern = r\"[^/]+(?=$)\"\n", " match = re.search(pattern, inputstr)\n", " if match:\n", " return match.group(0)\n", " else:\n", " return \"\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "eb096602-819b-4b51-83b9-11d963b21cfb", "metadata": { "tags": [] }, "outputs": [], "source": [ "meta_data=pd.read_csv(\"hise_meta_data_2023-11-19.csv\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "616e22b5-e5a0-4dd1-8417-746d35f72eab", "metadata": { "tags": [] }, "outputs": [], "source": [ "def process_file(file_name):\n", " result = read_h5_anndata(file_name)\n", " sc.external.pp.scrublet(result)\n", " return result.obs[['barcodes','predicted_doublet','doublet_score']]" ] }, { "cell_type": "code", "execution_count": 8, "id": "3d334e1a-a13c-41d2-9107-ec3e2fa4e490", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Automatically set threshold at doublet score = 0.35\n", "Detected doublet rate = 1.7%\n", "Estimated detectable doublet fraction = 29.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 6.0%\n", "Automatically set threshold at doublet score = 0.68\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.6%\n", "Warning: failed to automatically identify doublet score threshold. Run `call_doublets` with user-specified threshold.\n", "Automatically set threshold at doublet score = 0.36\n", "Detected doublet rate = 1.2%\n", "Estimated detectable doublet fraction = 36.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.3%\n", "Automatically set threshold at doublet score = 0.72\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 0.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.2%\n", "Automatically set threshold at doublet score = 0.38\n", "Detected doublet rate = 1.1%\n", "Estimated detectable doublet fraction = 27.6%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.9%\n", "Automatically set threshold at doublet score = 0.70\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 4.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.4%\n", "Automatically set threshold at doublet score = 0.70\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 2.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.4%\n", "Automatically set threshold at doublet score = 0.70\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 2.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.0%\n", "Automatically set threshold at doublet score = 0.30\n", "Detected doublet rate = 1.9%\n", "Estimated detectable doublet fraction = 40.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.7%\n", "Automatically set threshold at doublet score = 0.33\n", "Detected doublet rate = 1.7%\n", "Estimated detectable doublet fraction = 35.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.7%\n", "Automatically set threshold at doublet score = 0.70\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.0%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 2.0%\n", "Estimated detectable doublet fraction = 33.2%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.9%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 0.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.0%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.6%\n", "Estimated detectable doublet fraction = 42.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.8%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 1.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.7%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 2.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.7%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.9%\n", "Estimated detectable doublet fraction = 44.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.3%\n", "Automatically set threshold at doublet score = 0.30\n", "Detected doublet rate = 2.9%\n", "Estimated detectable doublet fraction = 44.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 6.5%\n", "Automatically set threshold at doublet score = 0.48\n", "Detected doublet rate = 0.5%\n", "Estimated detectable doublet fraction = 19.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.9%\n", "Automatically set threshold at doublet score = 0.39\n", "Detected doublet rate = 1.0%\n", "Estimated detectable doublet fraction = 33.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.9%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.8%\n", "Estimated detectable doublet fraction = 40.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.4%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 1.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.0%\n", "Automatically set threshold at doublet score = 0.16\n", "Detected doublet rate = 7.6%\n", "Estimated detectable doublet fraction = 52.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 14.6%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 0.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 10.0%\n", "Automatically set threshold at doublet score = 0.36\n", "Detected doublet rate = 1.3%\n", "Estimated detectable doublet fraction = 31.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.2%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.6%\n", "Estimated detectable doublet fraction = 44.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.7%\n", "Automatically set threshold at doublet score = 0.43\n", "Detected doublet rate = 0.9%\n", "Estimated detectable doublet fraction = 27.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.5%\n", "Automatically set threshold at doublet score = 0.33\n", "Detected doublet rate = 1.5%\n", "Estimated detectable doublet fraction = 43.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.4%\n", "Automatically set threshold at doublet score = 0.47\n", "Detected doublet rate = 0.7%\n", "Estimated detectable doublet fraction = 25.2%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.7%\n", "Automatically set threshold at doublet score = 0.44\n", "Detected doublet rate = 0.7%\n", "Estimated detectable doublet fraction = 25.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.8%\n", "Automatically set threshold at doublet score = 0.15\n", "Detected doublet rate = 8.6%\n", "Estimated detectable doublet fraction = 58.6%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 14.7%\n", "Automatically set threshold at doublet score = 0.16\n", "Detected doublet rate = 8.6%\n", "Estimated detectable doublet fraction = 55.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 15.5%\n", "Automatically set threshold at doublet score = 0.68\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.9%\n", "Automatically set threshold at doublet score = 0.25\n", "Detected doublet rate = 2.9%\n", "Estimated detectable doublet fraction = 47.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 6.0%\n", "Automatically set threshold at doublet score = 0.29\n", "Detected doublet rate = 1.8%\n", "Estimated detectable doublet fraction = 44.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.0%\n", "Automatically set threshold at doublet score = 0.15\n", "Detected doublet rate = 9.1%\n", "Estimated detectable doublet fraction = 58.6%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 15.5%\n", "Automatically set threshold at doublet score = 0.36\n", "Detected doublet rate = 1.2%\n", "Estimated detectable doublet fraction = 33.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.6%\n", "Automatically set threshold at doublet score = 0.33\n", "Detected doublet rate = 1.2%\n", "Estimated detectable doublet fraction = 40.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.0%\n", "Automatically set threshold at doublet score = 0.26\n", "Detected doublet rate = 1.9%\n", "Estimated detectable doublet fraction = 54.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.5%\n", "Automatically set threshold at doublet score = 0.28\n", "Detected doublet rate = 2.2%\n", "Estimated detectable doublet fraction = 43.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.1%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 2.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.8%\n", "Automatically set threshold at doublet score = 0.70\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 0.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.5%\n", "Automatically set threshold at doublet score = 0.36\n", "Detected doublet rate = 1.0%\n", "Estimated detectable doublet fraction = 32.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.2%\n", "Automatically set threshold at doublet score = 0.25\n", "Detected doublet rate = 2.3%\n", "Estimated detectable doublet fraction = 51.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.5%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.9%\n", "Estimated detectable doublet fraction = 46.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.0%\n", "Automatically set threshold at doublet score = 0.32\n", "Detected doublet rate = 1.4%\n", "Estimated detectable doublet fraction = 39.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.5%\n", "Automatically set threshold at doublet score = 0.69\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 8.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.7%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 1.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.1%\n", "Automatically set threshold at doublet score = 0.33\n", "Detected doublet rate = 1.2%\n", "Estimated detectable doublet fraction = 41.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.9%\n", "Automatically set threshold at doublet score = 0.32\n", "Detected doublet rate = 1.5%\n", "Estimated detectable doublet fraction = 49.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.1%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.8%\n", "Estimated detectable doublet fraction = 45.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.9%\n", "Automatically set threshold at doublet score = 0.32\n", "Detected doublet rate = 1.7%\n", "Estimated detectable doublet fraction = 42.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.0%\n", "Automatically set threshold at doublet score = 0.38\n", "Detected doublet rate = 1.1%\n", "Estimated detectable doublet fraction = 32.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.2%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.6%\n", "Estimated detectable doublet fraction = 38.2%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.2%\n", "Automatically set threshold at doublet score = 0.47\n", "Detected doublet rate = 0.6%\n", "Estimated detectable doublet fraction = 21.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.9%\n", "Automatically set threshold at doublet score = 0.36\n", "Detected doublet rate = 1.3%\n", "Estimated detectable doublet fraction = 34.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.9%\n", "Automatically set threshold at doublet score = 0.38\n", "Detected doublet rate = 1.0%\n", "Estimated detectable doublet fraction = 28.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.7%\n", "Automatically set threshold at doublet score = 0.52\n", "Detected doublet rate = 0.3%\n", "Estimated detectable doublet fraction = 20.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.6%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.8%\n", "Estimated detectable doublet fraction = 44.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.2%\n", "Automatically set threshold at doublet score = 0.72\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.7%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.7%\n", "Estimated detectable doublet fraction = 41.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.3%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.0%\n", "Automatically set threshold at doublet score = 0.34\n", "Detected doublet rate = 1.2%\n", "Estimated detectable doublet fraction = 42.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.9%\n", "Automatically set threshold at doublet score = 0.67\n", "Detected doublet rate = 0.2%\n", "Estimated detectable doublet fraction = 8.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.2%\n", "Automatically set threshold at doublet score = 0.66\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 7.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.8%\n", "Automatically set threshold at doublet score = 0.45\n", "Detected doublet rate = 0.7%\n", "Estimated detectable doublet fraction = 20.6%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.4%\n", "Automatically set threshold at doublet score = 0.21\n", "Detected doublet rate = 3.7%\n", "Estimated detectable doublet fraction = 50.2%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 7.4%\n", "Automatically set threshold at doublet score = 0.32\n", "Detected doublet rate = 1.7%\n", "Estimated detectable doublet fraction = 36.6%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.6%\n", "Automatically set threshold at doublet score = 0.36\n", "Detected doublet rate = 1.2%\n", "Estimated detectable doublet fraction = 36.0%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.3%\n", "Automatically set threshold at doublet score = 0.29\n", "Detected doublet rate = 2.0%\n", "Estimated detectable doublet fraction = 44.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.5%\n", "Automatically set threshold at doublet score = 0.25\n", "Detected doublet rate = 2.5%\n", "Estimated detectable doublet fraction = 45.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.5%\n", "Automatically set threshold at doublet score = 0.28\n", "Detected doublet rate = 2.3%\n", "Estimated detectable doublet fraction = 45.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.0%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 2.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.8%\n", "Automatically set threshold at doublet score = 0.68\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.7%\n", "Automatically set threshold at doublet score = 0.38\n", "Detected doublet rate = 1.0%\n", "Estimated detectable doublet fraction = 31.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.3%\n", "Automatically set threshold at doublet score = 0.34\n", "Detected doublet rate = 1.2%\n", "Estimated detectable doublet fraction = 34.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.4%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.6%\n", "Estimated detectable doublet fraction = 33.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.7%\n", "Automatically set threshold at doublet score = 0.27\n", "Detected doublet rate = 2.3%\n", "Estimated detectable doublet fraction = 43.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.3%\n", "Automatically set threshold at doublet score = 0.40\n", "Detected doublet rate = 1.0%\n", "Estimated detectable doublet fraction = 32.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.1%\n", "Automatically set threshold at doublet score = 0.70\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.3%\n", "Automatically set threshold at doublet score = 0.34\n", "Detected doublet rate = 1.4%\n", "Estimated detectable doublet fraction = 36.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.8%\n", "Automatically set threshold at doublet score = 0.33\n", "Detected doublet rate = 2.0%\n", "Estimated detectable doublet fraction = 40.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.0%\n", "Automatically set threshold at doublet score = 0.67\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 6.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.0%\n", "Automatically set threshold at doublet score = 0.42\n", "Detected doublet rate = 0.9%\n", "Estimated detectable doublet fraction = 31.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.8%\n", "Automatically set threshold at doublet score = 0.26\n", "Detected doublet rate = 2.7%\n", "Estimated detectable doublet fraction = 50.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.2%\n", "Automatically set threshold at doublet score = 0.47\n", "Detected doublet rate = 0.6%\n", "Estimated detectable doublet fraction = 19.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.0%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 2.0%\n", "Estimated detectable doublet fraction = 41.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.8%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 1.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.4%\n", "Automatically set threshold at doublet score = 0.45\n", "Detected doublet rate = 0.6%\n", "Estimated detectable doublet fraction = 18.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.2%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.5%\n", "Estimated detectable doublet fraction = 46.6%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.3%\n", "Automatically set threshold at doublet score = 0.63\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 4.2%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.0%\n", "Automatically set threshold at doublet score = 0.35\n", "Detected doublet rate = 1.2%\n", "Estimated detectable doublet fraction = 41.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.9%\n", "Automatically set threshold at doublet score = 0.27\n", "Detected doublet rate = 2.3%\n", "Estimated detectable doublet fraction = 42.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 5.4%\n", "Automatically set threshold at doublet score = 0.34\n", "Detected doublet rate = 1.0%\n", "Estimated detectable doublet fraction = 34.6%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.0%\n", "Automatically set threshold at doublet score = 0.72\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 1.9%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.7%\n", "Automatically set threshold at doublet score = 0.40\n", "Detected doublet rate = 0.7%\n", "Estimated detectable doublet fraction = 25.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.8%\n", "Automatically set threshold at doublet score = 0.37\n", "Detected doublet rate = 1.3%\n", "Estimated detectable doublet fraction = 31.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.2%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.6%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.1%\n", "Automatically set threshold at doublet score = 0.70\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 2.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.2%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 2.1%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.9%\n", "Automatically set threshold at doublet score = 0.71\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 1.4%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.2%\n", "Automatically set threshold at doublet score = 0.73\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 2.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 3.0%\n", "Automatically set threshold at doublet score = 0.69\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.7%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.9%\n", "Automatically set threshold at doublet score = 0.31\n", "Detected doublet rate = 1.8%\n", "Estimated detectable doublet fraction = 40.8%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 4.3%\n", "Automatically set threshold at doublet score = 0.68\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 3.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.6%\n", "Automatically set threshold at doublet score = 0.70\n", "Detected doublet rate = 0.1%\n", "Estimated detectable doublet fraction = 4.3%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 1.8%\n", "Automatically set threshold at doublet score = 0.73\n", "Detected doublet rate = 0.0%\n", "Estimated detectable doublet fraction = 1.5%\n", "Overall doublet rate:\n", "\tExpected = 5.0%\n", "\tEstimated = 2.6%\n" ] } ], "source": [ "from concurrent.futures import ThreadPoolExecutor\n", "\n", "results = []\n", "\n", "with ThreadPoolExecutor(max_workers=20) as executor: \n", " for result in executor.map(process_file, meta_data['file.path']):\n", " results.append(result)" ] }, { "cell_type": "code", "execution_count": 9, "id": "25d06af9-9857-4a99-88ce-83c0a1d99654", "metadata": { "tags": [] }, "outputs": [], "source": [ "final_result = pd.concat(results, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 10, "id": "53bb1f48-d0bc-41d4-a906-0c6ebc18568d", "metadata": { "tags": [] }, "outputs": [], "source": [ "final_result.to_parquet('doublet_score.parquet')" ] }, { "cell_type": "code", "execution_count": 11, "id": "2636730c-bbee-4a62-9b54-9b160a9bbf73", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "predicted_doublet\n", "False 2065817\n", "True 27261\n", "Name: count, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_result['predicted_doublet'].value_counts()" ] }, { "cell_type": "code", "execution_count": 10, "id": "83f7adfb-8481-45aa-a729-5c196ee0c728", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "predicted_doublet\n", "False 2066388\n", "True 27399\n", "Name: count, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_result['predicted_doublet'].value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "id": "2daea89e-8307-4b74-9064-2a1aba3c0294", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0.013085858303638336" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "27399/(2066388+27399)" ] }, { "cell_type": "code", "execution_count": null, "id": "cdb4af83-9148-4c73-a229-274fcf777fb1", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }