{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-title",
   "metadata": {},
   "source": [
    "# Build a combined raw AnnData object from per-file HDF5 inputs\n",
    "\n",
    "For every `.h5` file listed in the metadata CSV this notebook reads the sparse count matrix, the gene names and the per-cell metadata, wraps them in an `AnnData` object, concatenates all objects and writes the result to a single `.h5ad` file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3f29b888-622d-41b4-838e-fe977defc551",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import celltypist\n",
    "from celltypist import models\n",
    "import scanpy as sc\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import anndata\n",
    "import re\n",
    "import h5py\n",
    "import scipy.sparse as scs\n",
    "import concurrent.futures"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-config",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfg-paths",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metadata table with one row per input file.\n",
    "METADATA_CSV = 'hise_meta_data_2023-11-19.csv'\n",
    "# Column of the metadata table that holds the path of each .h5 file.\n",
    "FILE_PATH_COLUMN = 'file.path'\n",
    "# Destination of the concatenated raw object.\n",
    "OUTPUT_H5AD = 'adata_all_raw.h5ad'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-helpers",
   "metadata": {},
   "source": [
    "## HDF5 reading helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "80c98f5b-1226-4549-b360-50901a977a8b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def _decode_if_bytes(values):\n",
    "    \"\"\"Return `values` with byte strings decoded as UTF-8.\n",
    "\n",
    "    Anything not stored as bytes (numeric data, already-decoded strings,\n",
    "    empty arrays) is returned unchanged.\n",
    "    \"\"\"\n",
    "    # Check the length first: indexing element 0 of an empty dataset raises IndexError.\n",
    "    if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):\n",
    "        return [x.decode('UTF-8') for x in values]\n",
    "    return values\n",
    "\n",
    "\n",
    "def read_mat(h5_con):\n",
    "    \"\"\"Build the sparse count matrix stored under `h5_con['matrix']`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    h5_con : h5py.File (or any mapping with the same layout)\n",
    "        Open handle exposing `matrix/data`, `matrix/indices`,\n",
    "        `matrix/indptr` and `matrix/shape`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    scipy.sparse.csc_matrix\n",
    "        Matrix with the stored shape. `read_h5_anndata` transposes it, so\n",
    "        the columns of this matrix become the observations.\n",
    "    \"\"\"\n",
    "    matrix_group = h5_con['matrix']\n",
    "    return scs.csc_matrix(\n",
    "        (matrix_group['data'][:],      # Count values\n",
    "         matrix_group['indices'][:],   # Row indices\n",
    "         matrix_group['indptr'][:]),   # Pointers for column positions\n",
    "        shape=tuple(matrix_group['shape'][:])  # Matrix dimensions\n",
    "    )\n",
    "\n",
    "\n",
    "def read_obs(h5con):\n",
    "    \"\"\"Read the observation (per-cell metadata) table of an open HDF5 file.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    h5con : h5py.File (or any mapping with the same layout)\n",
    "        Open handle exposing `matrix/barcodes` and the group\n",
    "        `matrix/observations`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        A `barcodes` column followed by one column per dataset found in\n",
    "        `matrix/observations`; byte strings are decoded as UTF-8.\n",
    "    \"\"\"\n",
    "    matrix_group = h5con['matrix']\n",
    "    observations = matrix_group['observations']\n",
    "\n",
    "    # Collect every column first and build the DataFrame once.\n",
    "    columns = {'barcodes': _decode_if_bytes(matrix_group['barcodes'][:])}\n",
    "    for col in observations.keys():\n",
    "        columns[col] = _decode_if_bytes(observations[col][:])\n",
    "\n",
    "    return pd.DataFrame(columns)\n",
    "\n",
    "\n",
    "def read_h5_anndata(h5_file):\n",
    "    \"\"\"Construct an AnnData object from one HDF5 file.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    h5_file : path-like\n",
    "        Location of the file; passed to `h5py.File` and opened read-only.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    anndata.AnnData\n",
    "        The transposed count matrix with `read_obs` output as `obs` and\n",
    "        the entries of `matrix/features/name` as unique `var_names`.\n",
    "    \"\"\"\n",
    "    # The context manager closes the handle even if reading fails. Every\n",
    "    # dataset is copied into memory with `[:]` before the file is released.\n",
    "    with h5py.File(h5_file, mode='r') as h5_con:\n",
    "        mat = read_mat(h5_con)\n",
    "        genes = _decode_if_bytes(h5_con['matrix']['features']['name'][:])\n",
    "        obs_df = read_obs(h5_con)\n",
    "\n",
    "    # Transpose so that rows line up with obs_df and columns with the gene names.\n",
    "    adata = anndata.AnnData(mat.T, obs=obs_df)\n",
    "    adata.var_names = genes\n",
    "    # Repeated gene names would make var_names ambiguous; make them unique.\n",
    "    adata.var_names_make_unique()\n",
    "    return adata"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load every file and concatenate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c18be489-3d24-4206-9a30-38c63adfb302",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "meta_data = pd.read_csv(METADATA_CSV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cf38a407-98f4-417a-a2df-2cb4df74f37b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "results = [read_h5_anndata(file_name) for file_name in meta_data[FILE_PATH_COLUMN]]\n",
    "\n",
    "# NOTE(review): each object keeps the default 0..n-1 obs index, so obs names repeat\n",
    "# after concatenation; consider indexing obs by 'barcodes' if downstream code needs\n",
    "# unique obs names - confirm with the consumers of the output file first.\n",
    "adata = anndata.concat(results)\n",
    "\n",
    "# Concatenating along observations must keep every cell.\n",
    "assert adata.n_obs == sum(result.n_obs for result in results)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-save",
   "metadata": {},
   "source": [
    "## Save the combined object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ef9c2838-e688-444c-aef3-ce3e6cc980dd",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "... storing 'batch_id' as categorical\n",
      "... storing 'cell_name' as categorical\n",
      "... storing 'chip_id' as categorical\n",
      "... storing 'hto_barcode' as categorical\n",
      "... storing 'hto_category' as categorical\n",
      "... storing 'original_barcodes' as categorical\n",
      "... storing 'pbmc_sample_id' as categorical\n",
      "... storing 'pool_id' as categorical\n",
      "... storing 'seurat_pbmc_type' as categorical\n",
      "... storing 'well_id' as categorical\n"
     ]
    }
   ],
   "source": [
    "adata.write_h5ad(OUTPUT_H5AD)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}