{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import scanpy as sc\n", "from scipy import stats\n", "import os\n", "from sklearn.cluster import KMeans\n", "from sklearn.cluster import AgglomerativeClustering\n", "from sklearn.metrics.cluster import adjusted_rand_score\n", "from sklearn.metrics.cluster import adjusted_mutual_info_score\n", "from sklearn.metrics.cluster import homogeneity_score\n", "import rpy2.robjects as robjects\n", "from rpy2.robjects import pandas2ri" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',\n", "# 'AMI_Louvain','AMI_kmeans','AMI_HC',\n", "# 'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "workdir = './output/'\n", "path_fm = os.path.join(workdir,'feature_matrices/')\n", "path_clusters = os.path.join(workdir,'clusters/')\n", "path_metrics = os.path.join(workdir,'metrics/')\n", "os.system('mkdir -p '+path_clusters)\n", "os.system('mkdir -p '+path_metrics)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8\n" ] } ], "source": [ "metadata = pd.read_csv('./input/metadata.tsv',sep='\\t',index_col=0)\n", "num_clusters = len(np.unique(metadata['label']))\n", "print(num_clusters)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "17" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "files = [x for x in os.listdir(path_fm) if x.startswith('FM')]\n", "len(files)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "['FM_ChromVAR_10xpbmc5k_motifs.rds',\n", " 'FM_ChromVAR_10xpbmc5k_kmers.rds',\n", " 'FM_cisTopic_10xpbmc5k.rds',\n", " 'FM_SnapATAC_10xpbmc5k.rds',\n", " 'FM_BROCKMAN_10xpbmc5k.rds',\n", " 'FM_SCRAT_10xpbmc5k_motifs.rds',\n", " 'FM_Cusanovich2018_10xpbmc5k.rds',\n", " 'FM_Control_10xpbmc5k.rds',\n", " 'FM_GeneScoring_10xpbmc5k.rds',\n", " 'FM_Scasat_10xpbmc5k.rds',\n", " 'FM_scABC_10xpbmc5k.rds',\n", " 'FM_Cicero_10xpbmc5k.rds',\n", " 'FM_ChromVAR_10xpbmc_kmers_pca.rds',\n", " 'FM_ChromVAR_10xpbmc_motifs_pca.rds',\n", " 'FM_GeneScoring_10xpbmc_pca.rds',\n", " 'FM_Cicero_10xpbmc_pca.rds',\n", " 'FM_SCRAT_10xpbmc_pca.rds']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "files" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):\n", " this_step = 0\n", " this_min = float(range_min)\n", " this_max = float(range_max)\n", " while this_step < max_steps:\n", " print('step ' + str(this_step))\n", " this_resolution = this_min + ((this_max-this_min)/2)\n", " sc.tl.louvain(adata,resolution=this_resolution)\n", " this_clusters = adata.obs['louvain'].nunique()\n", " \n", " print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))\n", " \n", " if this_clusters > n_cluster:\n", " this_max = this_resolution\n", " elif this_clusters < n_cluster:\n", " this_min = this_resolution\n", " else:\n", " return(this_resolution, adata)\n", " this_step += 1\n", " \n", " print('Cannot find the number of clusters')\n", " print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChromVAR_motifs\n", "step 0\n", "got 8 at resolution 1.5\n", "ChromVAR_kmers\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 6 at resolution 1.5\n", "step 1\n", "got 17 at resolution 2.25\n", "step 2\n", "got 9 at resolution 1.875\n", "step 3\n", "got 6 at resolution 1.6875\n", "step 4\n", "got 7 at resolution 1.78125\n", "step 5\n", "got 8 at resolution 1.828125\n", "cisTopic\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 17 at resolution 1.5\n", "step 1\n", "got 14 at resolution 0.75\n", "step 2\n", "got 9 at resolution 0.375\n", "step 3\n", "got 6 at resolution 0.1875\n", "step 4\n", "got 8 at resolution 0.28125\n", "SnapATAC\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 19 at resolution 1.5\n", "step 1\n", "got 14 at resolution 0.75\n", "step 2\n", "got 11 at resolution 0.375\n", "step 3\n", "got 6 at resolution 0.1875\n", "step 4\n", "got 8 at resolution 0.28125\n", "BROCKMAN\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 20 at resolution 1.5\n", "step 1\n", "got 13 at resolution 0.75\n", "step 2\n", "got 8 at resolution 0.375\n", "SCRAT_motifs\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 19 at resolution 1.5\n", "step 1\n", "got 14 at resolution 0.75\n", "step 2\n", "got 8 at resolution 0.375\n", "Cusanovich2018\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 17 at resolution 1.5\n", "step 1\n", "got 11 at resolution 0.75\n", "step 2\n", "got 8 at resolution 0.375\n", "Control\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 22 at resolution 1.5\n", "step 1\n", "got 13 at resolution 0.75\n", "step 2\n", "got 7 at resolution 0.375\n", "step 3\n", "got 10 at resolution 0.5625\n", "step 4\n", "got 9 at resolution 0.46875\n", "step 5\n", "got 8 at resolution 0.421875\n", "GeneScoring\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 8 at resolution 1.5\n", "Scasat\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 25 at resolution 1.5\n", "step 1\n", "got 17 at resolution 0.75\n", "step 2\n", "got 11 at resolution 0.375\n", "step 3\n", "got 6 at resolution 0.1875\n", "step 4\n", "got 9 at resolution 0.28125\n", "step 5\n", "got 6 at resolution 0.234375\n", "step 6\n", "got 6 at resolution 0.2578125\n", "step 7\n", "got 8 at resolution 0.26953125\n", "scABC\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 10 at resolution 1.5\n", "step 1\n", "got 5 at resolution 0.75\n", "step 2\n", "got 7 at resolution 1.125\n", "step 3\n", "got 10 at resolution 1.3125\n", "step 4\n", "got 8 at resolution 1.21875\n", "Cicero\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 8 at resolution 1.5\n", "ChromVAR_kmers_pca\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 10 at resolution 1.5\n", "step 1\n", "got 6 at resolution 0.75\n", "step 2\n", "got 8 at resolution 1.125\n", "ChromVAR_motifs_pca\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 9 at resolution 1.5\n", "step 1\n", "got 6 at resolution 0.75\n", "step 2\n", "got 7 at resolution 1.125\n", "step 3\n", "got 7 at resolution 1.3125\n", "step 4\n", "got 8 at resolution 1.40625\n", "GeneScoring_pca\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 15 at resolution 1.5\n", "step 1\n", "got 9 at resolution 0.75\n", "step 2\n", "got 7 at resolution 0.375\n", "step 3\n", "got 7 at resolution 0.5625\n", "step 4\n", "got 7 at resolution 0.65625\n", "step 5\n", "got 7 at resolution 0.703125\n", "step 6\n", "got 9 at resolution 0.7265625\n", "step 7\n", "got 7 at resolution 0.71484375\n", "step 8\n", "got 8 at resolution 0.720703125\n", "Cicero_pca\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 12 at resolution 1.5\n", "step 1\n", "got 5 at resolution 0.75\n", "step 2\n", "got 11 at resolution 1.125\n", "step 3\n", "got 7 at resolution 0.9375\n", "step 4\n", "got 10 at resolution 1.03125\n", "step 5\n", "got 7 at resolution 0.984375\n", "step 6\n", "got 9 at resolution 1.0078125\n", "step 7\n", "got 8 at resolution 0.99609375\n", "SCRAT_pca\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", " res = PandasDataFrame.from_items(items)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "step 0\n", "got 23 at resolution 1.5\n", "step 1\n", "got 16 at resolution 0.75\n", "step 2\n", "got 11 at resolution 0.375\n", "step 3\n", "got 6 at resolution 0.1875\n", "step 4\n", "got 9 at resolution 0.28125\n", "step 5\n", "got 7 at resolution 0.234375\n", "step 6\n", "got 8 at resolution 0.2578125\n" ] } ], "source": [ "for file in files:\n", " file_split = file.split('_')\n", " method = file_split[1]\n", " dataset = file_split[2].split('.')[0]\n", " if(len(file_split)>3):\n", " method = method + '_' + '_'.join(file_split[3:]).split('.')[0]\n", " print(method)\n", "\n", " pandas2ri.activate()\n", " readRDS = robjects.r['readRDS']\n", " df_rds = readRDS(os.path.join(path_fm,file))\n", " fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))\n", " fm_mat.fillna(0,inplace=True)\n", " fm_mat.columns = metadata.index\n", " \n", " adata = sc.AnnData(fm_mat.T)\n", " adata.var_names_make_unique()\n", " adata.obs = metadata.loc[adata.obs.index,]\n", "# df_metrics.loc[method,] = \"\"\n", " #Louvain\n", " sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')\n", "# sc.tl.louvain(adata)\n", " getNClusters(adata,n_cluster=num_clusters)\n", " #kmeans\n", " kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)\n", " adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')\n", " #hierachical clustering\n", " hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)\n", " adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')\n", "# #clustering metrics\n", " \n", "# #adjusted rank index\n", "# ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])\n", "# ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])\n", "# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])\n", "# #adjusted mutual information\n", "# ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')\n", "# ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic') \n", "# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')\n", "# #homogeneity\n", "# homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])\n", "# homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])\n", "# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])\n", "\n", "# df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]\n", "# df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]\n", "# df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc] \n", " adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\\t')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def residual_average_gini_index(gene_scores,folder_clusters,\n", " housekeeping_genes,marker_genes,\n", " min_cells_per_cluster=10):\n", "\n", " #Subset from the main matrix the housekeeping genes and marker genes\n", " df_matrix_housekeeping=gene_scores.loc[gene_scores.index.intersection(housekeeping_genes),]\n", " df_matrix_marker=gene_scores.loc[gene_scores.index.intersection(marker_genes),]\n", " \n", " #Define a function to compute the Gini score\n", " def gini(list_of_values):\n", " sorted_list = sorted(list_of_values)\n", " height, area = 0, 0\n", " for value in sorted_list:\n", " height += value\n", " area += height - value / 2.\n", " fair_area = height * len(list_of_values) / 2.\n", " return (fair_area - area) / fair_area\n", " \n", " #Function to calculate Gini value for all the genes\n", " def calculate_gini(df_matrix, gene_name,clustering_info):\n", " return gini(get_avg_per_cluster(df_matrix,gene_name,clustering_info,use_log2=False))\n", "\n", " #Function to calculate Gini value for all the genes\n", " def calculate_gini_values(df_matrix,clustering_info):\n", " gini_values=[]\n", " for gene_name in df_matrix.index:\n", " gini_values.append(calculate_gini(df_matrix, gene_name,clustering_info))\n", " return gini_values\n", " \n", " #Write a function to compute delta difference of the average accessibility in Marker vs Housekeeping and Kolmogorov Smirnov test\n", " def score_clustering_solution(df_matrix_marker,df_matrix_housekeeping,clustering_info):\n", " gini_values_housekeeping=calculate_gini_values(df_matrix_housekeeping,clustering_info)\n", " gini_values_marker=calculate_gini_values(df_matrix_marker,clustering_info)\n", " statistic,p_value=stats.ks_2samp(gini_values_marker,gini_values_housekeeping)\n", " \n", " return np.mean(gini_values_marker), np.mean(gini_values_housekeeping),np.mean(gini_values_marker)-np.mean(gini_values_housekeeping), statistic,p_value\n", "\n", " #Function to compute the average accessibility value per cluster\n", " def get_avg_per_cluster(df_matrix, gene_name, clustering_info,use_log2=False):\n", " N_clusters=len(clustering_info.index.unique())\n", " avg_per_cluster=np.zeros(N_clusters)\n", " for idx,idx_cluster in enumerate(sorted(np.unique(clustering_info.index.unique()))):\n", " if use_log2:\n", " values_cluster=df_matrix.loc[gene_name,clustering_info.loc[idx_cluster,:].values.flatten()].apply(lambda x:np.log2(x+1))\n", " else:\n", " values_cluster=df_matrix.loc[gene_name,clustering_info.loc[idx_cluster,:].values.flatten()]\n", " \n", " avg_per_cluster[idx]=values_cluster.mean()\n", " if avg_per_cluster[idx]>0:\n", " avg_per_cluster[idx]=avg_per_cluster[idx]#/values_cluster.std()\n", "\n", " return avg_per_cluster\n", " \n", "\n", " #Run the method for all the clustering solutions\n", " \n", " df_metrics = pd.DataFrame(columns=['Method','Clustering','Gini_Marker_Genes','Gini_Housekeeping_Genes','Difference','KS_statistics','p-value']) \n", " \n", " for clusters_filename in os.listdir(folder_clusters): \n", " method = '_'.join(clusters_filename.split('_')[:-1])\n", " print(method)\n", " df_clusters = pd.read_csv(os.path.join(folder_clusters,clusters_filename),sep='\\t',index_col=0)\n", " for clustering_method in df_clusters.columns:\n", " clustering_info = pd.DataFrame(df_clusters[clustering_method])\n", " clustering_info['Barcode'] = clustering_info.index\n", " clustering_info=clustering_info.set_index(clustering_method)\n", " \n", " #REMOVE CLUSTERS WITH FEW CELLS\n", " cluster_sizes=pd.value_counts(clustering_info.index)\n", " clustering_info=clustering_info.loc[cluster_sizes[cluster_sizes>min_cells_per_cluster].index.values,:]\n", "\n", "\n", " mean_gini_marker,mean_gini_housekeeping,mean_gini_difference,statistics,p_value=score_clustering_solution(df_matrix_marker,df_matrix_housekeeping,clustering_info)\n", "\n", " df_metrics = df_metrics.append({'Method': method,'Clustering':clustering_method,\n", " 'Gini_Marker_Genes':mean_gini_marker,'Gini_Housekeeping_Genes':mean_gini_housekeeping,\n", " 'Difference':mean_gini_difference,'KS_statistics':statistics,'p-value':p_value},\n", " ignore_index=True) \n", " \n", " return df_metrics" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "gene_scores = pd.read_csv('./run_methods/GeneScoring/FM_GeneScoring_10xpbmc5k.tsv',\n", " sep = '\\t',index_col=0)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | AAACGAAAGCGCAATG-1 | \n", "AAACGAAAGGGTATCG-1 | \n", "AAACGAAAGTAACATG-1 | \n", "AAACGAAAGTTACACC-1 | \n", "AAACGAACAGAGATGC-1 | \n", "AAACGAACATGCTATG-1 | \n", "AAACGAAGTGCATCAT-1 | \n", "AAACGAAGTGGACGAT-1 | \n", "AAACGAAGTGGCCTCA-1 | \n", "AAACGAATCAGTGTAC-1 | \n", "... | \n", "TTTGGTTGTCAGAAAT-1 | \n", "TTTGGTTGTTGTATCG-1 | \n", "TTTGGTTTCAGTGGTT-1 | \n", "TTTGTGTAGGAAACTT-1 | \n", "TTTGTGTCAAGCCTTA-1 | \n", "TTTGTGTCACTCAAGT-1 | \n", "TTTGTGTCACTGGGCT-1 | \n", "TTTGTGTGTACGCAAG-1 | \n", "TTTGTGTGTCTGCGCA-1 | \n", "TTTGTGTTCAACTTGG-1 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A1BG | \n", "0.086201 | \n", "1.775694 | \n", "3.591624 | \n", "0.212587 | \n", "0.040847 | \n", "0.004313 | \n", "0.002157 | \n", "0.176512 | \n", "0.000518 | \n", "0.007131 | \n", "... | \n", "0.041391 | \n", "3.588490 | \n", "1.822400 | \n", "0.045678 | \n", "5.371366 | \n", "0.002675 | \n", "2.454377 | \n", "1.984945 | \n", "1.732547 | \n", "1.730391 | \n", "
A1BG-AS1 | \n", "0.458201 | \n", "0.478351 | \n", "1.365454 | \n", "0.930858 | \n", "0.018763 | \n", "0.023040 | \n", "0.011713 | \n", "0.943315 | \n", "0.000097 | \n", "0.038674 | \n", "... | \n", "0.011158 | \n", "1.345947 | \n", "0.922146 | \n", "0.042287 | \n", "2.255818 | \n", "0.012003 | \n", "3.702715 | \n", "1.393866 | \n", "0.444438 | \n", "0.432918 | \n", "
A2LD1 | \n", "0.000363 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
A2M | \n", "0.000000 | \n", "0.054960 | \n", "0.000665 | \n", "0.000000 | \n", "2.205809 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.000000 | \n", "0.007216 | \n", "2.258926 | \n", "0.000000 | \n", "0.000819 | \n", "0.000000 | \n", "0.001228 | \n", "0.000665 | \n", "0.000000 | \n", "0.000000 | \n", "
A2M-AS1 | \n", "0.000000 | \n", "0.051265 | \n", "0.000621 | \n", "0.000000 | \n", "2.057506 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "... | \n", "0.000000 | \n", "0.006731 | \n", "2.107052 | \n", "0.000000 | \n", "0.000764 | \n", "0.000000 | \n", "0.001146 | \n", "0.000621 | \n", "0.000000 | \n", "0.000000 | \n", "
5 rows × 5335 columns
\n", "\n", " | Method | \n", "Clustering | \n", "Gini_Marker_Genes | \n", "Gini_Housekeeping_Genes | \n", "Difference | \n", "KS_statistics | \n", "p-value | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "ChromVAR_motifs | \n", "louvain | \n", "0.329794 | \n", "0.202917 | \n", "0.126878 | \n", "0.535714 | \n", "0.001101 | \n", "
1 | \n", "ChromVAR_motifs | \n", "kmeans | \n", "0.343745 | \n", "0.237399 | \n", "0.106345 | \n", "0.464286 | \n", "0.007129 | \n", "
2 | \n", "ChromVAR_motifs | \n", "hc | \n", "0.381168 | \n", "0.290041 | \n", "0.091127 | \n", "0.500000 | \n", "0.002897 | \n", "
3 | \n", "ChromVAR_kmers | \n", "louvain | \n", "0.309164 | \n", "0.178850 | \n", "0.130315 | \n", "0.535714 | \n", "0.001101 | \n", "
4 | \n", "ChromVAR_kmers | \n", "kmeans | \n", "0.340088 | \n", "0.263170 | \n", "0.076918 | \n", "0.488095 | \n", "0.003940 | \n", "
5 | \n", "ChromVAR_kmers | \n", "hc | \n", "0.319202 | \n", "0.241089 | \n", "0.078113 | \n", "0.523810 | \n", "0.001532 | \n", "
6 | \n", "cisTopic | \n", "louvain | \n", "0.369119 | \n", "0.254688 | \n", "0.114431 | \n", "0.571429 | \n", "0.000392 | \n", "
7 | \n", "cisTopic | \n", "kmeans | \n", "0.417319 | \n", "0.316060 | \n", "0.101259 | \n", "0.500000 | \n", "0.002897 | \n", "
8 | \n", "cisTopic | \n", "hc | \n", "0.468534 | \n", "0.362493 | \n", "0.106041 | \n", "0.500000 | \n", "0.002897 | \n", "
9 | \n", "SnapATAC | \n", "louvain | \n", "0.332478 | \n", "0.195798 | \n", "0.136680 | \n", "0.452381 | \n", "0.009484 | \n", "
10 | \n", "SnapATAC | \n", "kmeans | \n", "0.361506 | \n", "0.217868 | \n", "0.143637 | \n", "0.571429 | \n", "0.000392 | \n", "
11 | \n", "SnapATAC | \n", "hc | \n", "0.371438 | \n", "0.231429 | \n", "0.140009 | \n", "0.523810 | \n", "0.001532 | \n", "
12 | \n", "BROCKMAN | \n", "louvain | \n", "0.346999 | \n", "0.234125 | \n", "0.112874 | \n", "0.476190 | \n", "0.005320 | \n", "
13 | \n", "BROCKMAN | \n", "kmeans | \n", "0.327789 | \n", "0.262820 | \n", "0.064969 | \n", "0.392857 | \n", "0.035346 | \n", "
14 | \n", "BROCKMAN | \n", "hc | \n", "0.324061 | \n", "0.255878 | \n", "0.068183 | \n", "0.464286 | \n", "0.007129 | \n", "
15 | \n", "SCRAT_motifs | \n", "louvain | \n", "0.347878 | \n", "0.293645 | \n", "0.054233 | \n", "0.476190 | \n", "0.005320 | \n", "
16 | \n", "SCRAT_motifs | \n", "kmeans | \n", "0.287355 | \n", "0.234507 | \n", "0.052848 | \n", "0.392857 | \n", "0.035346 | \n", "
17 | \n", "SCRAT_motifs | \n", "hc | \n", "0.315229 | \n", "0.263967 | \n", "0.051262 | \n", "0.416667 | \n", "0.021353 | \n", "
18 | \n", "Cusanovich2018 | \n", "louvain | \n", "0.388897 | \n", "0.280957 | \n", "0.107940 | \n", "0.464286 | \n", "0.007129 | \n", "
19 | \n", "Cusanovich2018 | \n", "kmeans | \n", "0.254027 | \n", "0.177305 | \n", "0.076722 | \n", "0.464286 | \n", "0.007129 | \n", "
20 | \n", "Cusanovich2018 | \n", "hc | \n", "0.225468 | \n", "0.141125 | \n", "0.084343 | \n", "0.464286 | \n", "0.007129 | \n", "
21 | \n", "Control | \n", "louvain | \n", "0.443971 | \n", "0.358864 | \n", "0.085107 | \n", "0.464286 | \n", "0.007129 | \n", "
22 | \n", "Control | \n", "kmeans | \n", "0.456480 | \n", "0.453863 | \n", "0.002617 | \n", "0.273810 | \n", "0.280812 | \n", "
23 | \n", "Control | \n", "hc | \n", "0.471765 | \n", "0.467423 | \n", "0.004342 | \n", "0.238095 | \n", "0.448886 | \n", "
24 | \n", "GeneScoring | \n", "louvain | \n", "0.460154 | \n", "0.389402 | \n", "0.070752 | \n", "0.428571 | \n", "0.016413 | \n", "
25 | \n", "GeneScoring | \n", "kmeans | \n", "0.450160 | \n", "0.471894 | \n", "-0.021734 | \n", "0.297619 | \n", "0.197109 | \n", "
26 | \n", "GeneScoring | \n", "hc | \n", "0.503348 | \n", "0.501473 | \n", "0.001875 | \n", "0.285714 | \n", "0.236193 | \n", "
27 | \n", "Scasat | \n", "louvain | \n", "0.472445 | \n", "0.374599 | \n", "0.097846 | \n", "0.500000 | \n", "0.002897 | \n", "
28 | \n", "Scasat | \n", "kmeans | \n", "0.497129 | \n", "0.421512 | \n", "0.075617 | \n", "0.428571 | \n", "0.016413 | \n", "
29 | \n", "Scasat | \n", "hc | \n", "0.477655 | \n", "0.399098 | \n", "0.078557 | \n", "0.464286 | \n", "0.007129 | \n", "
30 | \n", "scABC | \n", "louvain | \n", "0.461698 | \n", "0.441462 | \n", "0.020237 | \n", "0.297619 | \n", "0.197109 | \n", "
31 | \n", "scABC | \n", "kmeans | \n", "0.458673 | \n", "0.454737 | \n", "0.003937 | \n", "0.238095 | \n", "0.448886 | \n", "
32 | \n", "scABC | \n", "hc | \n", "0.425721 | \n", "0.396330 | \n", "0.029391 | \n", "0.321429 | \n", "0.134155 | \n", "
33 | \n", "Cicero | \n", "louvain | \n", "0.354993 | \n", "0.249562 | \n", "0.105431 | \n", "0.547619 | \n", "0.000786 | \n", "
34 | \n", "Cicero | \n", "kmeans | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
35 | \n", "Cicero | \n", "hc | \n", "0.264151 | \n", "0.203767 | \n", "0.060383 | \n", "0.357143 | \n", "0.071204 | \n", "
36 | \n", "ChromVAR_kmers_pca | \n", "louvain | \n", "0.352224 | \n", "0.232139 | \n", "0.120086 | \n", "0.511905 | \n", "0.002114 | \n", "
37 | \n", "ChromVAR_kmers_pca | \n", "kmeans | \n", "0.287473 | \n", "0.227681 | \n", "0.059792 | \n", "0.488095 | \n", "0.003940 | \n", "
38 | \n", "ChromVAR_kmers_pca | \n", "hc | \n", "0.322012 | \n", "0.246821 | \n", "0.075191 | \n", "0.464286 | \n", "0.007129 | \n", "
39 | \n", "ChromVAR_motifs_pca | \n", "louvain | \n", "0.333697 | \n", "0.205199 | \n", "0.128498 | \n", "0.535714 | \n", "0.001101 | \n", "
40 | \n", "ChromVAR_motifs_pca | \n", "kmeans | \n", "0.396433 | \n", "0.301102 | \n", "0.095331 | \n", "0.428571 | \n", "0.016413 | \n", "
41 | \n", "ChromVAR_motifs_pca | \n", "hc | \n", "0.397484 | \n", "0.301730 | \n", "0.095755 | \n", "0.440476 | \n", "0.012522 | \n", "
42 | \n", "GeneScoring_pca | \n", "louvain | \n", "0.517697 | \n", "0.503202 | \n", "0.014495 | \n", "0.261905 | \n", "0.331171 | \n", "
43 | \n", "GeneScoring_pca | \n", "kmeans | \n", "0.488294 | \n", "0.499929 | \n", "-0.011635 | \n", "0.333333 | \n", "0.109433 | \n", "
44 | \n", "GeneScoring_pca | \n", "hc | \n", "0.469333 | \n", "0.473566 | \n", "-0.004233 | \n", "0.285714 | \n", "0.236193 | \n", "
45 | \n", "Cicero_pca | \n", "louvain | \n", "0.470227 | \n", "0.356027 | \n", "0.114200 | \n", "0.488095 | \n", "0.003940 | \n", "
46 | \n", "Cicero_pca | \n", "kmeans | \n", "0.262221 | \n", "0.197641 | \n", "0.064580 | \n", "0.357143 | \n", "0.071204 | \n", "
47 | \n", "Cicero_pca | \n", "hc | \n", "0.250496 | \n", "0.172051 | \n", "0.078446 | \n", "0.464286 | \n", "0.007129 | \n", "
48 | \n", "SCRAT_pca | \n", "louvain | \n", "0.392071 | \n", "0.320236 | \n", "0.071835 | \n", "0.464286 | \n", "0.007129 | \n", "
49 | \n", "SCRAT_pca | \n", "kmeans | \n", "0.287531 | \n", "0.234164 | \n", "0.053367 | \n", "0.392857 | \n", "0.035346 | \n", "
50 | \n", "SCRAT_pca | \n", "hc | \n", "0.278123 | \n", "0.232857 | \n", "0.045266 | \n", "0.404762 | \n", "0.027574 | \n", "