{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scanpy as sc\n",
    "from scipy import stats\n",
    "import os\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.cluster import AgglomerativeClustering\n",
    "from sklearn.metrics.cluster import adjusted_rand_score\n",
    "from sklearn.metrics.cluster import adjusted_mutual_info_score\n",
    "from sklearn.metrics.cluster import homogeneity_score\n",
    "import rpy2.robjects as robjects\n",
    "from rpy2.robjects import pandas2ri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',\n",
    "#                                    'AMI_Louvain','AMI_kmeans','AMI_HC',\n",
    "#                                    'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "workdir = './output/'\n",
    "path_fm = os.path.join(workdir,'feature_matrices/')\n",
    "path_clusters = os.path.join(workdir,'clusters/')\n",
    "path_metrics = os.path.join(workdir,'metrics/')\n",
    "os.system('mkdir -p '+path_clusters)\n",
    "os.system('mkdir -p '+path_metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8\n"
     ]
    }
   ],
   "source": [
    "metadata = pd.read_csv('./input/metadata.tsv',sep='\\t',index_col=0)\n",
    "num_clusters = len(np.unique(metadata['label']))\n",
    "print(num_clusters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files = [x for x in os.listdir(path_fm) if x.startswith('FM')]\n",
    "len(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['FM_ChromVAR_10xpbmc5k_motifs.rds',\n",
       " 'FM_ChromVAR_10xpbmc5k_kmers.rds',\n",
       " 'FM_cisTopic_10xpbmc5k.rds',\n",
       " 'FM_SnapATAC_10xpbmc5k.rds',\n",
       " 'FM_BROCKMAN_10xpbmc5k.rds',\n",
       " 'FM_SCRAT_10xpbmc5k_motifs.rds',\n",
       " 'FM_Cusanovich2018_10xpbmc5k.rds',\n",
       " 'FM_Control_10xpbmc5k.rds',\n",
       " 'FM_GeneScoring_10xpbmc5k.rds',\n",
       " 'FM_Scasat_10xpbmc5k.rds',\n",
       " 'FM_scABC_10xpbmc5k.rds',\n",
       " 'FM_Cicero_10xpbmc5k.rds',\n",
       " 'FM_ChromVAR_10xpbmc_kmers_pca.rds',\n",
       " 'FM_ChromVAR_10xpbmc_motifs_pca.rds',\n",
       " 'FM_GeneScoring_10xpbmc_pca.rds',\n",
       " 'FM_Cicero_10xpbmc_pca.rds',\n",
       " 'FM_SCRAT_10xpbmc_pca.rds']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):\n",
    "    this_step = 0\n",
    "    this_min = float(range_min)\n",
    "    this_max = float(range_max)\n",
    "    while this_step < max_steps:\n",
    "        print('step ' + str(this_step))\n",
    "        this_resolution = this_min + ((this_max-this_min)/2)\n",
    "        sc.tl.louvain(adata,resolution=this_resolution)\n",
    "        this_clusters = adata.obs['louvain'].nunique()\n",
    "        \n",
    "        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))\n",
    "        \n",
    "        if this_clusters > n_cluster:\n",
    "            this_max = this_resolution\n",
    "        elif this_clusters < n_cluster:\n",
    "            this_min = this_resolution\n",
    "        else:\n",
    "            return(this_resolution, adata)\n",
    "        this_step += 1\n",
    "    \n",
    "    print('Cannot find the number of clusters')\n",
    "    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ChromVAR_motifs\n",
      "step 0\n",
      "got 8 at resolution 1.5\n",
      "ChromVAR_kmers\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 6 at resolution 1.5\n",
      "step 1\n",
      "got 17 at resolution 2.25\n",
      "step 2\n",
      "got 9 at resolution 1.875\n",
      "step 3\n",
      "got 6 at resolution 1.6875\n",
      "step 4\n",
      "got 7 at resolution 1.78125\n",
      "step 5\n",
      "got 8 at resolution 1.828125\n",
      "cisTopic\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 17 at resolution 1.5\n",
      "step 1\n",
      "got 14 at resolution 0.75\n",
      "step 2\n",
      "got 9 at resolution 0.375\n",
      "step 3\n",
      "got 6 at resolution 0.1875\n",
      "step 4\n",
      "got 8 at resolution 0.28125\n",
      "SnapATAC\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 19 at resolution 1.5\n",
      "step 1\n",
      "got 14 at resolution 0.75\n",
      "step 2\n",
      "got 11 at resolution 0.375\n",
      "step 3\n",
      "got 6 at resolution 0.1875\n",
      "step 4\n",
      "got 8 at resolution 0.28125\n",
      "BROCKMAN\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 20 at resolution 1.5\n",
      "step 1\n",
      "got 13 at resolution 0.75\n",
      "step 2\n",
      "got 8 at resolution 0.375\n",
      "SCRAT_motifs\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 19 at resolution 1.5\n",
      "step 1\n",
      "got 14 at resolution 0.75\n",
      "step 2\n",
      "got 8 at resolution 0.375\n",
      "Cusanovich2018\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 17 at resolution 1.5\n",
      "step 1\n",
      "got 11 at resolution 0.75\n",
      "step 2\n",
      "got 8 at resolution 0.375\n",
      "Control\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 22 at resolution 1.5\n",
      "step 1\n",
      "got 13 at resolution 0.75\n",
      "step 2\n",
      "got 7 at resolution 0.375\n",
      "step 3\n",
      "got 10 at resolution 0.5625\n",
      "step 4\n",
      "got 9 at resolution 0.46875\n",
      "step 5\n",
      "got 8 at resolution 0.421875\n",
      "GeneScoring\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 8 at resolution 1.5\n",
      "Scasat\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 25 at resolution 1.5\n",
      "step 1\n",
      "got 17 at resolution 0.75\n",
      "step 2\n",
      "got 11 at resolution 0.375\n",
      "step 3\n",
      "got 6 at resolution 0.1875\n",
      "step 4\n",
      "got 9 at resolution 0.28125\n",
      "step 5\n",
      "got 6 at resolution 0.234375\n",
      "step 6\n",
      "got 6 at resolution 0.2578125\n",
      "step 7\n",
      "got 8 at resolution 0.26953125\n",
      "scABC\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 10 at resolution 1.5\n",
      "step 1\n",
      "got 5 at resolution 0.75\n",
      "step 2\n",
      "got 7 at resolution 1.125\n",
      "step 3\n",
      "got 10 at resolution 1.3125\n",
      "step 4\n",
      "got 8 at resolution 1.21875\n",
      "Cicero\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 8 at resolution 1.5\n",
      "ChromVAR_kmers_pca\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 10 at resolution 1.5\n",
      "step 1\n",
      "got 6 at resolution 0.75\n",
      "step 2\n",
      "got 8 at resolution 1.125\n",
      "ChromVAR_motifs_pca\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 9 at resolution 1.5\n",
      "step 1\n",
      "got 6 at resolution 0.75\n",
      "step 2\n",
      "got 7 at resolution 1.125\n",
      "step 3\n",
      "got 7 at resolution 1.3125\n",
      "step 4\n",
      "got 8 at resolution 1.40625\n",
      "GeneScoring_pca\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 15 at resolution 1.5\n",
      "step 1\n",
      "got 9 at resolution 0.75\n",
      "step 2\n",
      "got 7 at resolution 0.375\n",
      "step 3\n",
      "got 7 at resolution 0.5625\n",
      "step 4\n",
      "got 7 at resolution 0.65625\n",
      "step 5\n",
      "got 7 at resolution 0.703125\n",
      "step 6\n",
      "got 9 at resolution 0.7265625\n",
      "step 7\n",
      "got 7 at resolution 0.71484375\n",
      "step 8\n",
      "got 8 at resolution 0.720703125\n",
      "Cicero_pca\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 12 at resolution 1.5\n",
      "step 1\n",
      "got 5 at resolution 0.75\n",
      "step 2\n",
      "got 11 at resolution 1.125\n",
      "step 3\n",
      "got 7 at resolution 0.9375\n",
      "step 4\n",
      "got 10 at resolution 1.03125\n",
      "step 5\n",
      "got 7 at resolution 0.984375\n",
      "step 6\n",
      "got 9 at resolution 1.0078125\n",
      "step 7\n",
      "got 8 at resolution 0.99609375\n",
      "SCRAT_pca\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
      "  res = PandasDataFrame.from_items(items)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step 0\n",
      "got 23 at resolution 1.5\n",
      "step 1\n",
      "got 16 at resolution 0.75\n",
      "step 2\n",
      "got 11 at resolution 0.375\n",
      "step 3\n",
      "got 6 at resolution 0.1875\n",
      "step 4\n",
      "got 9 at resolution 0.28125\n",
      "step 5\n",
      "got 7 at resolution 0.234375\n",
      "step 6\n",
      "got 8 at resolution 0.2578125\n"
     ]
    }
   ],
   "source": [
    "for file in files:\n",
    "    file_split = file.split('_')\n",
    "    method = file_split[1]\n",
    "    dataset = file_split[2].split('.')[0]\n",
    "    if(len(file_split)>3):\n",
    "        method = method + '_' + '_'.join(file_split[3:]).split('.')[0]\n",
    "    print(method)\n",
    "\n",
    "    pandas2ri.activate()\n",
    "    readRDS = robjects.r['readRDS']\n",
    "    df_rds = readRDS(os.path.join(path_fm,file))\n",
    "    fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))\n",
    "    fm_mat.fillna(0,inplace=True)\n",
    "    fm_mat.columns = metadata.index\n",
    "    \n",
    "    adata = sc.AnnData(fm_mat.T)\n",
    "    adata.var_names_make_unique()\n",
    "    adata.obs = metadata.loc[adata.obs.index,]\n",
    "#     df_metrics.loc[method,] = \"\"\n",
    "    #Louvain\n",
    "    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')\n",
    "#     sc.tl.louvain(adata)\n",
    "    getNClusters(adata,n_cluster=num_clusters)\n",
    "    #kmeans\n",
    "    kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)\n",
    "    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')\n",
    "    #hierachical clustering\n",
    "    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)\n",
    "    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')\n",
    "#     #clustering metrics\n",
    "    \n",
    "#     #adjusted rank index\n",
    "#     ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])\n",
    "#     ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])\n",
    "#     ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])\n",
    "#     #adjusted mutual information\n",
    "#     ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')\n",
    "#     ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic')   \n",
    "#     ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')\n",
    "#     #homogeneity\n",
    "#     homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])\n",
    "#     homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])\n",
    "#     homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])\n",
    "\n",
    "#     df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]\n",
    "#     df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]\n",
    "#     df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc] \n",
    "    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def residual_average_gini_index(gene_scores,folder_clusters,\n",
    "                                housekeeping_genes,marker_genes,\n",
    "                                min_cells_per_cluster=10):\n",
    "\n",
    "    #Subset from the main matrix the housekeeping genes and marker genes\n",
    "    df_matrix_housekeeping=gene_scores.loc[gene_scores.index.intersection(housekeeping_genes),]\n",
    "    df_matrix_marker=gene_scores.loc[gene_scores.index.intersection(marker_genes),]\n",
    "    \n",
    "    #Define a function to compute the Gini score\n",
    "    def gini(list_of_values):\n",
    "        sorted_list = sorted(list_of_values)\n",
    "        height, area = 0, 0\n",
    "        for value in sorted_list:\n",
    "            height += value\n",
    "            area += height - value / 2.\n",
    "            fair_area = height * len(list_of_values) / 2.\n",
    "        return (fair_area - area) / fair_area\n",
    "    \n",
    "    #Function to calculate Gini value for all the genes\n",
    "    def calculate_gini(df_matrix, gene_name,clustering_info):\n",
    "        return gini(get_avg_per_cluster(df_matrix,gene_name,clustering_info,use_log2=False))\n",
    "\n",
    "    #Function to calculate Gini value for all the genes\n",
    "    def calculate_gini_values(df_matrix,clustering_info):\n",
    "        gini_values=[]\n",
    "        for gene_name in df_matrix.index:\n",
    "            gini_values.append(calculate_gini(df_matrix, gene_name,clustering_info))\n",
    "        return gini_values\n",
    "    \n",
    "    #Write a function to compute delta difference of the average accessibility in Marker vs Housekeeping and Kolmogorov Smirnov test\n",
    "    def score_clustering_solution(df_matrix_marker,df_matrix_housekeeping,clustering_info):\n",
    "        gini_values_housekeeping=calculate_gini_values(df_matrix_housekeeping,clustering_info)\n",
    "        gini_values_marker=calculate_gini_values(df_matrix_marker,clustering_info)\n",
    "        statistic,p_value=stats.ks_2samp(gini_values_marker,gini_values_housekeeping)\n",
    "        \n",
    "        return  np.mean(gini_values_marker), np.mean(gini_values_housekeeping),np.mean(gini_values_marker)-np.mean(gini_values_housekeeping), statistic,p_value\n",
    "\n",
    "    #Function to compute the average accessibility value per cluster\n",
    "    def get_avg_per_cluster(df_matrix, gene_name, clustering_info,use_log2=False):\n",
    "        N_clusters=len(clustering_info.index.unique())\n",
    "        avg_per_cluster=np.zeros(N_clusters)\n",
    "        for idx,idx_cluster in enumerate(sorted(np.unique(clustering_info.index.unique()))):\n",
    "            if use_log2:\n",
    "                values_cluster=df_matrix.loc[gene_name,clustering_info.loc[idx_cluster,:].values.flatten()].apply(lambda x:np.log2(x+1))\n",
    "            else:\n",
    "                values_cluster=df_matrix.loc[gene_name,clustering_info.loc[idx_cluster,:].values.flatten()]\n",
    "           \n",
    "            avg_per_cluster[idx]=values_cluster.mean()\n",
    "            if avg_per_cluster[idx]>0:\n",
    "                  avg_per_cluster[idx]=avg_per_cluster[idx]#/values_cluster.std()\n",
    "\n",
    "        return avg_per_cluster\n",
    "    \n",
    "\n",
    "    #Run the method for all the clustering solutions\n",
    "    \n",
    "    df_metrics = pd.DataFrame(columns=['Method','Clustering','Gini_Marker_Genes','Gini_Housekeeping_Genes','Difference','KS_statistics','p-value'])    \n",
    "    \n",
    "    for clusters_filename in os.listdir(folder_clusters):     \n",
    "        method = '_'.join(clusters_filename.split('_')[:-1])\n",
    "        print(method)\n",
    "        df_clusters = pd.read_csv(os.path.join(folder_clusters,clusters_filename),sep='\\t',index_col=0)\n",
    "        for clustering_method in df_clusters.columns:\n",
    "            clustering_info = pd.DataFrame(df_clusters[clustering_method])\n",
    "            clustering_info['Barcode'] = clustering_info.index\n",
    "            clustering_info=clustering_info.set_index(clustering_method)\n",
    "            \n",
    "            #REMOVE CLUSTERS WITH FEW CELLS\n",
    "            cluster_sizes=pd.value_counts(clustering_info.index)\n",
    "            clustering_info=clustering_info.loc[cluster_sizes[cluster_sizes>min_cells_per_cluster].index.values,:]\n",
    "\n",
    "\n",
    "            mean_gini_marker,mean_gini_housekeeping,mean_gini_difference,statistics,p_value=score_clustering_solution(df_matrix_marker,df_matrix_housekeeping,clustering_info)\n",
    "\n",
    "            df_metrics = df_metrics.append({'Method': method,'Clustering':clustering_method,\n",
    "                               'Gini_Marker_Genes':mean_gini_marker,'Gini_Housekeeping_Genes':mean_gini_housekeeping,\n",
    "                               'Difference':mean_gini_difference,'KS_statistics':statistics,'p-value':p_value},\n",
    "                              ignore_index=True)  \n",
    "        \n",
    "    return df_metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "gene_scores = pd.read_csv('./run_methods/GeneScoring/FM_GeneScoring_10xpbmc5k.tsv',\n",
    "                         sep = '\\t',index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>AAACGAAAGCGCAATG-1</th>\n",
       "      <th>AAACGAAAGGGTATCG-1</th>\n",
       "      <th>AAACGAAAGTAACATG-1</th>\n",
       "      <th>AAACGAAAGTTACACC-1</th>\n",
       "      <th>AAACGAACAGAGATGC-1</th>\n",
       "      <th>AAACGAACATGCTATG-1</th>\n",
       "      <th>AAACGAAGTGCATCAT-1</th>\n",
       "      <th>AAACGAAGTGGACGAT-1</th>\n",
       "      <th>AAACGAAGTGGCCTCA-1</th>\n",
       "      <th>AAACGAATCAGTGTAC-1</th>\n",
       "      <th>...</th>\n",
       "      <th>TTTGGTTGTCAGAAAT-1</th>\n",
       "      <th>TTTGGTTGTTGTATCG-1</th>\n",
       "      <th>TTTGGTTTCAGTGGTT-1</th>\n",
       "      <th>TTTGTGTAGGAAACTT-1</th>\n",
       "      <th>TTTGTGTCAAGCCTTA-1</th>\n",
       "      <th>TTTGTGTCACTCAAGT-1</th>\n",
       "      <th>TTTGTGTCACTGGGCT-1</th>\n",
       "      <th>TTTGTGTGTACGCAAG-1</th>\n",
       "      <th>TTTGTGTGTCTGCGCA-1</th>\n",
       "      <th>TTTGTGTTCAACTTGG-1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A1BG</th>\n",
       "      <td>0.086201</td>\n",
       "      <td>1.775694</td>\n",
       "      <td>3.591624</td>\n",
       "      <td>0.212587</td>\n",
       "      <td>0.040847</td>\n",
       "      <td>0.004313</td>\n",
       "      <td>0.002157</td>\n",
       "      <td>0.176512</td>\n",
       "      <td>0.000518</td>\n",
       "      <td>0.007131</td>\n",
       "      <td>...</td>\n",
       "      <td>0.041391</td>\n",
       "      <td>3.588490</td>\n",
       "      <td>1.822400</td>\n",
       "      <td>0.045678</td>\n",
       "      <td>5.371366</td>\n",
       "      <td>0.002675</td>\n",
       "      <td>2.454377</td>\n",
       "      <td>1.984945</td>\n",
       "      <td>1.732547</td>\n",
       "      <td>1.730391</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A1BG-AS1</th>\n",
       "      <td>0.458201</td>\n",
       "      <td>0.478351</td>\n",
       "      <td>1.365454</td>\n",
       "      <td>0.930858</td>\n",
       "      <td>0.018763</td>\n",
       "      <td>0.023040</td>\n",
       "      <td>0.011713</td>\n",
       "      <td>0.943315</td>\n",
       "      <td>0.000097</td>\n",
       "      <td>0.038674</td>\n",
       "      <td>...</td>\n",
       "      <td>0.011158</td>\n",
       "      <td>1.345947</td>\n",
       "      <td>0.922146</td>\n",
       "      <td>0.042287</td>\n",
       "      <td>2.255818</td>\n",
       "      <td>0.012003</td>\n",
       "      <td>3.702715</td>\n",
       "      <td>1.393866</td>\n",
       "      <td>0.444438</td>\n",
       "      <td>0.432918</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2LD1</th>\n",
       "      <td>0.000363</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2M</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.054960</td>\n",
       "      <td>0.000665</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.205809</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.007216</td>\n",
       "      <td>2.258926</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000819</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.001228</td>\n",
       "      <td>0.000665</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2M-AS1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.051265</td>\n",
       "      <td>0.000621</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.057506</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.006731</td>\n",
       "      <td>2.107052</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000764</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.001146</td>\n",
       "      <td>0.000621</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 5335 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          AAACGAAAGCGCAATG-1  AAACGAAAGGGTATCG-1  AAACGAAAGTAACATG-1  \\\n",
       "A1BG                0.086201            1.775694            3.591624   \n",
       "A1BG-AS1            0.458201            0.478351            1.365454   \n",
       "A2LD1               0.000363            0.000000            0.000000   \n",
       "A2M                 0.000000            0.054960            0.000665   \n",
       "A2M-AS1             0.000000            0.051265            0.000621   \n",
       "\n",
       "          AAACGAAAGTTACACC-1  AAACGAACAGAGATGC-1  AAACGAACATGCTATG-1  \\\n",
       "A1BG                0.212587            0.040847            0.004313   \n",
       "A1BG-AS1            0.930858            0.018763            0.023040   \n",
       "A2LD1               0.000000            0.000000            0.000000   \n",
       "A2M                 0.000000            2.205809            0.000000   \n",
       "A2M-AS1             0.000000            2.057506            0.000000   \n",
       "\n",
       "          AAACGAAGTGCATCAT-1  AAACGAAGTGGACGAT-1  AAACGAAGTGGCCTCA-1  \\\n",
       "A1BG                0.002157            0.176512            0.000518   \n",
       "A1BG-AS1            0.011713            0.943315            0.000097   \n",
       "A2LD1               0.000000            0.000000            0.000000   \n",
       "A2M                 0.000000            0.000000            0.000000   \n",
       "A2M-AS1             0.000000            0.000000            0.000000   \n",
       "\n",
       "          AAACGAATCAGTGTAC-1  ...  TTTGGTTGTCAGAAAT-1  TTTGGTTGTTGTATCG-1  \\\n",
       "A1BG                0.007131  ...            0.041391            3.588490   \n",
       "A1BG-AS1            0.038674  ...            0.011158            1.345947   \n",
       "A2LD1               0.000000  ...            0.000000            0.000000   \n",
       "A2M                 0.000000  ...            0.000000            0.007216   \n",
       "A2M-AS1             0.000000  ...            0.000000            0.006731   \n",
       "\n",
       "          TTTGGTTTCAGTGGTT-1  TTTGTGTAGGAAACTT-1  TTTGTGTCAAGCCTTA-1  \\\n",
       "A1BG                1.822400            0.045678            5.371366   \n",
       "A1BG-AS1            0.922146            0.042287            2.255818   \n",
       "A2LD1               0.000000            0.000000            0.000000   \n",
       "A2M                 2.258926            0.000000            0.000819   \n",
       "A2M-AS1             2.107052            0.000000            0.000764   \n",
       "\n",
       "          TTTGTGTCACTCAAGT-1  TTTGTGTCACTGGGCT-1  TTTGTGTGTACGCAAG-1  \\\n",
       "A1BG                0.002675            2.454377            1.984945   \n",
       "A1BG-AS1            0.012003            3.702715            1.393866   \n",
       "A2LD1               0.000000            0.000000            0.000000   \n",
       "A2M                 0.000000            0.001228            0.000665   \n",
       "A2M-AS1             0.000000            0.001146            0.000621   \n",
       "\n",
       "          TTTGTGTGTCTGCGCA-1  TTTGTGTTCAACTTGG-1  \n",
       "A1BG                1.732547            1.730391  \n",
       "A1BG-AS1            0.444438            0.432918  \n",
       "A2LD1               0.000000            0.000000  \n",
       "A2M                 0.000000            0.000000  \n",
       "A2M-AS1             0.000000            0.000000  \n",
       "\n",
       "[5 rows x 5335 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gene_scores.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "#https://www.tau.ac.il/~elieis/Housekeeping_genes.html List of Housekeeping genes\n",
    "housekeeping_genes=['ACTB','ALDOA','GAPDH','PGK1','LDHA','RPS27A','RPL19','RPL11','NONO','ARHGDIA','RPL32','RPS18','HSPCB',\n",
    "    'C1orf43','CHMP2A','EMC7','GPI','PSMB2,''PSMB4','RAB7A','REEP5','SNRPD3','VCP','VPS29']\n",
    "\n",
    "#List of Marker Genes\n",
    "marker_genes=['CD209', 'ENG', 'FOXP3', 'CD34', 'BATF3', 'S100A12', 'THBD','CD3D', 'THY1', 'CD8A', 'CD8B', 'CD14', 'PROM1', 'IL2RA', 'FCGR3A',\n",
    "'IL3RA', 'FCGR1A', 'CD19', 'IL7R', 'CD79A', 'MS4A1', 'NCAM1','CD3E', 'CD3G', 'KIT', 'CD1C', 'CD68', 'CD4']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_clusters = './output/clusters/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ChromVAR_motifs\n",
      "ChromVAR_kmers\n",
      "cisTopic\n",
      "SnapATAC\n",
      "BROCKMAN\n",
      "SCRAT_motifs\n",
      "Cusanovich2018\n",
      "Control\n",
      "GeneScoring\n",
      "Scasat\n",
      "scABC\n",
      "Cicero\n",
      "ChromVAR_kmers_pca\n",
      "ChromVAR_motifs_pca\n",
      "GeneScoring_pca\n",
      "Cicero_pca\n",
      "SCRAT_pca\n"
     ]
    }
   ],
   "source": [
    "df_metrics = residual_average_gini_index(gene_scores,folder_clusters,\n",
    "                                housekeeping_genes,marker_genes,\n",
    "                                min_cells_per_cluster=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_metrics.to_csv(path_metrics+'clustering_RAGI_scores.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Method</th>\n",
       "      <th>Clustering</th>\n",
       "      <th>Gini_Marker_Genes</th>\n",
       "      <th>Gini_Housekeeping_Genes</th>\n",
       "      <th>Difference</th>\n",
       "      <th>KS_statistics</th>\n",
       "      <th>p-value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ChromVAR_motifs</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.329794</td>\n",
       "      <td>0.202917</td>\n",
       "      <td>0.126878</td>\n",
       "      <td>0.535714</td>\n",
       "      <td>0.001101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ChromVAR_motifs</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.343745</td>\n",
       "      <td>0.237399</td>\n",
       "      <td>0.106345</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ChromVAR_motifs</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.381168</td>\n",
       "      <td>0.290041</td>\n",
       "      <td>0.091127</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.002897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ChromVAR_kmers</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.309164</td>\n",
       "      <td>0.178850</td>\n",
       "      <td>0.130315</td>\n",
       "      <td>0.535714</td>\n",
       "      <td>0.001101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ChromVAR_kmers</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.340088</td>\n",
       "      <td>0.263170</td>\n",
       "      <td>0.076918</td>\n",
       "      <td>0.488095</td>\n",
       "      <td>0.003940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>ChromVAR_kmers</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.319202</td>\n",
       "      <td>0.241089</td>\n",
       "      <td>0.078113</td>\n",
       "      <td>0.523810</td>\n",
       "      <td>0.001532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>cisTopic</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.369119</td>\n",
       "      <td>0.254688</td>\n",
       "      <td>0.114431</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>cisTopic</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.417319</td>\n",
       "      <td>0.316060</td>\n",
       "      <td>0.101259</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.002897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>cisTopic</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.468534</td>\n",
       "      <td>0.362493</td>\n",
       "      <td>0.106041</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.002897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>SnapATAC</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.332478</td>\n",
       "      <td>0.195798</td>\n",
       "      <td>0.136680</td>\n",
       "      <td>0.452381</td>\n",
       "      <td>0.009484</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>SnapATAC</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.361506</td>\n",
       "      <td>0.217868</td>\n",
       "      <td>0.143637</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>SnapATAC</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.371438</td>\n",
       "      <td>0.231429</td>\n",
       "      <td>0.140009</td>\n",
       "      <td>0.523810</td>\n",
       "      <td>0.001532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>BROCKMAN</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.346999</td>\n",
       "      <td>0.234125</td>\n",
       "      <td>0.112874</td>\n",
       "      <td>0.476190</td>\n",
       "      <td>0.005320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>BROCKMAN</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.327789</td>\n",
       "      <td>0.262820</td>\n",
       "      <td>0.064969</td>\n",
       "      <td>0.392857</td>\n",
       "      <td>0.035346</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>BROCKMAN</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.324061</td>\n",
       "      <td>0.255878</td>\n",
       "      <td>0.068183</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>SCRAT_motifs</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.347878</td>\n",
       "      <td>0.293645</td>\n",
       "      <td>0.054233</td>\n",
       "      <td>0.476190</td>\n",
       "      <td>0.005320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>SCRAT_motifs</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.287355</td>\n",
       "      <td>0.234507</td>\n",
       "      <td>0.052848</td>\n",
       "      <td>0.392857</td>\n",
       "      <td>0.035346</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>SCRAT_motifs</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.315229</td>\n",
       "      <td>0.263967</td>\n",
       "      <td>0.051262</td>\n",
       "      <td>0.416667</td>\n",
       "      <td>0.021353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Cusanovich2018</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.388897</td>\n",
       "      <td>0.280957</td>\n",
       "      <td>0.107940</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Cusanovich2018</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.254027</td>\n",
       "      <td>0.177305</td>\n",
       "      <td>0.076722</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Cusanovich2018</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.225468</td>\n",
       "      <td>0.141125</td>\n",
       "      <td>0.084343</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Control</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.443971</td>\n",
       "      <td>0.358864</td>\n",
       "      <td>0.085107</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Control</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.456480</td>\n",
       "      <td>0.453863</td>\n",
       "      <td>0.002617</td>\n",
       "      <td>0.273810</td>\n",
       "      <td>0.280812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>Control</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.471765</td>\n",
       "      <td>0.467423</td>\n",
       "      <td>0.004342</td>\n",
       "      <td>0.238095</td>\n",
       "      <td>0.448886</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>GeneScoring</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.460154</td>\n",
       "      <td>0.389402</td>\n",
       "      <td>0.070752</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.016413</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>GeneScoring</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.450160</td>\n",
       "      <td>0.471894</td>\n",
       "      <td>-0.021734</td>\n",
       "      <td>0.297619</td>\n",
       "      <td>0.197109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>GeneScoring</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.503348</td>\n",
       "      <td>0.501473</td>\n",
       "      <td>0.001875</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.236193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>Scasat</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.472445</td>\n",
       "      <td>0.374599</td>\n",
       "      <td>0.097846</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.002897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>Scasat</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.497129</td>\n",
       "      <td>0.421512</td>\n",
       "      <td>0.075617</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.016413</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>Scasat</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.477655</td>\n",
       "      <td>0.399098</td>\n",
       "      <td>0.078557</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>scABC</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.461698</td>\n",
       "      <td>0.441462</td>\n",
       "      <td>0.020237</td>\n",
       "      <td>0.297619</td>\n",
       "      <td>0.197109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>scABC</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.458673</td>\n",
       "      <td>0.454737</td>\n",
       "      <td>0.003937</td>\n",
       "      <td>0.238095</td>\n",
       "      <td>0.448886</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>scABC</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.425721</td>\n",
       "      <td>0.396330</td>\n",
       "      <td>0.029391</td>\n",
       "      <td>0.321429</td>\n",
       "      <td>0.134155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>Cicero</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.354993</td>\n",
       "      <td>0.249562</td>\n",
       "      <td>0.105431</td>\n",
       "      <td>0.547619</td>\n",
       "      <td>0.000786</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>Cicero</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>Cicero</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.264151</td>\n",
       "      <td>0.203767</td>\n",
       "      <td>0.060383</td>\n",
       "      <td>0.357143</td>\n",
       "      <td>0.071204</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>ChromVAR_kmers_pca</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.352224</td>\n",
       "      <td>0.232139</td>\n",
       "      <td>0.120086</td>\n",
       "      <td>0.511905</td>\n",
       "      <td>0.002114</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>ChromVAR_kmers_pca</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.287473</td>\n",
       "      <td>0.227681</td>\n",
       "      <td>0.059792</td>\n",
       "      <td>0.488095</td>\n",
       "      <td>0.003940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>ChromVAR_kmers_pca</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.322012</td>\n",
       "      <td>0.246821</td>\n",
       "      <td>0.075191</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>ChromVAR_motifs_pca</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.333697</td>\n",
       "      <td>0.205199</td>\n",
       "      <td>0.128498</td>\n",
       "      <td>0.535714</td>\n",
       "      <td>0.001101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>ChromVAR_motifs_pca</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.396433</td>\n",
       "      <td>0.301102</td>\n",
       "      <td>0.095331</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.016413</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>ChromVAR_motifs_pca</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.397484</td>\n",
       "      <td>0.301730</td>\n",
       "      <td>0.095755</td>\n",
       "      <td>0.440476</td>\n",
       "      <td>0.012522</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>GeneScoring_pca</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.517697</td>\n",
       "      <td>0.503202</td>\n",
       "      <td>0.014495</td>\n",
       "      <td>0.261905</td>\n",
       "      <td>0.331171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>GeneScoring_pca</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.488294</td>\n",
       "      <td>0.499929</td>\n",
       "      <td>-0.011635</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.109433</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>GeneScoring_pca</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.469333</td>\n",
       "      <td>0.473566</td>\n",
       "      <td>-0.004233</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.236193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>Cicero_pca</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.470227</td>\n",
       "      <td>0.356027</td>\n",
       "      <td>0.114200</td>\n",
       "      <td>0.488095</td>\n",
       "      <td>0.003940</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>Cicero_pca</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.262221</td>\n",
       "      <td>0.197641</td>\n",
       "      <td>0.064580</td>\n",
       "      <td>0.357143</td>\n",
       "      <td>0.071204</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>Cicero_pca</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.250496</td>\n",
       "      <td>0.172051</td>\n",
       "      <td>0.078446</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>SCRAT_pca</td>\n",
       "      <td>louvain</td>\n",
       "      <td>0.392071</td>\n",
       "      <td>0.320236</td>\n",
       "      <td>0.071835</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.007129</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>SCRAT_pca</td>\n",
       "      <td>kmeans</td>\n",
       "      <td>0.287531</td>\n",
       "      <td>0.234164</td>\n",
       "      <td>0.053367</td>\n",
       "      <td>0.392857</td>\n",
       "      <td>0.035346</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>SCRAT_pca</td>\n",
       "      <td>hc</td>\n",
       "      <td>0.278123</td>\n",
       "      <td>0.232857</td>\n",
       "      <td>0.045266</td>\n",
       "      <td>0.404762</td>\n",
       "      <td>0.027574</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 Method Clustering  Gini_Marker_Genes  \\\n",
       "0       ChromVAR_motifs    louvain           0.329794   \n",
       "1       ChromVAR_motifs     kmeans           0.343745   \n",
       "2       ChromVAR_motifs         hc           0.381168   \n",
       "3        ChromVAR_kmers    louvain           0.309164   \n",
       "4        ChromVAR_kmers     kmeans           0.340088   \n",
       "5        ChromVAR_kmers         hc           0.319202   \n",
       "6              cisTopic    louvain           0.369119   \n",
       "7              cisTopic     kmeans           0.417319   \n",
       "8              cisTopic         hc           0.468534   \n",
       "9              SnapATAC    louvain           0.332478   \n",
       "10             SnapATAC     kmeans           0.361506   \n",
       "11             SnapATAC         hc           0.371438   \n",
       "12             BROCKMAN    louvain           0.346999   \n",
       "13             BROCKMAN     kmeans           0.327789   \n",
       "14             BROCKMAN         hc           0.324061   \n",
       "15         SCRAT_motifs    louvain           0.347878   \n",
       "16         SCRAT_motifs     kmeans           0.287355   \n",
       "17         SCRAT_motifs         hc           0.315229   \n",
       "18       Cusanovich2018    louvain           0.388897   \n",
       "19       Cusanovich2018     kmeans           0.254027   \n",
       "20       Cusanovich2018         hc           0.225468   \n",
       "21              Control    louvain           0.443971   \n",
       "22              Control     kmeans           0.456480   \n",
       "23              Control         hc           0.471765   \n",
       "24          GeneScoring    louvain           0.460154   \n",
       "25          GeneScoring     kmeans           0.450160   \n",
       "26          GeneScoring         hc           0.503348   \n",
       "27               Scasat    louvain           0.472445   \n",
       "28               Scasat     kmeans           0.497129   \n",
       "29               Scasat         hc           0.477655   \n",
       "30                scABC    louvain           0.461698   \n",
       "31                scABC     kmeans           0.458673   \n",
       "32                scABC         hc           0.425721   \n",
       "33               Cicero    louvain           0.354993   \n",
       "34               Cicero     kmeans           0.000000   \n",
       "35               Cicero         hc           0.264151   \n",
       "36   ChromVAR_kmers_pca    louvain           0.352224   \n",
       "37   ChromVAR_kmers_pca     kmeans           0.287473   \n",
       "38   ChromVAR_kmers_pca         hc           0.322012   \n",
       "39  ChromVAR_motifs_pca    louvain           0.333697   \n",
       "40  ChromVAR_motifs_pca     kmeans           0.396433   \n",
       "41  ChromVAR_motifs_pca         hc           0.397484   \n",
       "42      GeneScoring_pca    louvain           0.517697   \n",
       "43      GeneScoring_pca     kmeans           0.488294   \n",
       "44      GeneScoring_pca         hc           0.469333   \n",
       "45           Cicero_pca    louvain           0.470227   \n",
       "46           Cicero_pca     kmeans           0.262221   \n",
       "47           Cicero_pca         hc           0.250496   \n",
       "48            SCRAT_pca    louvain           0.392071   \n",
       "49            SCRAT_pca     kmeans           0.287531   \n",
       "50            SCRAT_pca         hc           0.278123   \n",
       "\n",
       "    Gini_Housekeeping_Genes  Difference  KS_statistics   p-value  \n",
       "0                  0.202917    0.126878       0.535714  0.001101  \n",
       "1                  0.237399    0.106345       0.464286  0.007129  \n",
       "2                  0.290041    0.091127       0.500000  0.002897  \n",
       "3                  0.178850    0.130315       0.535714  0.001101  \n",
       "4                  0.263170    0.076918       0.488095  0.003940  \n",
       "5                  0.241089    0.078113       0.523810  0.001532  \n",
       "6                  0.254688    0.114431       0.571429  0.000392  \n",
       "7                  0.316060    0.101259       0.500000  0.002897  \n",
       "8                  0.362493    0.106041       0.500000  0.002897  \n",
       "9                  0.195798    0.136680       0.452381  0.009484  \n",
       "10                 0.217868    0.143637       0.571429  0.000392  \n",
       "11                 0.231429    0.140009       0.523810  0.001532  \n",
       "12                 0.234125    0.112874       0.476190  0.005320  \n",
       "13                 0.262820    0.064969       0.392857  0.035346  \n",
       "14                 0.255878    0.068183       0.464286  0.007129  \n",
       "15                 0.293645    0.054233       0.476190  0.005320  \n",
       "16                 0.234507    0.052848       0.392857  0.035346  \n",
       "17                 0.263967    0.051262       0.416667  0.021353  \n",
       "18                 0.280957    0.107940       0.464286  0.007129  \n",
       "19                 0.177305    0.076722       0.464286  0.007129  \n",
       "20                 0.141125    0.084343       0.464286  0.007129  \n",
       "21                 0.358864    0.085107       0.464286  0.007129  \n",
       "22                 0.453863    0.002617       0.273810  0.280812  \n",
       "23                 0.467423    0.004342       0.238095  0.448886  \n",
       "24                 0.389402    0.070752       0.428571  0.016413  \n",
       "25                 0.471894   -0.021734       0.297619  0.197109  \n",
       "26                 0.501473    0.001875       0.285714  0.236193  \n",
       "27                 0.374599    0.097846       0.500000  0.002897  \n",
       "28                 0.421512    0.075617       0.428571  0.016413  \n",
       "29                 0.399098    0.078557       0.464286  0.007129  \n",
       "30                 0.441462    0.020237       0.297619  0.197109  \n",
       "31                 0.454737    0.003937       0.238095  0.448886  \n",
       "32                 0.396330    0.029391       0.321429  0.134155  \n",
       "33                 0.249562    0.105431       0.547619  0.000786  \n",
       "34                 0.000000    0.000000       0.000000  1.000000  \n",
       "35                 0.203767    0.060383       0.357143  0.071204  \n",
       "36                 0.232139    0.120086       0.511905  0.002114  \n",
       "37                 0.227681    0.059792       0.488095  0.003940  \n",
       "38                 0.246821    0.075191       0.464286  0.007129  \n",
       "39                 0.205199    0.128498       0.535714  0.001101  \n",
       "40                 0.301102    0.095331       0.428571  0.016413  \n",
       "41                 0.301730    0.095755       0.440476  0.012522  \n",
       "42                 0.503202    0.014495       0.261905  0.331171  \n",
       "43                 0.499929   -0.011635       0.333333  0.109433  \n",
       "44                 0.473566   -0.004233       0.285714  0.236193  \n",
       "45                 0.356027    0.114200       0.488095  0.003940  \n",
       "46                 0.197641    0.064580       0.357143  0.071204  \n",
       "47                 0.172051    0.078446       0.464286  0.007129  \n",
       "48                 0.320236    0.071835       0.464286  0.007129  \n",
       "49                 0.234164    0.053367       0.392857  0.035346  \n",
       "50                 0.232857    0.045266       0.404762  0.027574  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:ATACseq_clustering]",
   "language": "python",
   "name": "conda-env-ATACseq_clustering-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}