{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scanpy as sc\n",
    "import os\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.cluster import AgglomerativeClustering\n",
    "from sklearn.metrics.cluster import adjusted_rand_score\n",
    "from sklearn.metrics.cluster import adjusted_mutual_info_score\n",
    "from sklearn.metrics.cluster import homogeneity_score\n",
    "import rpy2.robjects as robjects\n",
    "from rpy2.robjects import pandas2ri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\n"
     ]
    }
   ],
   "source": [
    "# Load the per-cell annotation table; its first column becomes the index.\n",
    "metadata = pd.read_csv('../input/metadata.tsv', sep='\\t', index_col=0)\n",
    "\n",
    "# Count the distinct values of the 'label' column and show the count.\n",
    "num_clusters = np.unique(metadata['label']).size\n",
    "print(num_clusters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Empty score table with one column per clustering metric.\n",
    "df_metrics = pd.DataFrame(columns=['ARI','AMI','Homogeneity'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./SnapATAC/clusteringSolution.tsv\n",
      "./Cusanovich2018/clusteringSolution.tsv\n",
      "./scABC/clusteringSolution.tsv\n",
      "./cisTopic/clusteringSolution.tsv\n",
      "./Scasat/clusteringSolution.tsv\n",
      "./Cicero/clusteringSolution.tsv\n"
     ]
    }
   ],
   "source": [
    "# Gather every method's clustering solution into one table indexed like metadata.\n",
    "df_clusters = pd.DataFrame(index=metadata.index)\n",
    "for dirpath, dirnames, filenames in os.walk(\"./\"):\n",
    "    # os.walk yields entries in arbitrary filesystem order; sorting dirnames in\n",
    "    # place (and the filenames below) makes the column order reproducible.\n",
    "    dirnames.sort()\n",
    "    for filename in sorted(filenames):\n",
    "        if not (filename.startswith(\"clustering\") and filename.endswith(\".tsv\")):\n",
    "            continue\n",
    "        solution_path = os.path.join(dirpath, filename)\n",
    "        print(solution_path)\n",
    "        df = pd.read_csv(solution_path, sep='\\t', index_col=0)\n",
    "        n_before = len(df_clusters)\n",
    "        # Inner join on the index: rows without a match in this solution are dropped.\n",
    "        df_clusters = pd.merge(df_clusters, df, left_index=True, right_index=True)\n",
    "        if len(df_clusters) != n_before:\n",
    "            # Report the change instead of letting rows disappear silently.\n",
    "            print('WARNING: merging %s changed the number of rows from %d to %d'\n",
    "                  % (solution_path, n_before, len(df_clusters)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SnapATAC</th>\n",
       "      <th>cusanovich2018</th>\n",
       "      <th>scABC</th>\n",
       "      <th>cisTopic</th>\n",
       "      <th>Scasat</th>\n",
       "      <th>Cicero</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>BM1077-CLP-Frozen-160106-13</th>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BM1077-CLP-Frozen-160106-14</th>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BM1077-CLP-Frozen-160106-2</th>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BM1077-CLP-Frozen-160106-21</th>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BM1077-CLP-Frozen-160106-27</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             SnapATAC  cusanovich2018  scABC  cisTopic  \\\n",
       "BM1077-CLP-Frozen-160106-13         4               6      6         1   \n",
       "BM1077-CLP-Frozen-160106-14         4               6      6         1   \n",
       "BM1077-CLP-Frozen-160106-2          4               6      6         1   \n",
       "BM1077-CLP-Frozen-160106-21         4               6      6         1   \n",
       "BM1077-CLP-Frozen-160106-27         1               6      9         2   \n",
       "\n",
       "                             Scasat  Cicero  \n",
       "BM1077-CLP-Frozen-160106-13       1       4  \n",
       "BM1077-CLP-Frozen-160106-14       1       9  \n",
       "BM1077-CLP-Frozen-160106-2        1       3  \n",
       "BM1077-CLP-Frozen-160106-21       1       4  \n",
       "BM1077-CLP-Frozen-160106-27       1       4  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the merged cluster-assignment table.\n",
    "df_clusters.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SnapATAC\n",
      "cusanovich2018\n",
      "scABC\n",
      "cisTopic\n",
      "Scasat\n",
      "Cicero\n"
     ]
    }
   ],
   "source": [
    "# Score each method's cluster assignments against the reference labels.\n",
    "# The merge that built df_clusters is an inner join, so it may hold fewer rows\n",
    "# than metadata; align the labels to df_clusters' rows so both vectors passed\n",
    "# to the metrics describe the same rows in the same order.\n",
    "if df_clusters.index.equals(metadata.index):\n",
    "    labels_true = metadata['label']\n",
    "else:\n",
    "    labels_true = metadata.loc[df_clusters.index, 'label']\n",
    "\n",
    "for method in df_clusters.columns:\n",
    "    print(method)\n",
    "    labels_pred = df_clusters[method]\n",
    "\n",
    "    # adjusted Rand index\n",
    "    ari = adjusted_rand_score(labels_true, labels_pred)\n",
    "\n",
    "    # adjusted mutual information\n",
    "    ami = adjusted_mutual_info_score(labels_true, labels_pred, average_method='arithmetic')\n",
    "\n",
    "    # homogeneity\n",
    "    homo = homogeneity_score(labels_true, labels_pred)\n",
    "\n",
    "    # One row per method; re-running the cell overwrites the same rows.\n",
    "    df_metrics.loc[method,'ARI'] = ari\n",
    "    df_metrics.loc[method,'AMI'] = ami\n",
    "    df_metrics.loc[method,'Homogeneity'] = homo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ARI</th>\n",
       "      <th>AMI</th>\n",
       "      <th>Homogeneity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>SnapATAC</th>\n",
       "      <td>0.323942</td>\n",
       "      <td>0.587034</td>\n",
       "      <td>0.559376</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cusanovich2018</th>\n",
       "      <td>0.48362</td>\n",
       "      <td>0.662329</td>\n",
       "      <td>0.68703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scABC</th>\n",
       "      <td>0.270214</td>\n",
       "      <td>0.464873</td>\n",
       "      <td>0.446248</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cisTopic</th>\n",
       "      <td>0.51701</td>\n",
       "      <td>0.661236</td>\n",
       "      <td>0.682697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Scasat</th>\n",
       "      <td>0.111576</td>\n",
       "      <td>0.324815</td>\n",
       "      <td>0.328444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cicero</th>\n",
       "      <td>0.22272</td>\n",
       "      <td>0.349726</td>\n",
       "      <td>0.352056</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     ARI       AMI Homogeneity\n",
       "SnapATAC        0.323942  0.587034    0.559376\n",
       "cusanovich2018   0.48362  0.662329     0.68703\n",
       "scABC           0.270214  0.464873    0.446248\n",
       "cisTopic         0.51701  0.661236    0.682697\n",
       "Scasat          0.111576  0.324815    0.328444\n",
       "Cicero           0.22272  0.349726    0.352056"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the score table (rows: methods, columns: ARI / AMI / Homogeneity).\n",
    "df_metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the score table to a CSV file in the current directory.\n",
    "df_metrics.to_csv(path_or_buf='./clustering_scores.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:ATACseq_clustering]",
   "language": "python",
   "name": "conda-env-ATACseq_clustering-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}