{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import scanpy as sc\n", "import os\n", "from sklearn.cluster import KMeans\n", "from sklearn.cluster import AgglomerativeClustering\n", "from sklearn.metrics.cluster import adjusted_rand_score\n", "from sklearn.metrics.cluster import adjusted_mutual_info_score\n", "from sklearn.metrics.cluster import homogeneity_score\n", "import rpy2.robjects as robjects\n", "from rpy2.robjects import pandas2ri" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10\n" ] } ], "source": [ "metadata = pd.read_csv('../input/metadata.tsv',sep='\\t',index_col=0)\n", "num_clusters = len(np.unique(metadata['label']))\n", "print(num_clusters)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df_metrics = pd.DataFrame(columns=['ARI','AMI','Homogeneity'])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "./SnapATAC/clusteringSolution.tsv\n", "./Cusanovich2018/clusteringSolution.tsv\n", "./scABC/clusteringSolution.tsv\n", "./cisTopic/clusteringSolution.tsv\n", "./Scasat/clusteringSolution.tsv\n", "./Cicero/clusteringSolution.tsv\n" ] } ], "source": [ "# Gather the cluster assignments from every clustering*.tsv found under ./ into one\n", "# table that starts from the metadata index.\n", "df_clusters = pd.DataFrame(index=metadata.index)\n", "for dirpath, dirnames, filenames in os.walk(\"./\"):\n", "    for filename in filenames:\n", "        if not (filename.startswith(\"clustering\") and filename.endswith(\".tsv\")):\n", "            continue\n", "        solution_path = os.path.join(dirpath, filename)\n", "        print(solution_path)\n", "        df = pd.read_csv(solution_path, sep='\\t', index_col=0)\n", "        # Inner join on the index: rows absent from this solution file are dropped.\n", "        df_clusters = pd.merge(df_clusters, df, left_index=True, right_index=True)\n", "        # Make any change in row count visible instead of letting it pass silently.\n", "        if len(df_clusters) != len(metadata):\n", "            print('WARNING: %d of %d metadata rows remain after merging %s' % (len(df_clusters), len(metadata), solution_path))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SnapATACcusanovich2018scABCcisTopicScasatCicero
BM1077-CLP-Frozen-160106-13466114
BM1077-CLP-Frozen-160106-14466119
BM1077-CLP-Frozen-160106-2466113
BM1077-CLP-Frozen-160106-21466114
BM1077-CLP-Frozen-160106-27169214
\n", "
" ], "text/plain": [ " SnapATAC cusanovich2018 scABC cisTopic \\\n", "BM1077-CLP-Frozen-160106-13 4 6 6 1 \n", "BM1077-CLP-Frozen-160106-14 4 6 6 1 \n", "BM1077-CLP-Frozen-160106-2 4 6 6 1 \n", "BM1077-CLP-Frozen-160106-21 4 6 6 1 \n", "BM1077-CLP-Frozen-160106-27 1 6 9 2 \n", "\n", " Scasat Cicero \n", "BM1077-CLP-Frozen-160106-13 1 4 \n", "BM1077-CLP-Frozen-160106-14 1 9 \n", "BM1077-CLP-Frozen-160106-2 1 3 \n", "BM1077-CLP-Frozen-160106-21 1 4 \n", "BM1077-CLP-Frozen-160106-27 1 4 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_clusters.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SnapATAC\n", "cusanovich2018\n", "scABC\n", "cisTopic\n", "Scasat\n", "Cicero\n" ] } ], "source": [ "# Score each method's clustering against the reference labels; one df_metrics row per method.\n", "# The sklearn scorers compare the two label vectors position by position, so pick the\n", "# reference labels in exactly the row order of df_clusters before scoring.\n", "labels_true = metadata.loc[df_clusters.index, 'label']\n", "for method in df_clusters.columns:\n", "    print(method)\n", "    labels_pred = df_clusters[method]\n", "\n", "    # adjusted Rand index\n", "    ari = adjusted_rand_score(labels_true, labels_pred)\n", "\n", "    # adjusted mutual information\n", "    ami = adjusted_mutual_info_score(labels_true, labels_pred, average_method='arithmetic')\n", "\n", "    # homogeneity\n", "    homo = homogeneity_score(labels_true, labels_pred)\n", "\n", "    df_metrics.loc[method,'ARI'] = ari\n", "    df_metrics.loc[method,'AMI'] = ami\n", "    df_metrics.loc[method,'Homogeneity'] = homo" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ARIAMIHomogeneity
SnapATAC0.3239420.5870340.559376
cusanovich20180.483620.6623290.68703
scABC0.2702140.4648730.446248
cisTopic0.517010.6612360.682697
Scasat0.1115760.3248150.328444
Cicero0.222720.3497260.352056
\n", "
" ], "text/plain": [ " ARI AMI Homogeneity\n", "SnapATAC 0.323942 0.587034 0.559376\n", "cusanovich2018 0.48362 0.662329 0.68703\n", "scABC 0.270214 0.464873 0.446248\n", "cisTopic 0.51701 0.661236 0.682697\n", "Scasat 0.111576 0.324815 0.328444\n", "Cicero 0.22272 0.349726 0.352056" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Display the per-method score table (ARI, AMI, Homogeneity) filled in by the loop above.\n", "df_metrics" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Write the score table to clustering_scores.csv in the current working directory.\n", "df_metrics.to_csv('./clustering_scores.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:ATACseq_clustering]", "language": "python", "name": "conda-env-ATACseq_clustering-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }