{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import scanpy as sc\n",
"import os\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.cluster import AgglomerativeClustering\n",
"from sklearn.metrics.cluster import adjusted_rand_score\n",
"from sklearn.metrics.cluster import adjusted_mutual_info_score\n",
"from sklearn.metrics.cluster import homogeneity_score\n",
"import rpy2.robjects as robjects\n",
"from rpy2.robjects import pandas2ri"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10\n"
]
}
],
"source": [
"# Load the ground-truth annotation table; its first column becomes the index.\n",
"metadata = pd.read_csv(\n",
"    '../input/metadata.tsv',\n",
"    sep='\\t',\n",
"    index_col=0,\n",
")\n",
"\n",
"# Count the distinct values found in the 'label' column and show the result.\n",
"num_clusters = np.unique(metadata['label']).size\n",
"print(num_clusters)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Empty score table with one column per evaluation metric; rows are filled in\n",
"# per clustering method further down the notebook.\n",
"df_metrics = pd.DataFrame(\n",
"    columns=['ARI', 'AMI', 'Homogeneity'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./SnapATAC/clusteringSolution.tsv\n",
"./Cusanovich2018/clusteringSolution.tsv\n",
"./scABC/clusteringSolution.tsv\n",
"./cisTopic/clusteringSolution.tsv\n",
"./Scasat/clusteringSolution.tsv\n",
"./Cicero/clusteringSolution.tsv\n"
]
}
],
"source": [
"# Gather every clustering solution found below the current directory into a\n",
"# single table that shares the index of the metadata table.\n",
"df_clusters = pd.DataFrame(index=metadata.index)\n",
"for dirpath, dirnames, filenames in os.walk(\"./\"):\n",
"    # os.walk yields entries in arbitrary filesystem order; sorting dirnames in\n",
"    # place (and the file names below) makes the traversal, and therefore the\n",
"    # column order of df_clusters, reproducible across machines.\n",
"    dirnames.sort()\n",
"    for filename in sorted(filenames):\n",
"        if not (filename.startswith(\"clustering\") and filename.endswith(\".tsv\")):\n",
"            continue\n",
"        solution_path = os.path.join(dirpath, filename)\n",
"        print(solution_path)\n",
"        df = pd.read_csv(solution_path, sep='\\t', index_col=0)\n",
"\n",
"        n_before = len(df_clusters)\n",
"        # Inner join on the index: only rows present in every solution survive.\n",
"        df_clusters = pd.merge(df_clusters, df, how='inner', left_index=True, right_index=True)\n",
"        n_dropped = n_before - len(df_clusters)\n",
"        if n_dropped > 0:\n",
"            # Make the otherwise silent row loss of the inner join visible.\n",
"            print('WARNING: %d rows dropped because they are missing from %s' % (n_dropped, solution_path))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SnapATAC | \n",
" cusanovich2018 | \n",
" scABC | \n",
" cisTopic | \n",
" Scasat | \n",
" Cicero | \n",
"
\n",
" \n",
" \n",
" \n",
" BM1077-CLP-Frozen-160106-13 | \n",
" 4 | \n",
" 6 | \n",
" 6 | \n",
" 1 | \n",
" 1 | \n",
" 4 | \n",
"
\n",
" \n",
" BM1077-CLP-Frozen-160106-14 | \n",
" 4 | \n",
" 6 | \n",
" 6 | \n",
" 1 | \n",
" 1 | \n",
" 9 | \n",
"
\n",
" \n",
" BM1077-CLP-Frozen-160106-2 | \n",
" 4 | \n",
" 6 | \n",
" 6 | \n",
" 1 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" BM1077-CLP-Frozen-160106-21 | \n",
" 4 | \n",
" 6 | \n",
" 6 | \n",
" 1 | \n",
" 1 | \n",
" 4 | \n",
"
\n",
" \n",
" BM1077-CLP-Frozen-160106-27 | \n",
" 1 | \n",
" 6 | \n",
" 9 | \n",
" 2 | \n",
" 1 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SnapATAC cusanovich2018 scABC cisTopic \\\n",
"BM1077-CLP-Frozen-160106-13 4 6 6 1 \n",
"BM1077-CLP-Frozen-160106-14 4 6 6 1 \n",
"BM1077-CLP-Frozen-160106-2 4 6 6 1 \n",
"BM1077-CLP-Frozen-160106-21 4 6 6 1 \n",
"BM1077-CLP-Frozen-160106-27 1 6 9 2 \n",
"\n",
" Scasat Cicero \n",
"BM1077-CLP-Frozen-160106-13 1 4 \n",
"BM1077-CLP-Frozen-160106-14 1 9 \n",
"BM1077-CLP-Frozen-160106-2 1 3 \n",
"BM1077-CLP-Frozen-160106-21 1 4 \n",
"BM1077-CLP-Frozen-160106-27 1 4 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the merged cluster assignments.\n",
"df_clusters.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SnapATAC\n",
"cusanovich2018\n",
"scABC\n",
"cisTopic\n",
"Scasat\n",
"Cicero\n"
]
}
],
"source": [
"# Score each method's cluster assignments against the ground-truth labels.\n",
"# The labels are looked up by df_clusters' index so that both vectors cover the\n",
"# same rows in the same order, instead of relying on metadata and df_clusters\n",
"# happening to share row order and length.\n",
"labels_true = metadata.loc[df_clusters.index, 'label']\n",
"\n",
"for method in df_clusters.columns:\n",
"    print(method)\n",
"    labels_pred = df_clusters[method]\n",
"\n",
"    # adjusted Rand index\n",
"    ari = adjusted_rand_score(labels_true, labels_pred)\n",
"\n",
"    # adjusted mutual information\n",
"    ami = adjusted_mutual_info_score(labels_true, labels_pred, average_method='arithmetic')\n",
"\n",
"    # homogeneity\n",
"    homo = homogeneity_score(labels_true, labels_pred)\n",
"\n",
"    df_metrics.loc[method, 'ARI'] = ari\n",
"    df_metrics.loc[method, 'AMI'] = ami\n",
"    df_metrics.loc[method, 'Homogeneity'] = homo\n",
"\n",
"# df_metrics was created empty without a dtype, so values written cell by cell\n",
"# sit in object columns; cast so the scores are stored as real floats.\n",
"df_metrics = df_metrics.astype(float)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ARI | \n",
" AMI | \n",
" Homogeneity | \n",
"
\n",
" \n",
" \n",
" \n",
" SnapATAC | \n",
" 0.323942 | \n",
" 0.587034 | \n",
" 0.559376 | \n",
"
\n",
" \n",
" cusanovich2018 | \n",
" 0.48362 | \n",
" 0.662329 | \n",
" 0.68703 | \n",
"
\n",
" \n",
" scABC | \n",
" 0.270214 | \n",
" 0.464873 | \n",
" 0.446248 | \n",
"
\n",
" \n",
" cisTopic | \n",
" 0.51701 | \n",
" 0.661236 | \n",
" 0.682697 | \n",
"
\n",
" \n",
" Scasat | \n",
" 0.111576 | \n",
" 0.324815 | \n",
" 0.328444 | \n",
"
\n",
" \n",
" Cicero | \n",
" 0.22272 | \n",
" 0.349726 | \n",
" 0.352056 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ARI AMI Homogeneity\n",
"SnapATAC 0.323942 0.587034 0.559376\n",
"cusanovich2018 0.48362 0.662329 0.68703\n",
"scABC 0.270214 0.464873 0.446248\n",
"cisTopic 0.51701 0.661236 0.682697\n",
"Scasat 0.111576 0.324815 0.328444\n",
"Cicero 0.22272 0.349726 0.352056"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_metrics"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Write the score table to a CSV file in the current directory.\n",
"df_metrics.to_csv(path_or_buf='./clustering_scores.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:ATACseq_clustering]",
"language": "python",
"name": "conda-env-ATACseq_clustering-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}