{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# PyCaret 2 Clustering Example\n", "This notebook was created using PyCaret 2.0. Last updated: 28-07-2020" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pycaret-nightly-0.39\n" ] } ], "source": [ "# check version\n", "from pycaret.utils import version\n", "version()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Loading Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Country Name | \n", "1995 | \n", "1996 | \n", "1997 | \n", "1998 | \n", "1999 | \n", "2000 | \n", "2001 | \n", "2002 | \n", "2003 | \n", "... | \n", "2005 | \n", "2006 | \n", "2007 | \n", "2008 | \n", "2009 | \n", "2010 | \n", "2011 | \n", "2012 | \n", "2013 | \n", "2014 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "Arab World | \n", "2.004868 | \n", "2.014602 | \n", "2.071309 | \n", "2.177712 | \n", "2.331000 | \n", "2.333596 | \n", "2.588751 | \n", "2.540238 | \n", "2.450415 | \n", "... | \n", "2.134281 | \n", "2.133038 | \n", "2.166872 | \n", "2.101233 | \n", "2.830067 | \n", "2.489631 | \n", "2.539570 | \n", "2.711262 | \n", "2.895427 | \n", "3.073161 | \n", "
| 1 | \n", "Caribbean small states | \n", "2.801518 | \n", "2.856684 | \n", "2.997157 | \n", "2.989451 | \n", "2.767858 | \n", "2.826752 | \n", "2.679969 | \n", "2.888693 | \n", "2.740593 | \n", "... | \n", "2.629580 | \n", "2.650900 | \n", "2.790665 | \n", "2.822913 | \n", "3.408651 | \n", "3.264064 | \n", "3.087653 | \n", "3.314303 | \n", "3.318432 | \n", "3.260012 | \n", "
| 2 | \n", "Central Europe and the Baltics | \n", "4.678528 | \n", "4.753209 | \n", "4.604574 | \n", "4.499988 | \n", "4.679082 | \n", "4.539711 | \n", "4.666272 | \n", "4.900196 | \n", "5.100249 | \n", "... | \n", "4.970861 | \n", "4.841450 | \n", "4.809057 | \n", "5.054785 | \n", "5.394921 | \n", "5.284380 | \n", "5.096212 | \n", "5.041317 | \n", "5.029266 | \n", "5.017717 | \n", "
| 3 | \n", "Early-demographic dividend | \n", "2.203164 | \n", "2.156632 | \n", "2.227311 | \n", "2.364100 | \n", "2.454394 | \n", "2.450555 | \n", "2.527081 | \n", "2.347702 | \n", "2.363263 | \n", "... | \n", "2.337347 | \n", "2.369884 | \n", "2.385251 | \n", "2.405126 | \n", "2.701260 | \n", "2.507131 | \n", "2.495491 | \n", "2.497340 | \n", "2.586701 | \n", "2.665603 | \n", "
| 4 | \n", "East Asia & Pacific | \n", "4.429090 | \n", "4.203152 | \n", "4.244351 | \n", "4.453984 | \n", "4.626920 | \n", "4.688849 | \n", "4.684790 | \n", "4.613537 | \n", "4.635098 | \n", "... | \n", "4.566215 | \n", "4.367146 | \n", "4.297394 | \n", "4.434848 | \n", "4.865241 | \n", "4.775817 | \n", "4.871727 | \n", "4.866869 | \n", "4.643221 | \n", "4.571448 | \n", "
5 rows × 21 columns
\n", "| Description | Value | |
|---|---|---|
| 0 | \n", "session_id | \n", "123 | \n", "
| 1 | \n", "Original Data | \n", "(224, 21) | \n", "
| 2 | \n", "Missing Values | \n", "False | \n", "
| 3 | \n", "Numeric Features | \n", "20 | \n", "
| 4 | \n", "Categorical Features | \n", "1 | \n", "
| 5 | \n", "Ordinal Features | \n", "False | \n", "
| 6 | \n", "High Cardinality Features | \n", "False | \n", "
| 7 | \n", "Transformed Data | \n", "(224, 20) | \n", "
| 8 | \n", "Numeric Imputer | \n", "mean | \n", "
| 9 | \n", "Categorical Imputer | \n", "constant | \n", "
| 10 | \n", "Normalize | \n", "False | \n", "
| 11 | \n", "Normalize Method | \n", "None | \n", "
| 12 | \n", "Transformation | \n", "False | \n", "
| 13 | \n", "Transformation Method | \n", "None | \n", "
| 14 | \n", "PCA | \n", "False | \n", "
| 15 | \n", "PCA Method | \n", "None | \n", "
| 16 | \n", "PCA components | \n", "None | \n", "
| 17 | \n", "Ignore Low Variance | \n", "False | \n", "
| 18 | \n", "Combine Rare Levels | \n", "False | \n", "
| 19 | \n", "Rare Level Threshold | \n", "None | \n", "
| 20 | \n", "Numeric Binning | \n", "False | \n", "
| 21 | \n", "Remove Multicollinearity | \n", "False | \n", "
| 22 | \n", "Multicollinearity Threshold | \n", "None | \n", "
| 23 | \n", "Group Features | \n", "False | \n", "
| \n", " | Name | \n", "Reference | \n", "
|---|---|---|
| ID | \n", "\n", " | \n", " |
| kmeans | \n", "K-Means Clustering | \n", "sklearn.cluster.KMeans | \n", "
| ap | \n", "Affinity Propagation | \n", "sklearn.cluster.AffinityPropagation | \n", "
| meanshift | \n", "Mean shift Clustering | \n", "sklearn.cluster.MeanShift | \n", "
| sc | \n", "Spectral Clustering | \n", "sklearn.cluster.SpectralClustering | \n", "
| hclust | \n", "Agglomerative Clustering | \n", "sklearn.cluster.AgglomerativeClustering | \n", "
| dbscan | \n", "Density-Based Spatial Clustering | \n", "sklearn.cluster.DBSCAN | \n", "
| optics | \n", "OPTICS Clustering | \n", "sklearn.cluster.OPTICS | \n", "
| birch | \n", "Birch Clustering | \n", "sklearn.cluster.Birch | \n", "
| kmodes | \n", "K-Modes Clustering | \n", "git/nicodv/kmodes | \n", "
| \n", " | Metric | \n", "
|---|---|
| Silhouette | \n", "0.4335 | \n", "
| Calinski-Harabasz | \n", "322.9575 | \n", "
| Davies-Bouldin | \n", "0.7471 | \n", "
| \n", " | Metric | \n", "
|---|---|
| Silhouette | \n", "-0.3632 | \n", "
| Calinski-Harabasz | \n", "1.2468 | \n", "
| Davies-Bouldin | \n", "1.2297 | \n", "
| \n", " | Country Name | \n", "1995 | \n", "1996 | \n", "1997 | \n", "1998 | \n", "1999 | \n", "2000 | \n", "2001 | \n", "2002 | \n", "2003 | \n", "... | \n", "2006 | \n", "2007 | \n", "2008 | \n", "2009 | \n", "2010 | \n", "2011 | \n", "2012 | \n", "2013 | \n", "2014 | \n", "Cluster | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "Arab World | \n", "2.004868 | \n", "2.014602 | \n", "2.071309 | \n", "2.177712 | \n", "2.331000 | \n", "2.333596 | \n", "2.588751 | \n", "2.540238 | \n", "2.450415 | \n", "... | \n", "2.133038 | \n", "2.166872 | \n", "2.101233 | \n", "2.830067 | \n", "2.489631 | \n", "2.539570 | \n", "2.711262 | \n", "2.895427 | \n", "3.073161 | \n", "Cluster 2 | \n", "
| 1 | \n", "Caribbean small states | \n", "2.801518 | \n", "2.856684 | \n", "2.997157 | \n", "2.989451 | \n", "2.767858 | \n", "2.826752 | \n", "2.679969 | \n", "2.888693 | \n", "2.740593 | \n", "... | \n", "2.650900 | \n", "2.790665 | \n", "2.822913 | \n", "3.408651 | \n", "3.264064 | \n", "3.087653 | \n", "3.314303 | \n", "3.318432 | \n", "3.260012 | \n", "Cluster 2 | \n", "
| 2 | \n", "Central Europe and the Baltics | \n", "4.678528 | \n", "4.753209 | \n", "4.604574 | \n", "4.499988 | \n", "4.679082 | \n", "4.539711 | \n", "4.666272 | \n", "4.900196 | \n", "5.100249 | \n", "... | \n", "4.841450 | \n", "4.809057 | \n", "5.054785 | \n", "5.394921 | \n", "5.284380 | \n", "5.096212 | \n", "5.041317 | \n", "5.029266 | \n", "5.017717 | \n", "Cluster 0 | \n", "
| 3 | \n", "Early-demographic dividend | \n", "2.203164 | \n", "2.156632 | \n", "2.227311 | \n", "2.364100 | \n", "2.454394 | \n", "2.450555 | \n", "2.527081 | \n", "2.347702 | \n", "2.363263 | \n", "... | \n", "2.369884 | \n", "2.385251 | \n", "2.405126 | \n", "2.701260 | \n", "2.507131 | \n", "2.495491 | \n", "2.497340 | \n", "2.586701 | \n", "2.665603 | \n", "Cluster 2 | \n", "
| 4 | \n", "East Asia & Pacific | \n", "4.429090 | \n", "4.203152 | \n", "4.244351 | \n", "4.453984 | \n", "4.626920 | \n", "4.688849 | \n", "4.684790 | \n", "4.613537 | \n", "4.635098 | \n", "... | \n", "4.367146 | \n", "4.297394 | \n", "4.434848 | \n", "4.865241 | \n", "4.775817 | \n", "4.871727 | \n", "4.866869 | \n", "4.643221 | \n", "4.571448 | \n", "Cluster 0 | \n", "
5 rows × 22 columns
\n", "| \n", " | Country Name | \n", "1995 | \n", "1996 | \n", "1997 | \n", "1998 | \n", "1999 | \n", "2000 | \n", "2001 | \n", "2002 | \n", "2003 | \n", "... | \n", "2006 | \n", "2007 | \n", "2008 | \n", "2009 | \n", "2010 | \n", "2011 | \n", "2012 | \n", "2013 | \n", "2014 | \n", "Cluster | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "Arab World | \n", "2.004868 | \n", "2.014602 | \n", "2.071309 | \n", "2.177712 | \n", "2.331000 | \n", "2.333596 | \n", "2.588751 | \n", "2.540238 | \n", "2.450415 | \n", "... | \n", "2.133038 | \n", "2.166872 | \n", "2.101233 | \n", "2.830067 | \n", "2.489631 | \n", "2.539570 | \n", "2.711262 | \n", "2.895427 | \n", "3.073161 | \n", "Cluster 2 | \n", "
| 1 | \n", "Caribbean small states | \n", "2.801518 | \n", "2.856684 | \n", "2.997157 | \n", "2.989451 | \n", "2.767858 | \n", "2.826752 | \n", "2.679969 | \n", "2.888693 | \n", "2.740593 | \n", "... | \n", "2.650900 | \n", "2.790665 | \n", "2.822913 | \n", "3.408651 | \n", "3.264064 | \n", "3.087653 | \n", "3.314303 | \n", "3.318432 | \n", "3.260012 | \n", "Cluster 2 | \n", "
| 2 | \n", "Central Europe and the Baltics | \n", "4.678528 | \n", "4.753209 | \n", "4.604574 | \n", "4.499988 | \n", "4.679082 | \n", "4.539711 | \n", "4.666272 | \n", "4.900196 | \n", "5.100249 | \n", "... | \n", "4.841450 | \n", "4.809057 | \n", "5.054785 | \n", "5.394921 | \n", "5.284380 | \n", "5.096212 | \n", "5.041317 | \n", "5.029266 | \n", "5.017717 | \n", "Cluster 0 | \n", "
| 3 | \n", "Early-demographic dividend | \n", "2.203164 | \n", "2.156632 | \n", "2.227311 | \n", "2.364100 | \n", "2.454394 | \n", "2.450555 | \n", "2.527081 | \n", "2.347702 | \n", "2.363263 | \n", "... | \n", "2.369884 | \n", "2.385251 | \n", "2.405126 | \n", "2.701260 | \n", "2.507131 | \n", "2.495491 | \n", "2.497340 | \n", "2.586701 | \n", "2.665603 | \n", "Cluster 2 | \n", "
| 4 | \n", "East Asia & Pacific | \n", "4.429090 | \n", "4.203152 | \n", "4.244351 | \n", "4.453984 | \n", "4.626920 | \n", "4.688849 | \n", "4.684790 | \n", "4.613537 | \n", "4.635098 | \n", "... | \n", "4.367146 | \n", "4.297394 | \n", "4.434848 | \n", "4.865241 | \n", "4.775817 | \n", "4.871727 | \n", "4.866869 | \n", "4.643221 | \n", "4.571448 | \n", "Cluster 0 | \n", "
5 rows × 22 columns
\n", "Pipeline(memory=None,\n",
" steps=[('dtypes',\n",
" DataTypes_Auto_infer(categorical_features=[],\n",
" display_types=True,\n",
" features_todrop=['Country Name'],\n",
" ml_usecase='regression',\n",
" numerical_features=[],\n",
" target='dummy_target',\n",
" time_features=[])),\n",
" ('imputer',\n",
" Simple_Imputer(categorical_strategy='not_available',\n",
" numeric_strategy='mean',\n",
" target_variable=None)),\n",
" ('new_levels1',\n",
" New_...\n",
" target='dummy_target')),\n",
" ('feature_time',\n",
" Make_Time_Features(list_of_features=None, time_feature=[])),\n",
" ('group', Empty()), ('scaling', Empty()),\n",
" ('P_transform', Empty()), ('binn', Empty()),\n",
" ('fix_perfect', Empty()), ('rem_outliers', Empty()),\n",
" ('dummy', Dummify(target='dummy_target')),\n",
" ('clean_names', Clean_Colum_Names()), ('fix_multi', Empty()),\n",
" ('pca', Empty())],\n",
" verbose=False)DataTypes_Auto_infer(features_todrop=['Country Name'], ml_usecase='regression',\n",
" target='dummy_target')Simple_Imputer(categorical_strategy='not_available', numeric_strategy='mean',\n",
" target_variable=None)New_Catagorical_Levels_in_TestData(replacement_strategy='least frequent',\n",
" target='dummy_target')Empty()
Empty()
Empty()
Empty()
New_Catagorical_Levels_in_TestData(replacement_strategy='least frequent',\n",
" target='dummy_target')Make_Time_Features(list_of_features=None)
Empty()
Empty()
Empty()
Empty()
Empty()
Empty()
Dummify(target='dummy_target')
Clean_Colum_Names()
Empty()
Empty()
| \n", " | 1995 | \n", "1996 | \n", "1997 | \n", "1998 | \n", "1999 | \n", "2000 | \n", "2001 | \n", "2002 | \n", "2003 | \n", "2004 | \n", "2005 | \n", "2006 | \n", "2007 | \n", "2008 | \n", "2009 | \n", "2010 | \n", "2011 | \n", "2012 | \n", "2013 | \n", "2014 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "2.004868 | \n", "2.014602 | \n", "2.071309 | \n", "2.177712 | \n", "2.331000 | \n", "2.333596 | \n", "2.588751 | \n", "2.540238 | \n", "2.450415 | \n", "2.314914 | \n", "2.134281 | \n", "2.133038 | \n", "2.166872 | \n", "2.101233 | \n", "2.830067 | \n", "2.489631 | \n", "2.539570 | \n", "2.711262 | \n", "2.895427 | \n", "3.073161 | \n", "
| 1 | \n", "2.801518 | \n", "2.856684 | \n", "2.997157 | \n", "2.989451 | \n", "2.767858 | \n", "2.826752 | \n", "2.679969 | \n", "2.888693 | \n", "2.740593 | \n", "2.845971 | \n", "2.629580 | \n", "2.650900 | \n", "2.790665 | \n", "2.822913 | \n", "3.408651 | \n", "3.264064 | \n", "3.087653 | \n", "3.314303 | \n", "3.318432 | \n", "3.260012 | \n", "
| 2 | \n", "4.678528 | \n", "4.753209 | \n", "4.604574 | \n", "4.499988 | \n", "4.679082 | \n", "4.539711 | \n", "4.666272 | \n", "4.900196 | \n", "5.100249 | \n", "4.913629 | \n", "4.970861 | \n", "4.841450 | \n", "4.809057 | \n", "5.054785 | \n", "5.394921 | \n", "5.284380 | \n", "5.096212 | \n", "5.041317 | \n", "5.029266 | \n", "5.017717 | \n", "
| 3 | \n", "2.203164 | \n", "2.156632 | \n", "2.227311 | \n", "2.364100 | \n", "2.454394 | \n", "2.450555 | \n", "2.527081 | \n", "2.347702 | \n", "2.363263 | \n", "2.346824 | \n", "2.337347 | \n", "2.369884 | \n", "2.385251 | \n", "2.405126 | \n", "2.701260 | \n", "2.507131 | \n", "2.495491 | \n", "2.497340 | \n", "2.586701 | \n", "2.665603 | \n", "
| 4 | \n", "4.429090 | \n", "4.203152 | \n", "4.244351 | \n", "4.453984 | \n", "4.626920 | \n", "4.688849 | \n", "4.684790 | \n", "4.613537 | \n", "4.635098 | \n", "4.626696 | \n", "4.566215 | \n", "4.367146 | \n", "4.297394 | \n", "4.434848 | \n", "4.865241 | \n", "4.775817 | \n", "4.871727 | \n", "4.866869 | \n", "4.643221 | \n", "4.571448 | \n", "