{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# PyCaret 2 Clustering Example\n", "This notebook was created using PyCaret 2.0. Last updated: 28-07-2020" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pycaret-nightly-0.39\n" ] } ], "source": [ "# check version\n", "from pycaret.utils import version\n", "version()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Loading Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Country Name | \n", "1995 | \n", "1996 | \n", "1997 | \n", "1998 | \n", "1999 | \n", "2000 | \n", "2001 | \n", "2002 | \n", "2003 | \n", "... | \n", "2005 | \n", "2006 | \n", "2007 | \n", "2008 | \n", "2009 | \n", "2010 | \n", "2011 | \n", "2012 | \n", "2013 | \n", "2014 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "Arab World | \n", "2.004868 | \n", "2.014602 | \n", "2.071309 | \n", "2.177712 | \n", "2.331000 | \n", "2.333596 | \n", "2.588751 | \n", "2.540238 | \n", "2.450415 | \n", "... | \n", "2.134281 | \n", "2.133038 | \n", "2.166872 | \n", "2.101233 | \n", "2.830067 | \n", "2.489631 | \n", "2.539570 | \n", "2.711262 | \n", "2.895427 | \n", "3.073161 | \n", "
1 | \n", "Caribbean small states | \n", "2.801518 | \n", "2.856684 | \n", "2.997157 | \n", "2.989451 | \n", "2.767858 | \n", "2.826752 | \n", "2.679969 | \n", "2.888693 | \n", "2.740593 | \n", "... | \n", "2.629580 | \n", "2.650900 | \n", "2.790665 | \n", "2.822913 | \n", "3.408651 | \n", "3.264064 | \n", "3.087653 | \n", "3.314303 | \n", "3.318432 | \n", "3.260012 | \n", "
2 | \n", "Central Europe and the Baltics | \n", "4.678528 | \n", "4.753209 | \n", "4.604574 | \n", "4.499988 | \n", "4.679082 | \n", "4.539711 | \n", "4.666272 | \n", "4.900196 | \n", "5.100249 | \n", "... | \n", "4.970861 | \n", "4.841450 | \n", "4.809057 | \n", "5.054785 | \n", "5.394921 | \n", "5.284380 | \n", "5.096212 | \n", "5.041317 | \n", "5.029266 | \n", "5.017717 | \n", "
3 | \n", "Early-demographic dividend | \n", "2.203164 | \n", "2.156632 | \n", "2.227311 | \n", "2.364100 | \n", "2.454394 | \n", "2.450555 | \n", "2.527081 | \n", "2.347702 | \n", "2.363263 | \n", "... | \n", "2.337347 | \n", "2.369884 | \n", "2.385251 | \n", "2.405126 | \n", "2.701260 | \n", "2.507131 | \n", "2.495491 | \n", "2.497340 | \n", "2.586701 | \n", "2.665603 | \n", "
4 | \n", "East Asia & Pacific | \n", "4.429090 | \n", "4.203152 | \n", "4.244351 | \n", "4.453984 | \n", "4.626920 | \n", "4.688849 | \n", "4.684790 | \n", "4.613537 | \n", "4.635098 | \n", "... | \n", "4.566215 | \n", "4.367146 | \n", "4.297394 | \n", "4.434848 | \n", "4.865241 | \n", "4.775817 | \n", "4.871727 | \n", "4.866869 | \n", "4.643221 | \n", "4.571448 | \n", "
5 rows × 21 columns
\n", "Description | Value | |
---|---|---|
0 | \n", "session_id | \n", "123 | \n", "
1 | \n", "Original Data | \n", "(224, 21) | \n", "
2 | \n", "Missing Values | \n", "False | \n", "
3 | \n", "Numeric Features | \n", "20 | \n", "
4 | \n", "Categorical Features | \n", "1 | \n", "
5 | \n", "Ordinal Features | \n", "False | \n", "
6 | \n", "High Cardinality Features | \n", "False | \n", "
7 | \n", "Transformed Data | \n", "(224, 20) | \n", "
8 | \n", "Numeric Imputer | \n", "mean | \n", "
9 | \n", "Categorical Imputer | \n", "constant | \n", "
10 | \n", "Normalize | \n", "False | \n", "
11 | \n", "Normalize Method | \n", "None | \n", "
12 | \n", "Transformation | \n", "False | \n", "
13 | \n", "Transformation Method | \n", "None | \n", "
14 | \n", "PCA | \n", "False | \n", "
15 | \n", "PCA Method | \n", "None | \n", "
16 | \n", "PCA components | \n", "None | \n", "
17 | \n", "Ignore Low Variance | \n", "False | \n", "
18 | \n", "Combine Rare Levels | \n", "False | \n", "
19 | \n", "Rare Level Threshold | \n", "None | \n", "
20 | \n", "Numeric Binning | \n", "False | \n", "
21 | \n", "Remove Multicollinearity | \n", "False | \n", "
22 | \n", "Multicollinearity Threshold | \n", "None | \n", "
23 | \n", "Group Features | \n", "False | \n", "
\n", " | Name | \n", "Reference | \n", "
---|---|---|
ID | \n", "\n", " | \n", " |
kmeans | \n", "K-Means Clustering | \n", "sklearn.cluster.KMeans | \n", "
ap | \n", "Affinity Propagation | \n", "sklearn.cluster.AffinityPropagation | \n", "
meanshift | \n", "Mean shift Clustering | \n", "sklearn.cluster.MeanShift | \n", "
sc | \n", "Spectral Clustering | \n", "sklearn.cluster.SpectralClustering | \n", "
hclust | \n", "Agglomerative Clustering | \n", "sklearn.cluster.AgglomerativeClustering | \n", "
dbscan | \n", "Density-Based Spatial Clustering | \n", "sklearn.cluster.DBSCAN | \n", "
optics | \n", "OPTICS Clustering | \n", "sklearn.cluster.OPTICS | \n", "
birch | \n", "Birch Clustering | \n", "sklearn.cluster.Birch | \n", "
kmodes | \n", "K-Modes Clustering | \n", "git/nicodv/kmodes | \n", "
\n", " | Metric | \n", "
---|---|
Silhouette | \n", "0.4335 | \n", "
Calinski-Harabasz | \n", "322.9575 | \n", "
Davies-Bouldin | \n", "0.7471 | \n", "
\n", " | Metric | \n", "
---|---|
Silhouette | \n", "-0.3632 | \n", "
Calinski-Harabasz | \n", "1.2468 | \n", "
Davies-Bouldin | \n", "1.2297 | \n", "
\n", " | Country Name | \n", "1995 | \n", "1996 | \n", "1997 | \n", "1998 | \n", "1999 | \n", "2000 | \n", "2001 | \n", "2002 | \n", "2003 | \n", "... | \n", "2006 | \n", "2007 | \n", "2008 | \n", "2009 | \n", "2010 | \n", "2011 | \n", "2012 | \n", "2013 | \n", "2014 | \n", "Cluster | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "Arab World | \n", "2.004868 | \n", "2.014602 | \n", "2.071309 | \n", "2.177712 | \n", "2.331000 | \n", "2.333596 | \n", "2.588751 | \n", "2.540238 | \n", "2.450415 | \n", "... | \n", "2.133038 | \n", "2.166872 | \n", "2.101233 | \n", "2.830067 | \n", "2.489631 | \n", "2.539570 | \n", "2.711262 | \n", "2.895427 | \n", "3.073161 | \n", "Cluster 2 | \n", "
1 | \n", "Caribbean small states | \n", "2.801518 | \n", "2.856684 | \n", "2.997157 | \n", "2.989451 | \n", "2.767858 | \n", "2.826752 | \n", "2.679969 | \n", "2.888693 | \n", "2.740593 | \n", "... | \n", "2.650900 | \n", "2.790665 | \n", "2.822913 | \n", "3.408651 | \n", "3.264064 | \n", "3.087653 | \n", "3.314303 | \n", "3.318432 | \n", "3.260012 | \n", "Cluster 2 | \n", "
2 | \n", "Central Europe and the Baltics | \n", "4.678528 | \n", "4.753209 | \n", "4.604574 | \n", "4.499988 | \n", "4.679082 | \n", "4.539711 | \n", "4.666272 | \n", "4.900196 | \n", "5.100249 | \n", "... | \n", "4.841450 | \n", "4.809057 | \n", "5.054785 | \n", "5.394921 | \n", "5.284380 | \n", "5.096212 | \n", "5.041317 | \n", "5.029266 | \n", "5.017717 | \n", "Cluster 0 | \n", "
3 | \n", "Early-demographic dividend | \n", "2.203164 | \n", "2.156632 | \n", "2.227311 | \n", "2.364100 | \n", "2.454394 | \n", "2.450555 | \n", "2.527081 | \n", "2.347702 | \n", "2.363263 | \n", "... | \n", "2.369884 | \n", "2.385251 | \n", "2.405126 | \n", "2.701260 | \n", "2.507131 | \n", "2.495491 | \n", "2.497340 | \n", "2.586701 | \n", "2.665603 | \n", "Cluster 2 | \n", "
4 | \n", "East Asia & Pacific | \n", "4.429090 | \n", "4.203152 | \n", "4.244351 | \n", "4.453984 | \n", "4.626920 | \n", "4.688849 | \n", "4.684790 | \n", "4.613537 | \n", "4.635098 | \n", "... | \n", "4.367146 | \n", "4.297394 | \n", "4.434848 | \n", "4.865241 | \n", "4.775817 | \n", "4.871727 | \n", "4.866869 | \n", "4.643221 | \n", "4.571448 | \n", "Cluster 0 | \n", "
5 rows × 22 columns
\n", "\n", " | Country Name | \n", "1995 | \n", "1996 | \n", "1997 | \n", "1998 | \n", "1999 | \n", "2000 | \n", "2001 | \n", "2002 | \n", "2003 | \n", "... | \n", "2006 | \n", "2007 | \n", "2008 | \n", "2009 | \n", "2010 | \n", "2011 | \n", "2012 | \n", "2013 | \n", "2014 | \n", "Cluster | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "Arab World | \n", "2.004868 | \n", "2.014602 | \n", "2.071309 | \n", "2.177712 | \n", "2.331000 | \n", "2.333596 | \n", "2.588751 | \n", "2.540238 | \n", "2.450415 | \n", "... | \n", "2.133038 | \n", "2.166872 | \n", "2.101233 | \n", "2.830067 | \n", "2.489631 | \n", "2.539570 | \n", "2.711262 | \n", "2.895427 | \n", "3.073161 | \n", "Cluster 2 | \n", "
1 | \n", "Caribbean small states | \n", "2.801518 | \n", "2.856684 | \n", "2.997157 | \n", "2.989451 | \n", "2.767858 | \n", "2.826752 | \n", "2.679969 | \n", "2.888693 | \n", "2.740593 | \n", "... | \n", "2.650900 | \n", "2.790665 | \n", "2.822913 | \n", "3.408651 | \n", "3.264064 | \n", "3.087653 | \n", "3.314303 | \n", "3.318432 | \n", "3.260012 | \n", "Cluster 2 | \n", "
2 | \n", "Central Europe and the Baltics | \n", "4.678528 | \n", "4.753209 | \n", "4.604574 | \n", "4.499988 | \n", "4.679082 | \n", "4.539711 | \n", "4.666272 | \n", "4.900196 | \n", "5.100249 | \n", "... | \n", "4.841450 | \n", "4.809057 | \n", "5.054785 | \n", "5.394921 | \n", "5.284380 | \n", "5.096212 | \n", "5.041317 | \n", "5.029266 | \n", "5.017717 | \n", "Cluster 0 | \n", "
3 | \n", "Early-demographic dividend | \n", "2.203164 | \n", "2.156632 | \n", "2.227311 | \n", "2.364100 | \n", "2.454394 | \n", "2.450555 | \n", "2.527081 | \n", "2.347702 | \n", "2.363263 | \n", "... | \n", "2.369884 | \n", "2.385251 | \n", "2.405126 | \n", "2.701260 | \n", "2.507131 | \n", "2.495491 | \n", "2.497340 | \n", "2.586701 | \n", "2.665603 | \n", "Cluster 2 | \n", "
4 | \n", "East Asia & Pacific | \n", "4.429090 | \n", "4.203152 | \n", "4.244351 | \n", "4.453984 | \n", "4.626920 | \n", "4.688849 | \n", "4.684790 | \n", "4.613537 | \n", "4.635098 | \n", "... | \n", "4.367146 | \n", "4.297394 | \n", "4.434848 | \n", "4.865241 | \n", "4.775817 | \n", "4.871727 | \n", "4.866869 | \n", "4.643221 | \n", "4.571448 | \n", "Cluster 0 | \n", "
5 rows × 22 columns
\n", "Pipeline(memory=None,\n", " steps=[('dtypes',\n", " DataTypes_Auto_infer(categorical_features=[],\n", " display_types=True,\n", " features_todrop=['Country Name'],\n", " ml_usecase='regression',\n", " numerical_features=[],\n", " target='dummy_target',\n", " time_features=[])),\n", " ('imputer',\n", " Simple_Imputer(categorical_strategy='not_available',\n", " numeric_strategy='mean',\n", " target_variable=None)),\n", " ('new_levels1',\n", " New_...\n", " target='dummy_target')),\n", " ('feature_time',\n", " Make_Time_Features(list_of_features=None, time_feature=[])),\n", " ('group', Empty()), ('scaling', Empty()),\n", " ('P_transform', Empty()), ('binn', Empty()),\n", " ('fix_perfect', Empty()), ('rem_outliers', Empty()),\n", " ('dummy', Dummify(target='dummy_target')),\n", " ('clean_names', Clean_Colum_Names()), ('fix_multi', Empty()),\n", " ('pca', Empty())],\n", " verbose=False)
DataTypes_Auto_infer(features_todrop=['Country Name'], ml_usecase='regression',\n", " target='dummy_target')
Simple_Imputer(categorical_strategy='not_available', numeric_strategy='mean',\n", " target_variable=None)
New_Catagorical_Levels_in_TestData(replacement_strategy='least frequent',\n", " target='dummy_target')
Empty()
Empty()
Empty()
Empty()
New_Catagorical_Levels_in_TestData(replacement_strategy='least frequent',\n", " target='dummy_target')
Make_Time_Features(list_of_features=None)
Empty()
Empty()
Empty()
Empty()
Empty()
Empty()
Dummify(target='dummy_target')
Clean_Colum_Names()
Empty()
Empty()
\n", " | 1995 | \n", "1996 | \n", "1997 | \n", "1998 | \n", "1999 | \n", "2000 | \n", "2001 | \n", "2002 | \n", "2003 | \n", "2004 | \n", "2005 | \n", "2006 | \n", "2007 | \n", "2008 | \n", "2009 | \n", "2010 | \n", "2011 | \n", "2012 | \n", "2013 | \n", "2014 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2.004868 | \n", "2.014602 | \n", "2.071309 | \n", "2.177712 | \n", "2.331000 | \n", "2.333596 | \n", "2.588751 | \n", "2.540238 | \n", "2.450415 | \n", "2.314914 | \n", "2.134281 | \n", "2.133038 | \n", "2.166872 | \n", "2.101233 | \n", "2.830067 | \n", "2.489631 | \n", "2.539570 | \n", "2.711262 | \n", "2.895427 | \n", "3.073161 | \n", "
1 | \n", "2.801518 | \n", "2.856684 | \n", "2.997157 | \n", "2.989451 | \n", "2.767858 | \n", "2.826752 | \n", "2.679969 | \n", "2.888693 | \n", "2.740593 | \n", "2.845971 | \n", "2.629580 | \n", "2.650900 | \n", "2.790665 | \n", "2.822913 | \n", "3.408651 | \n", "3.264064 | \n", "3.087653 | \n", "3.314303 | \n", "3.318432 | \n", "3.260012 | \n", "
2 | \n", "4.678528 | \n", "4.753209 | \n", "4.604574 | \n", "4.499988 | \n", "4.679082 | \n", "4.539711 | \n", "4.666272 | \n", "4.900196 | \n", "5.100249 | \n", "4.913629 | \n", "4.970861 | \n", "4.841450 | \n", "4.809057 | \n", "5.054785 | \n", "5.394921 | \n", "5.284380 | \n", "5.096212 | \n", "5.041317 | \n", "5.029266 | \n", "5.017717 | \n", "
3 | \n", "2.203164 | \n", "2.156632 | \n", "2.227311 | \n", "2.364100 | \n", "2.454394 | \n", "2.450555 | \n", "2.527081 | \n", "2.347702 | \n", "2.363263 | \n", "2.346824 | \n", "2.337347 | \n", "2.369884 | \n", "2.385251 | \n", "2.405126 | \n", "2.701260 | \n", "2.507131 | \n", "2.495491 | \n", "2.497340 | \n", "2.586701 | \n", "2.665603 | \n", "
4 | \n", "4.429090 | \n", "4.203152 | \n", "4.244351 | \n", "4.453984 | \n", "4.626920 | \n", "4.688849 | \n", "4.684790 | \n", "4.613537 | \n", "4.635098 | \n", "4.626696 | \n", "4.566215 | \n", "4.367146 | \n", "4.297394 | \n", "4.434848 | \n", "4.865241 | \n", "4.775817 | \n", "4.871727 | \n", "4.866869 | \n", "4.643221 | \n", "4.571448 | \n", "