{ "cells": [ { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "from kneed import KneeLocator\n", "from sklearn.datasets import make_blobs\n", "from sklearn.cluster import KMeans\n", "from sklearn.metrics import silhouette_score\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.decomposition import PCA\n", "import seaborn as sns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Import data" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "url = 'https://raw.githubusercontent.com/arofiqimaulana/dataset/master/iris.csv'\n", "df = pd.read_csv(url)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)targetflower_name
05.13.51.40.20setosa
14.93.01.40.20setosa
24.73.21.30.20setosa
34.63.11.50.20setosa
45.03.61.40.20setosa
\n", "
" ], "text/plain": [ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", "0 5.1 3.5 1.4 0.2 \n", "1 4.9 3.0 1.4 0.2 \n", "2 4.7 3.2 1.3 0.2 \n", "3 4.6 3.1 1.5 0.2 \n", "4 5.0 3.6 1.4 0.2 \n", "\n", " target flower_name \n", "0 0 setosa \n", "1 0 setosa \n", "2 0 setosa \n", "3 0 setosa \n", "4 0 setosa " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "X = df[['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Standardisasi" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", "scaled_features = scaler.fit_transform(X)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Clustering\n", "Meskipun k-means tidak membutuhkan variabel Y, kita tetap bisa melakukan validasi menggunakan data berlabel seperti data iris." ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "kmeans = KMeans(\n", " init=\"random\",\n", " n_clusters=3,\n", " n_init=10,\n", " max_iter=300,\n", " random_state=42\n", ")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "KMeans(init='random', n_clusters=3, random_state=42)" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kmeans.fit(scaled_features)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "labels = pd.DataFrame(kmeans.labels_)\n", "labels.columns = ['label_kmeans']" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "df['labels'] = labels" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)targetflower_namelabels
05.13.51.40.20setosa1
14.93.01.40.20setosa1
24.73.21.30.20setosa1
34.63.11.50.20setosa1
45.03.61.40.20setosa1
........................
1456.73.05.22.32virginica0
1466.32.55.01.92virginica2
1476.53.05.22.02virginica0
1486.23.45.42.32virginica0
1495.93.05.11.82virginica2
\n", "

150 rows × 7 columns

\n", "
" ], "text/plain": [ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", "0 5.1 3.5 1.4 0.2 \n", "1 4.9 3.0 1.4 0.2 \n", "2 4.7 3.2 1.3 0.2 \n", "3 4.6 3.1 1.5 0.2 \n", "4 5.0 3.6 1.4 0.2 \n", ".. ... ... ... ... \n", "145 6.7 3.0 5.2 2.3 \n", "146 6.3 2.5 5.0 1.9 \n", "147 6.5 3.0 5.2 2.0 \n", "148 6.2 3.4 5.4 2.3 \n", "149 5.9 3.0 5.1 1.8 \n", "\n", " target flower_name labels \n", "0 0 setosa 1 \n", "1 0 setosa 1 \n", "2 0 setosa 1 \n", "3 0 setosa 1 \n", "4 0 setosa 1 \n", ".. ... ... ... \n", "145 2 virginica 0 \n", "146 2 virginica 2 \n", "147 2 virginica 0 \n", "148 2 virginica 0 \n", "149 2 virginica 2 \n", "\n", "[150 rows x 7 columns]" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualize using PCA\n", "Visualisasi data paling mudah menggunakan grafik 2 dimensi. Karena kita punya lebih dari 2 variabel, kita bisa gunakan PCA untuk mereduksinya menjadi 2 variabel baru (PC1 dan PC2).\n", "\n", "Catatan: PCA di bawah ini dihitung dari data asli `X` (belum distandardisasi), sedangkan label cluster diperoleh dari k-means pada data yang sudah distandardisasi (`scaled_features`)." ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "pca = PCA(2)\n", "\n", "arr_pca = pca.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "df_pca = pd.DataFrame(arr_pca)\n", "df_pca.columns = ['PC1','PC2']\n", "df_pca['labels'] = labels" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.scatterplot(data=df_pca, x=\"PC1\", y=\"PC2\", hue=\"labels\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pemilihan k terbaik\n", "Pemilihan k terbaik bisa menggunakan metode elbow (siku). Metode ini menghitung SSE (Sum of Squared Errors, yaitu nilai `inertia_` pada KMeans) untuk setiap nilai k. Nilai k pada titik siku, yaitu ketika penurunan SSE mulai melandai, bisa kita katakan sebagai k yang terbaik. \n", "\n", "Berdasarkan grafik di bawah ini, bisa kita katakan bahwa jumlah cluster paling optimal adalah 3. Hal ini karena penurunan nilai SSE sudah tidak terlalu besar jika kita pakai k=4, k=5, k=6, dst." ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "kmeans_kwargs = {\n", " \"init\": \"random\",\n", " \"n_init\": 10,\n", " \"max_iter\": 300,\n", " \"random_state\": 42\n", "}" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Anaconda\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:881: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n", " warnings.warn(\n" ] } ], "source": [ "sse = []\n", "for k in range(1, 11):\n", " kmeans = KMeans(n_clusters=k, **kmeans_kwargs)\n", " kmeans.fit(scaled_features)\n", " sse.append(kmeans.inertia_)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.style.use(\"fivethirtyeight\")\n", "plt.plot(range(1, 11), sse)\n", "plt.xticks(range(1, 11))\n", "plt.xlabel(\"Number of Clusters\")\n", "plt.ylabel(\"SSE\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reference\n", "- https://realpython.com/k-means-clustering-python/\n", "- https://www.askpython.com/python/examples/plot-k-means-clusters-python\n", "- https://seaborn.pydata.org/generated/seaborn.scatterplot.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }