{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_iris\n", "\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['setosa', 'versicolor', 'virginica'], dtype='" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(9, 3.5))\n", "\n", "# 1. 꽃의 종류를 구분해서 보자.\n", "plt.subplot(121)\n", "plt.plot(X[y==0, 2], X[y==0, 3], \"yo\", label=\"Iris-Setosa\")\n", "plt.plot(X[y==1, 2], X[y==1, 3], \"bs\", label=\"Iris-Versicolor\")\n", "plt.plot(X[y==2, 2], X[y==2, 3], \"g^\", label=\"Iris-Virginica\")\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.ylabel(\"Petal width\", fontsize=14)\n", "plt.legend(fontsize=12)\n", "\n", "# 2. 꽃의 종류를 구분하지말고 보자.\n", "plt.subplot(122)\n", "plt.scatter(X[:, 2], X[:, 3], c=\"k\", marker=\".\")\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.tick_params(labelleft=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Gaussian mixture model" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from sklearn.mixture import GaussianMixture\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(150, 4)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", " 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,\n", " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = GaussianMixture(n_components=3, random_state=42).fit(X).predict(X)\n", "y_pred" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.plot(X[y_pred==0, 2], X[y_pred==0, 3], \"yo\", label=\"Cluster 1\")\n", "plt.plot(X[y_pred==1, 2], X[y_pred==1, 3], \"bs\", label=\"Cluster 2\")\n", "plt.plot(X[y_pred==2, 2], X[y_pred==2, 3], \"g^\", label=\"Cluster 3\")\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.ylabel(\"Petal width\", fontsize=14)\n", "plt.legend(loc=\"best\", fontsize=12)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "정확도를 살펴보자. Plot 정보를 살펴보면 알겠지만, 원래 클래스 `2, 0, 1`이 `0, 1, 2`로 바뀌었다." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Map each GMM cluster id to the iris class id it stands for\n", "# (cluster 0 -> class 2, cluster 1 -> class 0, cluster 2 -> class 1).\n", "# The result is stored under a new name so y_pred is never overwritten;\n", "# re-running this cell therefore cannot apply the permutation twice.\n", "cluster_to_class = np.array([2, 0, 1])\n", "y_pred_aligned = cluster_to_class[y_pred]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "145" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.sum(y_pred_aligned == y)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9666666666666667" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.sum(y_pred_aligned == y) / len(y_pred_aligned)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Using Clustering for Preprocessing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. 데이터 셋 로딩" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_digits" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "X_digits, y_digits = load_digits(return_X_y=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. 
학습셋 / 테스트셋 구분" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 학습" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### (1) 데이터 전처리 없이 학습" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=42, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9666666666666667" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_reg.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### (2) K-Means를 이용하여 데이터 전처리 후 학습" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# KMeans is needed by the pipeline in the next cell and by the\n", "# semi-supervised section further down; import it here so the notebook\n", "# runs from a fresh kernel.\n", "from sklearn.cluster import KMeans\n", "from sklearn.pipeline import Pipeline" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(memory=None,\n", " steps=[('kmeans', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", " n_clusters=50, n_init=10, n_jobs=1, 
precompute_distances='auto',\n", " random_state=42, tol=0.0001, verbose=0)), ('log_reg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=42, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False))])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline = Pipeline([\n", " (\"kmeans\", KMeans(n_clusters=50, random_state=42)),\n", " (\"log_reg\", LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)),\n", "])\n", "pipeline.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9822222222222222" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Grid search를 이용하여 최상의 $k$를 찾자." 
] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=3, error_score='raise',\n", " estimator=Pipeline(memory=None,\n", " steps=[('kmeans', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", " n_clusters=50, n_init=10, n_jobs=1, precompute_distances='auto',\n", " random_state=42, tol=0.0001, verbose=0)), ('log_reg', LogisticRegression(C=1.0, class_weight=None, dua...lty='l2', random_state=42, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False))]),\n", " fit_params=None, iid=True, n_jobs=1,\n", " param_grid={'kmeans__n_clusters': range(2, 100)},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", " scoring=None, verbose=0)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "param_grid = dict(kmeans__n_clusters=range(2, 100))\n", "grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=0)\n", "grid_clf.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'kmeans__n_clusters': 90}" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_clf.best_params_" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9844444444444445" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_clf.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$k=90$일 때, 성능이 근소하게나마 상승했다." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Clustering for Semi-supervised Learning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "다수의 unlabeled instances와 매우 적은 labeled instances를 가지고 있을 때, semi-supervised learning을 clustering을 이용하여 수행할 수 있습니다." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(1) 데이터는 아까의 image 데이터이며, 50개의 instance에 대해서만 label을 가지고 있다고 가정하겠습니다." ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "n_labeled = 50" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8266666666666667" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])\n", "log_reg.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "k = 50" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(2) Unlabeled + labeled data에서 clustering을 수행합니다." 
] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[31.33905994, 41.96889871, 39.35726202, ..., 52.16098656,\n", " 43.01099731, 47.0563954 ],\n", " [55.92396284, 43.71133883, 54.31262046, ..., 42.91000488,\n", " 43.32075513, 47.61348001],\n", " [42.70646977, 41.11611364, 18.08113402, ..., 46.43384382,\n", " 43.07657195, 51.59794401],\n", " ...,\n", " [52.03749979, 35.45795409, 47.95964353, ..., 39.43266647,\n", " 43.96234554, 36.1470427 ],\n", " [31.697396 , 34.37452504, 41.17381404, ..., 44.60183936,\n", " 39.03981532, 36.47989321],\n", " [43.72529456, 43.97780117, 47.90725608, ..., 53.60184146,\n", " 18.23619458, 26.30919826]])" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kmeans = KMeans(n_clusters=k, random_state=42)\n", "X_digits_dist = kmeans.fit_transform(X_train)\n", "X_digits_dist" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(3) 각 cluster의 centroid에 대하여 가장 가까운 images를 찾습니다." 
] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 911, 559, 23, 159, 736, 1056, 776, 795, 753, 598, 737,\n", " 683, 1194, 602, 817, 1284, 73, 702, 94, 891, 805, 1071,\n", " 1314, 1022, 1050, 525, 588, 481, 1005, 766, 848, 731, 749,\n", " 1322, 1336, 705, 1151, 494, 357, 459, 843, 850, 151, 256,\n", " 576, 460, 596, 648, 841, 214])" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "representative_digit_idx = np.argmin(X_digits_dist, axis=0)\n", "representative_digit_idx" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0., 0., 0., ..., 0., 0., 0.],\n", " [ 0., 0., 2., ..., 14., 2., 0.],\n", " [ 0., 0., 4., ..., 6., 0., 0.],\n", " ...,\n", " [ 0., 0., 4., ..., 9., 1., 0.],\n", " [ 0., 0., 6., ..., 3., 0., 0.],\n", " [ 0., 0., 1., ..., 9., 0., 0.]])" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_representative_digits = X_train[representative_digit_idx]\n", "# 각 centroid와 가장 가깝기 때문에, 각 cluster를 대표하는 image입니다.\n", "X_representative_digits" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(4) 해당 images에 대하여 직접 라벨링을 합니다." ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(8, 2))\n", "for index, X_representative_digit in enumerate(X_representative_digits):\n", " plt.subplot(k // 10, 10, index + 1)\n", " plt.imshow(X_representative_digit.reshape(8, 8), cmap=\"binary\", interpolation=\"bilinear\")\n", " plt.axis('off')" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "y_representative_digits = np.array([\n", " 4, 8, 0, 6, 8, 3, 7, 7, 9, 2,\n", " 5, 5, 8, 5, 2, 1, 2, 9, 6, 1,\n", " 1, 6, 9, 0, 8, 3, 0, 7, 4, 1,\n", " 6, 5, 2, 4, 1, 8, 6, 3, 9, 2,\n", " 4, 2, 9, 4, 7, 6, 2, 3, 1, 1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(5) 해당 이미지를 이용하여 학습을 합니다." ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9244444444444444" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_representative_digits, y_representative_digits)\n", "log_reg.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(6) 만약에, 50개의 데이터를 이용하여, 같은 클러스터에 속하는 데이터에 라벨을 할당하면?" 
] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "y_train_propagated = np.empty(len(X_train), dtype=np.int32)\n", "for i in range(k):\n", " y_train_propagated[kmeans.labels_==i] = y_representative_digits[i]" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=42, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_train, y_train_propagated)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9288888888888889" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_reg.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Outlier에 취약하다. 따라서 성능의 큰 발전이 있지는 않다." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(7) 만약에 같은 클러스터에 속하는 데이터 전체에 라벨을 할당하는 것이 아니라, centroid에 가까운 데이터에 대해서만 라벨을 할당하면?" 
] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1347,)\n" ] }, { "data": { "text/plain": [ "array([30.3917992 , 20.3734662 , 15.08582969, ..., 19.36276495,\n", " 19.5626378 , 18.23619458])" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# For every training instance, pick the distance in the column of the\n", "# cluster that kmeans.labels_ assigned it to.\n", "X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]\n", "\n", "print(X_cluster_dist.shape)\n", "X_cluster_dist" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "percentile_closest = 20\n", "\n", "# Build a boolean mask instead of overwriting X_cluster_dist with a -1\n", "# sentinel: the distances displayed above stay valid and this cell can be\n", "# re-run without changing its result.\n", "partially_propagated = np.zeros(len(X_train), dtype=bool)\n", "for i in range(k):\n", "    # Instances assigned to cluster i.\n", "    in_cluster = (kmeans.labels_ == i)\n", "    # Distance at the percentile_closest-th percentile within this cluster.\n", "    cutoff_distance = np.percentile(X_cluster_dist[in_cluster], percentile_closest)\n", "    # Keep the cluster members whose distance does not exceed the cutoff.\n", "    above_cutoff = (X_cluster_dist > cutoff_distance)\n", "    partially_propagated |= in_cluster & ~above_cutoff" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "# X_train은 이전 예제에서 클러스터에 할당된 데이터들에 대해 클러스터가 부여되었습니다. 
그 중에서 상위 20%만 선택합니다.\n", "X_train_partially_propagated = X_train[partially_propagated]\n", "y_train_partially_propagated = y_train_propagated[partially_propagated]" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=42, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9422222222222222" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log_reg.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "상위 20%에 할당된 라벨의 accuracy는 상당히 높다." ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9896907216494846" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.mean(y_train_partially_propagated == y_train[partially_propagated])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }