{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# k-means" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## 1. k-means 란?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 참고: http://shabal.in/visuals/kmeans/2.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. scikit-learn을 활용한 k-means 수행" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- iris_data 사용\n", "- scikit-learn의 KMeans를 활용한 iris 전체 데이터에 대한 클러스터링\n", "- 각 인스턴스의 특징값만으로 클러스터링을 수행하여 그 결과를 kmeans.labels\\_ 에 클러스터링 인덱스를 ndarray 자료구조를 사용하여 표현함 " ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n", " 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0\n", " 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0\n", " 0 2]\n", "\n", "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2\n", " 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n", " 2 2]\n" ] } ], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import random\n", "from sklearn.cluster import KMeans\n", "from sklearn import datasets\n", "\n", "iris = datasets.load_iris()\n", "feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')\n", "num_feature = len(feature_names)\n", "\n", "kmeans = KMeans(n_clusters=3)\n", "kmeans.fit(iris.data)\n", "print kmeans.labels_\n", "print\n", "print iris.target" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 정확도 (Accuracy) 분석" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 실제 타겟 그룹 인덱스와 Kmeans에 의해 클러스터링된 그룹 인덱스에 차이가 발생하므로 두 인덱스 사이의 Match 관계를 만들어줌" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{0: [52, 77, 100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148], 1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 2: [50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 113, 114, 119, 121, 123, 126, 127, 133, 138, 142, 146, 149]}\n", "\n", "{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 1: [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 2: [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]}\n", "\n", "{0: 2, 1: 0, 2: 1}\n" ] } ], "source": [ "import math\n", "import operator\n", "\n", "def euclideanDistance(instance1, instance2):\n", " distance = 0\n", " for x in range(num_feature):\n", " distance += pow((instance1[x] - instance2[x]), 2)\n", " return math.sqrt(distance)\n", "\n", "def getGroupMatch(group1, group2):\n", " numGroupsOfGroups1 = len(np.unique(group1))\n", " numGroupsOfGroups2 = len(np.unique(group2))\n", " group1_dict = {}\n", " group2_dict = {}\n", " for i in range(numGroupsOfGroups1):\n", " group1_dict[i] = []\n", " for i in range(numGroupsOfGroups2):\n", " group2_dict[i] = [] \n", "\n", " index = 0\n", " for i in group1:\n", " group1_dict[i].append(index)\n", " index += 1\n", "\n", " index = 0\n", " for i in group2:\n", " group2_dict[i].append(index)\n", " index += 1\n", "\n", " group_match = {} ## actual_group_index : kmeans_group_index\n", " for i in range(len(group1_dict)):\n", " distance_set = []\n", " for j in range(len(group2_dict)):\n", " distance_set.append((j, euclideanDistance(group1_dict[i], group2_dict[j])))\n", " distance_set.sort(key = operator.itemgetter(1))\n", " group_match[i] = distance_set[0][0]\n", "\n", " return group1_dict, group2_dict, group_match\n", "\n", "group1_dict, group2_dict, group_match = getGroupMatch(kmeans.labels_, iris.target)\n", "print group1_dict\n", "print\n", "print group2_dict\n", "print\n", "print group_match" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 89.3333333333%\n" ] } ], "source": [ "def getAccuracy(group1, group2, numData):\n", " group1_dict, group2_dict, group_match = getGroupMatch(group1, group2)\n", " correct = 0.0\n", " for i in range(len(group1_dict)):\n", " for index in group1_dict[i]:\n", " if index in group2_dict[group_match[i]]:\n", " correct += 1.0\n", " return correct / float(numData) * 100.0\n", "\n", "accuacy = getAccuracy(kmeans.labels_, iris.target, len(iris.data))\n", "print('Accuracy: ' + str(accuacy) + '%')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. 훈련 데이터를 통한 학습과 테스트 데이터의 분류 검증" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 훈련 데이터와 테스트 데이터의 분리" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train: 94\n", "Test: 56\n", "\n", "[array([ 5.1, 3.5, 1.4, 0.2]), array([ 4.7, 3.2, 1.3, 0.2]), array([ 4.6, 3.1, 1.5, 0.2]), array([ 5. , 3.6, 1.4, 0.2]), array([ 5.4, 3.9, 1.7, 0.4]), array([ 4.6, 3.4, 1.4, 0.3]), array([ 5. , 3.4, 1.5, 0.2]), array([ 4.4, 2.9, 1.4, 0.2]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 4.8, 3.4, 1.6, 0.2]), array([ 4.8, 3. , 1.4, 0.1]), array([ 4.3, 3. , 1.1, 0.1]), array([ 5.8, 4. , 1.2, 0.2]), array([ 5.4, 3.9, 1.3, 0.4]), array([ 5.1, 3.5, 1.4, 0.3]), array([ 5.1, 3.8, 1.5, 0.3]), array([ 5.4, 3.4, 1.7, 0.2]), array([ 5.1, 3.7, 1.5, 0.4]), array([ 4.6, 3.6, 1. , 0.2]), array([ 5.1, 3.3, 1.7, 0.5]), array([ 5. , 3. , 1.6, 0.2]), array([ 5.2, 3.5, 1.5, 0.2]), array([ 5.5, 4.2, 1.4, 0.2]), array([ 5. , 3.2, 1.2, 0.2]), array([ 5. , 3.5, 1.3, 0.3]), array([ 5. , 3.5, 1.6, 0.6]), array([ 5.1, 3.8, 1.9, 0.4]), array([ 4.8, 3. , 1.4, 0.3]), array([ 5.1, 3.8, 1.6, 0.2]), array([ 5.3, 3.7, 1.5, 0.2]), array([ 7. , 3.2, 4.7, 1.4]), array([ 6.9, 3.1, 4.9, 1.5]), array([ 6.5, 2.8, 4.6, 1.5]), array([ 5.7, 2.8, 4.5, 1.3]), array([ 6.6, 2.9, 4.6, 1.3]), array([ 5.2, 2.7, 3.9, 1.4]), array([ 5. , 2. , 3.5, 1. ]), array([ 6. , 2.2, 4. , 1. ]), array([ 6.1, 2.9, 4.7, 1.4]), array([ 5.6, 2.9, 3.6, 1.3]), array([ 6.7, 3.1, 4.4, 1.4]), array([ 5.6, 3. , 4.5, 1.5]), array([ 5.8, 2.7, 4.1, 1. ]), array([ 5.9, 3.2, 4.8, 1.8]), array([ 6.3, 2.5, 4.9, 1.5]), array([ 6.1, 2.8, 4.7, 1.2]), array([ 6.6, 3. , 4.4, 1.4]), array([ 6. , 2.9, 4.5, 1.5]), array([ 5.5, 2.4, 3.8, 1.1]), array([ 5.5, 2.4, 3.7, 1. ]), array([ 5.8, 2.7, 3.9, 1.2]), array([ 5.4, 3. , 4.5, 1.5]), array([ 6. , 3.4, 4.5, 1.6]), array([ 6.7, 3.1, 4.7, 1.5]), array([ 5.5, 2.5, 4. , 1.3]), array([ 5.5, 2.6, 4.4, 1.2]), array([ 5.8, 2.6, 4. , 1.2]), array([ 5.6, 2.7, 4.2, 1.3]), array([ 5.7, 2.9, 4.2, 1.3]), array([ 6.2, 2.9, 4.3, 1.3]), array([ 5.1, 2.5, 3. , 1.1]), array([ 5.8, 2.7, 5.1, 1.9]), array([ 7.1, 3. , 5.9, 2.1]), array([ 6.3, 2.9, 5.6, 1.8]), array([ 6.5, 3. , 5.8, 2.2]), array([ 7.6, 3. , 6.6, 2.1]), array([ 4.9, 2.5, 4.5, 1.7]), array([ 6.7, 2.5, 5.8, 1.8]), array([ 6.5, 3.2, 5.1, 2. ]), array([ 6.4, 2.7, 5.3, 1.9]), array([ 6.8, 3. , 5.5, 2.1]), array([ 5.8, 2.8, 5.1, 2.4]), array([ 6.4, 3.2, 5.3, 2.3]), array([ 7.7, 2.6, 6.9, 2.3]), array([ 6. , 2.2, 5. , 1.5]), array([ 6.9, 3.2, 5.7, 2.3]), array([ 5.6, 2.8, 4.9, 2. ]), array([ 6.3, 2.7, 4.9, 1.8]), array([ 7.2, 3.2, 6. , 1.8]), array([ 6.1, 3. , 4.9, 1.8]), array([ 6.4, 2.8, 5.6, 2.1]), array([ 7.2, 3. , 5.8, 1.6]), array([ 7.9, 3.8, 6.4, 2. ]), array([ 6.4, 2.8, 5.6, 2.2]), array([ 6.1, 2.6, 5.6, 1.4]), array([ 7.7, 3. , 6.1, 2.3]), array([ 6.3, 3.4, 5.6, 2.4]), array([ 6. , 3. , 4.8, 1.8]), array([ 6.7, 3.1, 5.6, 2.4]), array([ 5.8, 2.7, 5.1, 1.9]), array([ 6.7, 3. , 5.2, 2.3]), array([ 6.5, 3. , 5.2, 2. ]), array([ 6.2, 3.4, 5.4, 2.3]), array([ 5.9, 3. , 5.1, 1.8])]\n", "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n", "\n", "[array([ 4.9, 3. , 1.4, 0.2]), array([ 5.4, 3.7, 1.5, 0.2]), array([ 5.7, 4.4, 1.5, 0.4]), array([ 5.7, 3.8, 1.7, 0.3]), array([ 4.8, 3.4, 1.9, 0.2]), array([ 5. , 3.4, 1.6, 0.4]), array([ 5.2, 3.4, 1.4, 0.2]), array([ 4.7, 3.2, 1.6, 0.2]), array([ 4.8, 3.1, 1.6, 0.2]), array([ 5.4, 3.4, 1.5, 0.4]), array([ 5.2, 4.1, 1.5, 0.1]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 5.5, 3.5, 1.3, 0.2]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 4.4, 3. , 1.3, 0.2]), array([ 5.1, 3.4, 1.5, 0.2]), array([ 4.5, 2.3, 1.3, 0.3]), array([ 4.4, 3.2, 1.3, 0.2]), array([ 4.6, 3.2, 1.4, 0.2]), array([ 5. , 3.3, 1.4, 0.2]), array([ 6.4, 3.2, 4.5, 1.5]), array([ 5.5, 2.3, 4. , 1.3]), array([ 6.3, 3.3, 4.7, 1.6]), array([ 4.9, 2.4, 3.3, 1. ]), array([ 5.9, 3. , 4.2, 1.5]), array([ 6.2, 2.2, 4.5, 1.5]), array([ 5.6, 2.5, 3.9, 1.1]), array([ 6.1, 2.8, 4. , 1.3]), array([ 6.4, 2.9, 4.3, 1.3]), array([ 6.8, 2.8, 4.8, 1.4]), array([ 6.7, 3. , 5. , 1.7]), array([ 5.7, 2.6, 3.5, 1. ]), array([ 6. , 2.7, 5.1, 1.6]), array([ 6.3, 2.3, 4.4, 1.3]), array([ 5.6, 3. , 4.1, 1.3]), array([ 6.1, 3. , 4.6, 1.4]), array([ 5. , 2.3, 3.3, 1. ]), array([ 5.7, 3. , 4.2, 1.2]), array([ 5.7, 2.8, 4.1, 1.3]), array([ 6.3, 3.3, 6. , 2.5]), array([ 7.3, 2.9, 6.3, 1.8]), array([ 7.2, 3.6, 6.1, 2.5]), array([ 5.7, 2.5, 5. , 2. ]), array([ 6.5, 3. , 5.5, 1.8]), array([ 7.7, 3.8, 6.7, 2.2]), array([ 7.7, 2.8, 6.7, 2. ]), array([ 6.7, 3.3, 5.7, 2.1]), array([ 6.2, 2.8, 4.8, 1.8]), array([ 7.4, 2.8, 6.1, 1.9]), array([ 6.3, 2.8, 5.1, 1.5]), array([ 6.4, 3.1, 5.5, 1.8]), array([ 6.9, 3.1, 5.4, 2.1]), array([ 6.9, 3.1, 5.1, 2.3]), array([ 6.8, 3.2, 5.9, 2.3]), array([ 6.7, 3.3, 5.7, 2.5]), array([ 6.3, 2.5, 5. , 1.9])]\n", "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n" ] } ], "source": [ "iris = datasets.load_iris()\n", "split = 0.66\n", "iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')\n", "\n", "def splitDataset(split, training_feature_set=[], training_target_set=[], test_feature_set=[], test_target_set=[]):\n", " for i in range(len(iris.data)):\n", " if random.random() < split:\n", " training_feature_set.append(iris.data[i])\n", " training_target_set.append(iris.target[i])\n", " else:\n", " test_feature_set.append(iris.data[i])\n", " test_target_set.append(iris.target[i])\n", " return training_feature_set, training_target_set, test_feature_set, test_target_set\n", "\n", "\n", "training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset(split)\n", "print 'Train: ' + str(len(training_feature_set))\n", "print 'Test: ' + str(len(test_feature_set))\n", "print\n", "print training_feature_set\n", "print training_target_set\n", "print\n", "print test_feature_set\n", "print test_target_set" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 훈련 데이터만 kmeans에 넣어 분류\n", "- 테스트 데이터를 kmeans.predict()에 넣어 kmeans_target_set 얻음\n", "- kmeans_target_set과 test_target_set과의 정확도 산출" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0]\n", "\n", "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n", "\n", "Accuracy: 83.6734693878%\n" ] } ], "source": [ "from sklearn.cluster import KMeans\n", "\n", "kmeans = KMeans(n_clusters=3)\n", "kmeans.fit(training_feature_set)\n", "kmeans_target_set = kmeans.predict(test_feature_set).tolist()\n", "print kmeans_target_set\n", "print\n", "print test_target_set\n", "print\n", "\n", "accuracy = getAccuracy(kmeans_target_set, test_target_set, len(test_target_set))\n", "print('Accuracy: ' + str(accuracy) + '%')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 전체 코드" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean Accuracy: 88.2586824684%\n" ] } ], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import random\n", "from sklearn.cluster import KMeans\n", "from sklearn import datasets\n", "\n", "iris = datasets.load_iris()\n", "split = 0.66\n", "iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')\n", "feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')\n", "num_feature = len(feature_names)\n", "\n", "def splitDataset(split, training_feature_set=[], training_target_set=[], test_feature_set=[], test_target_set=[]):\n", " for i in range(len(iris.data)):\n", " if random.random() < split:\n", " training_feature_set.append(iris.data[i])\n", " training_target_set.append(iris.target[i])\n", " else:\n", " test_feature_set.append(iris.data[i])\n", " test_target_set.append(iris.target[i])\n", " return training_feature_set, training_target_set, test_feature_set, test_target_set\n", "\n", "if __name__ == '__main__':\n", " num_trials = 3\n", " accuracy_sum = 0.0\n", "\n", " for i in range(num_trials):\n", " training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset(split)\n", " kmeans = KMeans(n_clusters=3)\n", " kmeans.fit(training_feature_set)\n", " kmeans_target_set = kmeans.predict(test_feature_set).tolist()\n", "\n", " accuracy = getAccuracy(kmeans_target_set, test_target_set, len(test_target_set))\n", " accuracy_sum += accuracy\n", " \n", " print('Mean Accuracy: ' + str(accuracy_sum / float(num_trials)) + '%')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Refererence" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }