{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# k-Nearest Neighbors (kNN)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## 1. kNN 이란?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 참고: https://www.youtube.com/watch?v=UqYde-LULfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. 데이터 다루기 (Data Handling)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1) 데이터 로딩하기\n", "- Iris (붓꽃) 데이터 로딩" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sepal length sepal width petal length petal width class\n", "0 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5.0 3.6 1.4 0.2 Iris-setosa\n", "5 5.4 3.9 1.7 0.4 Iris-setosa\n", "6 4.6 3.4 1.4 0.3 Iris-setosa\n", "7 5.0 3.4 1.5 0.2 Iris-setosa\n", "8 4.4 2.9 1.4 0.2 Iris-setosa\n", "9 4.9 3.1 1.5 0.1 Iris-setosa\n", "10 5.4 3.7 1.5 0.2 Iris-setosa\n", "11 4.8 3.4 1.6 0.2 Iris-setosa\n", "12 4.8 3.0 1.4 0.1 Iris-setosa\n", "13 4.3 3.0 1.1 0.1 Iris-setosa\n", "14 5.8 4.0 1.2 0.2 Iris-setosa\n", "15 5.7 4.4 1.5 0.4 Iris-setosa\n", "16 5.4 3.9 1.3 0.4 Iris-setosa\n", "17 5.1 3.5 1.4 0.3 Iris-setosa\n", "18 5.7 3.8 1.7 0.3 Iris-setosa\n", "19 5.1 3.8 1.5 0.3 Iris-setosa\n", "20 5.4 3.4 1.7 0.2 Iris-setosa\n", "21 5.1 3.7 1.5 0.4 Iris-setosa\n", "22 4.6 3.6 1.0 0.2 Iris-setosa\n", "23 5.1 3.3 1.7 0.5 Iris-setosa\n", "24 4.8 3.4 1.9 0.2 Iris-setosa\n", "25 5.0 3.0 1.6 0.2 Iris-setosa\n", "26 5.0 3.4 1.6 0.4 Iris-setosa\n", "27 5.2 3.5 1.5 0.2 Iris-setosa\n", "28 5.2 3.4 1.4 0.2 Iris-setosa\n", "29 4.7 3.2 1.6 0.2 Iris-setosa\n", ".. ... ... ... ... 
...\n", "120 6.9 3.2 5.7 2.3 Iris-virginica\n", "121 5.6 2.8 4.9 2.0 Iris-virginica\n", "122 7.7 2.8 6.7 2.0 Iris-virginica\n", "123 6.3 2.7 4.9 1.8 Iris-virginica\n", "124 6.7 3.3 5.7 2.1 Iris-virginica\n", "125 7.2 3.2 6.0 1.8 Iris-virginica\n", "126 6.2 2.8 4.8 1.8 Iris-virginica\n", "127 6.1 3.0 4.9 1.8 Iris-virginica\n", "128 6.4 2.8 5.6 2.1 Iris-virginica\n", "129 7.2 3.0 5.8 1.6 Iris-virginica\n", "130 7.4 2.8 6.1 1.9 Iris-virginica\n", "131 7.9 3.8 6.4 2.0 Iris-virginica\n", "132 6.4 2.8 5.6 2.2 Iris-virginica\n", "133 6.3 2.8 5.1 1.5 Iris-virginica\n", "134 6.1 2.6 5.6 1.4 Iris-virginica\n", "135 7.7 3.0 6.1 2.3 Iris-virginica\n", "136 6.3 3.4 5.6 2.4 Iris-virginica\n", "137 6.4 3.1 5.5 1.8 Iris-virginica\n", "138 6.0 3.0 4.8 1.8 Iris-virginica\n", "139 6.9 3.1 5.4 2.1 Iris-virginica\n", "140 6.7 3.1 5.6 2.4 Iris-virginica\n", "141 6.9 3.1 5.1 2.3 Iris-virginica\n", "142 5.8 2.7 5.1 1.9 Iris-virginica\n", "143 6.8 3.2 5.9 2.3 Iris-virginica\n", "144 6.7 3.3 5.7 2.5 Iris-virginica\n", "145 6.7 3.0 5.2 2.3 Iris-virginica\n", "146 6.3 2.5 5.0 1.9 Iris-virginica\n", "147 6.5 3.0 5.2 2.0 Iris-virginica\n", "148 6.2 3.4 5.4 2.3 Iris-virginica\n", "149 5.9 3.0 5.1 1.8 Iris-virginica\n", "\n", "[150 rows x 5 columns]\n" ] } ], "source": [ "import urllib2\n", "import json\n", "from scipy import stats\n", "from pandas import Series, DataFrame\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "%matplotlib inline\n", "\n", "path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'\n", "raw_csv = urllib2.urlopen(path)\n", "feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')\n", "all_names = feature_names + ('class',)\n", "df = pd.read_csv(raw_csv, names=all_names)\n", "print df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2) 학습 데이터와 검증 데이터 분리하기" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Train: 98 - ratio: 0.653333333333\n", "Test: 52 - ratio: 0.346666666667\n" ] } ], "source": [ "import random\n", "\n", "def splitDataset(split, df, training_set=[], test_set=[]):\n", " for i in range(len(df)):\n", " if random.random() < split:\n", " training_set.append(df.ix[i])\n", " else:\n", " test_set.append(df.ix[i])\n", " return training_set, test_set \n", "\n", "split = 0.66\n", "training_set, test_set = splitDataset(split, df)\n", "print 'Train: ' + str(len(training_set)) + \" - ratio: \" + str(float(len(training_set))/len(df))\n", "print 'Test: ' + str(len(test_set)) + \" - ratio: \" + str(float(len(test_set))/len(df))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. 유사도 (Similarity) 정의" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 유사도 정의\n", " - Euclidean Distance (https://en.wikipedia.org/wiki/Euclidean_distance)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sepal length sepal width petal length petal width\n", "0 5.1 3.5 1.4 0.2\n", "1 4.9 3.0 1.4 0.2\n", "2 4.7 3.2 1.3 0.2\n", "3 4.6 3.1 1.5 0.2\n", "4 5.0 3.6 1.4 0.2\n", "\n", "Distance: 0.538516480713\n" ] } ], "source": [ "num_feature = len(feature_names)\n", "\n", "import math\n", "def euclideanDistance(instance1, instance2):\n", " distance = 0\n", " for x in range(num_feature):\n", " distance += pow((instance1[x] - instance2[x]), 2)\n", " return math.sqrt(distance)\n", "\n", "df_feature = df.drop('class', axis=1)\n", "print df_feature.head()\n", "print \n", "\n", "distance = euclideanDistance(df_feature.ix[0], df_feature.ix[1])\n", "print 'Distance: ' + str(distance)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 
k-이웃 (k-Neighbors) 찾기 " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 테스트 집합내 임의의 인스턴스(test_instance)에 대하여 훈련 데이터 집합(training_set)내에서 유사도가 높은 k개의 인스턴스 찾기" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sepal length 4.9\n", "sepal width 3\n", "petal length 1.4\n", "petal width 0.2\n", "class Iris-setosa\n", "Name: 1, dtype: object\n", "\n", "[sepal length 4.8\n", "sepal width 3\n", "petal length 1.4\n", "petal width 0.1\n", "class Iris-setosa\n", "Name: 12, dtype: object]\n" ] } ], "source": [ "import operator \n", "def getNeighbors(training_set, test_instance, k):\n", " distances = []\n", " for i in range(len(training_set)):\n", " dist = euclideanDistance(training_set[i], test_instance)\n", " distances.append((training_set[i], dist))\n", " distances.sort(key=operator.itemgetter(1))\n", " neighbors = []\n", " for i in range(k):\n", " neighbors.append(distances[i][0])\n", " return neighbors\n", "\n", "print test_set[0]\n", "print \n", "\n", "k = 1\n", "neighbors = getNeighbors(training_set, test_set[0], k)\n", "print neighbors" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sepal length 4.8\n", "sepal width 3\n", "petal length 1.4\n", "petal width 0.1\n", "class Iris-setosa\n", "Name: 12, dtype: object\n", "\n", "\n", "\n", "Iris-setosa\n" ] } ], "source": [ "print neighbors[0]\n", "print\n", "print type(neighbors[0])\n", "print\n", "print neighbors[0][-1]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[sepal length 4.8\n", "sepal width 3\n", "petal length 1.4\n", "petal width 0.1\n", "class Iris-setosa\n", "Name: 12, dtype: object, sepal length 4.9\n", "sepal width 3.1\n", "petal length 1.5\n", "petal width 0.1\n", "class 
Iris-setosa\n", "Name: 9, dtype: object, sepal length 4.9\n", "sepal width 3.1\n", "petal length 1.5\n", "petal width 0.1\n", "class Iris-setosa\n", "Name: 34, dtype: object]\n" ] } ], "source": [ "k = 3\n", "neighbors = getNeighbors(training_set, test_set[0], k)\n", "print(neighbors)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. 분류하기 (Classify)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 테스트 집합(test_set)내 임의의 인스턴스(test_instance)에 대하여...\n", "- 훈련 데이터 집합(training_set)내에서 유사도가 높은 k개의 인스턴스의 분류 중 가장 빈도수가 높은 분류를 해당 인스턴스(test_instance)의 분류로 정하기" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Classified: Iris-setosa - Actual: Iris-setosa\n" ] } ], "source": [ "def classify(neighbors):\n", " class_frequency = {}\n", " for i in range(len(neighbors)):\n", " class_name = neighbors[i][-1]\n", " if class_name in class_frequency:\n", " class_frequency[class_name] += 1\n", " else:\n", " class_frequency[class_name] = 1\n", " sorted_class_frequency = sorted(class_frequency.iteritems(), key=operator.itemgetter(1), reverse=True)\n", " return sorted_class_frequency[0][0]\n", "\n", "k = 3\n", "neighbors = getNeighbors(training_set, test_set[0], k)\n", "\n", "classified_class_name = classify(neighbors)\n", "print \"Classified:\", classified_class_name, \"- Actual:\", test_set[0]['class']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. 
전체 테스트 집합에 대해 분류 및 정확도 (Accuracy) 평가" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-setosa, Actual:Iris-setosa\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, 
Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-versicolor, Actual:Iris-versicolor\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-versicolor, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-virginica, Actual:Iris-virginica\n", "Classified:Iris-versicolor, Actual:Iris-virginica\n", "\n", "Accuracy: 96.1538461538%\n" ] } ], "source": [ "k = 3\n", "classified_class_names=[]\n", "for i in range(len(test_set)):\n", " neighbors = getNeighbors(training_set, test_set[i], k)\n", " result = classify(neighbors)\n", " classified_class_names.append(result)\n", " print('Classified:' + result + ', Actual:' + test_set[i][-1])\n", "\n", "correct = 0.0\n", "for i in range(len(test_set)):\n", " if classified_class_names[i] == test_set[i][-1]:\n", " correct += 1.0\n", "\n", "print\n", "print('Accuracy: ' + str(correct / float(len(test_set)) * 100.0) + '%')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. 
kNN 분류 전체 코드" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 중간 과정의 테스트 코드 삭제\n", "\n", "- 보다 정확한 정확도 측정을 위하여 전체적으로 num_trials번의 테스트 후 평균 정확도 산출 " ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean Accuracy: 98.6579403572%\n" ] } ], "source": [ "import urllib2\n", "import json\n", "from scipy import stats\n", "from pandas import Series, DataFrame\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import random\n", "import math\n", "import operator\n", "\n", "def splitDataset(split, df, training_set=[] , test_set=[]):\n", " for i in range(len(df)):\n", " if random.random() < split:\n", " training_set.append(df.ix[i])\n", " else:\n", " test_set.append(df.ix[i])\n", " return training_set, test_set \n", "\n", "def euclideanDistance(instance1, instance2):\n", " distance = 0\n", " for x in range(num_feature):\n", " distance += pow((instance1[x] - instance2[x]), 2)\n", " return math.sqrt(distance)\n", "\n", "def getNeighbors(training_set, test_instance, k):\n", " distances = []\n", " for i in range(len(training_set)):\n", " dist = euclideanDistance(training_set[i], test_instance)\n", " distances.append((training_set[i], dist))\n", " distances.sort(key=operator.itemgetter(1))\n", " neighbors = []\n", " for i in range(k):\n", " neighbors.append(distances[i][0])\n", " return neighbors\n", "\n", "def classify(neighbors):\n", " class_frequency = {}\n", " for i in range(len(neighbors)):\n", " class_name = neighbors[i][-1]\n", " if class_name in class_frequency:\n", " class_frequency[class_name] += 1\n", " else:\n", " class_frequency[class_name] = 1\n", " sorted_class_frequency = sorted(class_frequency.iteritems(), key=operator.itemgetter(1), reverse=True)\n", " return sorted_class_frequency[0][0]\n", "\n", "if __name__ == '__main__':\n", " path = 
'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'\n", " raw_csv = urllib2.urlopen(path)\n", " feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')\n", " iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')\n", " all_names = feature_names + ('class',)\n", " df = pd.read_csv(raw_csv, names=all_names)\n", " df_feature = df.drop('class', axis=1)\n", " num_feature = len(feature_names)\n", " split = 0.66\n", " k = 3\n", " num_trials = 3\n", " accuracy_sum = 0.0\n", "\n", " for i in range(num_trials):\n", " training_set, test_set = splitDataset(split, df)\n", " classified_class_names=[]\n", " for i in range(len(test_set)):\n", " neighbors = getNeighbors(training_set, test_set[i], k)\n", " result = classify(neighbors)\n", " classified_class_names.append(result)\n", "\n", " correct = 0.0\n", " for i in range(len(test_set)):\n", " if test_set[i][-1] == classified_class_names[i]:\n", " correct += 1.0\n", "\n", " accuracy_sum += (correct / float(len(test_set))) * 100.0\n", "\n", " print('Mean Accuracy: ' + str(accuracy_sum / float(num_trials)) + '%')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. scikit-learn을 활용한 kNN 수행" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- sklearn.datasets.load_iris()를 제공하여 iris 데이터를 편하게 로드할 수 있음" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 5.1 3.5 1.4 0.2]\n", " [ 4.9 3. 1.4 0.2]\n", " [ 4.7 3.2 1.3 0.2]\n", " [ 4.6 3.1 1.5 0.2]\n", " [ 5. 
3.6 1.4 0.2]]\n", "[0 0 0 0 0]\n" ] } ], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn import neighbors, datasets\n", "\n", "iris = datasets.load_iris()\n", "print iris.data[0:5]\n", "print iris.target[0:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- datasets.load_iris()가 반환한 데이터 형태에 맞게 훈련 데이터와 테스트 데이터 구분" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train: 99 - ratio: 0.66\n", "Test: 51 - ratio: 0.34\n", "\n", "[array([ 5.1, 3.5, 1.4, 0.2]), array([ 4.9, 3. , 1.4, 0.2]), array([ 4.7, 3.2, 1.3, 0.2]), array([ 5.4, 3.9, 1.7, 0.4]), array([ 4.4, 2.9, 1.4, 0.2]), array([ 4.8, 3.4, 1.6, 0.2]), array([ 4.8, 3. , 1.4, 0.1]), array([ 4.3, 3. , 1.1, 0.1]), array([ 5.8, 4. , 1.2, 0.2]), array([ 5.7, 4.4, 1.5, 0.4]), array([ 5.4, 3.9, 1.3, 0.4]), array([ 5.1, 3.5, 1.4, 0.3]), array([ 5.7, 3.8, 1.7, 0.3]), array([ 5.1, 3.8, 1.5, 0.3]), array([ 5.4, 3.4, 1.7, 0.2]), array([ 5.1, 3.7, 1.5, 0.4]), array([ 4.6, 3.6, 1. , 0.2]), array([ 5.1, 3.3, 1.7, 0.5]), array([ 4.8, 3.4, 1.9, 0.2]), array([ 5. , 3. , 1.6, 0.2]), array([ 5.2, 3.5, 1.5, 0.2]), array([ 4.7, 3.2, 1.6, 0.2]), array([ 5.4, 3.4, 1.5, 0.4]), array([ 5.5, 4.2, 1.4, 0.2]), array([ 5. , 3.2, 1.2, 0.2]), array([ 5.5, 3.5, 1.3, 0.2]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 4.4, 3. , 1.3, 0.2]), array([ 5.1, 3.4, 1.5, 0.2]), array([ 4.5, 2.3, 1.3, 0.3]), array([ 5.1, 3.8, 1.6, 0.2]), array([ 5.3, 3.7, 1.5, 0.2]), array([ 6.4, 3.2, 4.5, 1.5]), array([ 5.5, 2.3, 4. , 1.3]), array([ 6.5, 2.8, 4.6, 1.5]), array([ 5.7, 2.8, 4.5, 1.3]), array([ 6.3, 3.3, 4.7, 1.6]), array([ 4.9, 2.4, 3.3, 1. ]), array([ 5.2, 2.7, 3.9, 1.4]), array([ 5. , 2. , 3.5, 1. ]), array([ 5.9, 3. , 4.2, 1.5]), array([ 6. , 2.2, 4. , 1. ]), array([ 6.1, 2.9, 4.7, 1.4]), array([ 5.6, 2.9, 3.6, 1.3]), array([ 6.7, 3.1, 4.4, 1.4]), array([ 5.8, 2.7, 4.1, 1. 
]), array([ 6.2, 2.2, 4.5, 1.5]), array([ 5.6, 2.5, 3.9, 1.1]), array([ 6.1, 2.8, 4. , 1.3]), array([ 6.1, 2.8, 4.7, 1.2]), array([ 6.4, 2.9, 4.3, 1.3]), array([ 6.8, 2.8, 4.8, 1.4]), array([ 6.7, 3. , 5. , 1.7]), array([ 5.7, 2.6, 3.5, 1. ]), array([ 5.5, 2.4, 3.8, 1.1]), array([ 5.5, 2.4, 3.7, 1. ]), array([ 6. , 2.7, 5.1, 1.6]), array([ 6. , 3.4, 4.5, 1.6]), array([ 6.7, 3.1, 4.7, 1.5]), array([ 6.3, 2.3, 4.4, 1.3]), array([ 5.5, 2.5, 4. , 1.3]), array([ 5.5, 2.6, 4.4, 1.2]), array([ 5.6, 2.7, 4.2, 1.3]), array([ 5.7, 3. , 4.2, 1.2]), array([ 5.7, 2.9, 4.2, 1.3]), array([ 6.2, 2.9, 4.3, 1.3]), array([ 5.1, 2.5, 3. , 1.1]), array([ 5.7, 2.8, 4.1, 1.3]), array([ 5.8, 2.7, 5.1, 1.9]), array([ 6.3, 2.9, 5.6, 1.8]), array([ 7.6, 3. , 6.6, 2.1]), array([ 6.7, 2.5, 5.8, 1.8]), array([ 7.2, 3.6, 6.1, 2.5]), array([ 6.4, 2.7, 5.3, 1.9]), array([ 6.8, 3. , 5.5, 2.1]), array([ 5.8, 2.8, 5.1, 2.4]), array([ 6.5, 3. , 5.5, 1.8]), array([ 6. , 2.2, 5. , 1.5]), array([ 6.9, 3.2, 5.7, 2.3]), array([ 5.6, 2.8, 4.9, 2. ]), array([ 7.7, 2.8, 6.7, 2. ]), array([ 6.3, 2.7, 4.9, 1.8]), array([ 7.2, 3.2, 6. , 1.8]), array([ 6.2, 2.8, 4.8, 1.8]), array([ 6.4, 2.8, 5.6, 2.1]), array([ 7.2, 3. , 5.8, 1.6]), array([ 7.4, 2.8, 6.1, 1.9]), array([ 6.4, 2.8, 5.6, 2.2]), array([ 6.3, 2.8, 5.1, 1.5]), array([ 6.1, 2.6, 5.6, 1.4]), array([ 6.3, 3.4, 5.6, 2.4]), array([ 6.9, 3.1, 5.4, 2.1]), array([ 6.9, 3.1, 5.1, 2.3]), array([ 5.8, 2.7, 5.1, 1.9]), array([ 6.7, 3. , 5.2, 2.3]), array([ 6.3, 2.5, 5. , 1.9]), array([ 6.5, 3. , 5.2, 2. ]), array([ 6.2, 3.4, 5.4, 2.3]), array([ 5.9, 3. , 5.1, 1.8])]\n", "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n", "\n", "[array([ 4.6, 3.1, 1.5, 0.2]), array([ 5. 
, 3.6, 1.4, 0.2]), array([ 4.6, 3.4, 1.4, 0.3]), array([ 5. , 3.4, 1.5, 0.2]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 5.4, 3.7, 1.5, 0.2]), array([ 5. , 3.4, 1.6, 0.4]), array([ 5.2, 3.4, 1.4, 0.2]), array([ 4.8, 3.1, 1.6, 0.2]), array([ 5.2, 4.1, 1.5, 0.1]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 5. , 3.5, 1.3, 0.3]), array([ 4.4, 3.2, 1.3, 0.2]), array([ 5. , 3.5, 1.6, 0.6]), array([ 5.1, 3.8, 1.9, 0.4]), array([ 4.8, 3. , 1.4, 0.3]), array([ 4.6, 3.2, 1.4, 0.2]), array([ 5. , 3.3, 1.4, 0.2]), array([ 7. , 3.2, 4.7, 1.4]), array([ 6.9, 3.1, 4.9, 1.5]), array([ 6.6, 2.9, 4.6, 1.3]), array([ 5.6, 3. , 4.5, 1.5]), array([ 5.9, 3.2, 4.8, 1.8]), array([ 6.3, 2.5, 4.9, 1.5]), array([ 6.6, 3. , 4.4, 1.4]), array([ 6. , 2.9, 4.5, 1.5]), array([ 5.8, 2.7, 3.9, 1.2]), array([ 5.4, 3. , 4.5, 1.5]), array([ 5.6, 3. , 4.1, 1.3]), array([ 6.1, 3. , 4.6, 1.4]), array([ 5.8, 2.6, 4. , 1.2]), array([ 5. , 2.3, 3.3, 1. ]), array([ 6.3, 3.3, 6. , 2.5]), array([ 7.1, 3. , 5.9, 2.1]), array([ 6.5, 3. , 5.8, 2.2]), array([ 4.9, 2.5, 4.5, 1.7]), array([ 7.3, 2.9, 6.3, 1.8]), array([ 6.5, 3.2, 5.1, 2. ]), array([ 5.7, 2.5, 5. , 2. ]), array([ 6.4, 3.2, 5.3, 2.3]), array([ 7.7, 3.8, 6.7, 2.2]), array([ 7.7, 2.6, 6.9, 2.3]), array([ 6.7, 3.3, 5.7, 2.1]), array([ 6.1, 3. , 4.9, 1.8]), array([ 7.9, 3.8, 6.4, 2. ]), array([ 7.7, 3. , 6.1, 2.3]), array([ 6.4, 3.1, 5.5, 1.8]), array([ 6. , 3. 
, 4.8, 1.8]), array([ 6.7, 3.1, 5.6, 2.4]), array([ 6.8, 3.2, 5.9, 2.3]), array([ 6.7, 3.3, 5.7, 2.5])]\n", "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n" ] } ], "source": [ "import random\n", "\n", "def splitDataset2(split, data, training_feature_set=[], training_target_set=[], test_feature_set=[], test_target_set=[]):\n", " for i in range(len(data)):\n", " if random.random() < split:\n", " training_feature_set.append(iris.data[i])\n", " training_target_set.append(iris.target[i])\n", " else:\n", " test_feature_set.append(iris.data[i])\n", " test_target_set.append(iris.target[i])\n", " return training_feature_set, training_target_set, test_feature_set, test_target_set\n", "\n", "split = 0.66\n", "training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset2(split, iris.data)\n", "print 'Train: ' + str(len(training_feature_set)) + \" - ratio: \" + str(float(len(training_feature_set))/len(df))\n", "print 'Test: ' + str(len(test_feature_set)) + \" - ratio: \" + str(float(len(test_feature_set))/len(df))\n", "print\n", "print training_feature_set\n", "print training_target_set\n", "print\n", "print test_feature_set\n", "print test_target_set" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- knn.fit(훈련 속성 데이터 집합, 훈련 분류 데이터 집합)을 통하여 knn 알고리즘 훈련 및 모델 형성\n", "- knn.predict(테스트 속성 데이터)를 통해 테스트 데이터의 분류 결과를 얻어옴" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Classified:Iris-setosa, Actual:Iris-setosa\n" ] } ], "source": [ "k = 3\n", "knn = neighbors.KNeighborsClassifier(k)\n", "knn.fit(training_feature_set, training_target_set)\n", "result_index = knn.predict(test_feature_set[0])\n", "print('Classified:' + iris_names[result_index] + ', Actual:' + iris_names[test_target_set[0]])" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "- 전체 코드" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean Accuracy: 97.1169784284%\n" ] } ], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn import neighbors, datasets\n", "\n", "iris = datasets.load_iris()\n", " \n", "def splitDataset2(split, data, training_feature_set=[], training_target_set=[], test_feature_set=[], test_target_set=[]):\n", " for i in range(len(data)):\n", " if random.random() < split:\n", " training_feature_set.append(iris.data[i])\n", " training_target_set.append(iris.target[i])\n", " else:\n", " test_feature_set.append(iris.data[i])\n", " test_target_set.append(iris.target[i])\n", " return training_feature_set, training_target_set, test_feature_set, test_target_set\n", "\n", "if __name__ == '__main__':\n", " feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')\n", " iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')\n", " all_names = feature_names + ('class',)\n", " num_feature = len(feature_names)\n", " split = 0.66\n", " k = 3\n", " num_trials = 3\n", " accuracy_sum = 0.0\n", "\n", " for i in range(num_trials):\n", " training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset2(split, iris.data)\n", " knn = neighbors.KNeighborsClassifier(k)\n", " knn.fit(training_feature_set, training_target_set)\n", " \n", " classified_class_names=[]\n", " for i in range(len(test_feature_set)):\n", " result_index = knn.predict(test_feature_set[i])\n", " classified_class_names.append(iris_names[result_index])\n", " \n", " correct = 0.0\n", " for i in range(len(test_feature_set)):\n", " if iris_names[test_target_set[i]] == classified_class_names[i]:\n", " correct += 1.0\n", "\n", " accuracy_sum += (correct / float(len(test_feature_set))) * 100.0\n", "\n", " print('Mean Accuracy: ' + str(accuracy_sum / 
float(num_trials)) + '%')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 8. Reference" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- http://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/\n", "- http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html" ] } ], "metadata": { "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 0 }