{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# K-Nearest Neighbours on the Iris dataset\n",
    "\n",
    "A from-scratch, inverse-distance-weighted KNN classifier. The notebook loads the data, makes a stratified train/test split, scores the classifier for a range of `k` values and plots accuracy against `k`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import operator\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import StratifiedShuffleSplit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives here.\n",
    "DATA_PATH = r'../datasets/iris_data.csv'\n",
    "TEST_RATIO = 0.2         # fraction of rows held out for testing\n",
    "K_VALUES = range(1, 31)  # neighbourhood sizes to evaluate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading and splitting the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getDataset(loc):\n",
    "    \"\"\"Read the header-less CSV at ``loc`` into a DataFrame with named Iris columns.\n",
    "\n",
    "    The four measurement columns come first and the ``class`` label is last;\n",
    "    the KNN helpers in this notebook rely on that column order.\n",
    "    \"\"\"\n",
    "    columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']\n",
    "    return pd.read_csv(loc, header=None, names=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def splitDataset(dataset, ratio, random_state=42):\n",
    "    \"\"\"Return a stratified ``(train_data, test_data)`` split of ``dataset``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dataset : pandas.DataFrame\n",
    "        Must contain a ``class`` column; it is used for stratification.\n",
    "    ratio : float or int\n",
    "        Passed to ``StratifiedShuffleSplit`` as ``test_size``.\n",
    "    random_state : int, optional\n",
    "        Seed for the shuffle; the default keeps the split reproducible.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of pandas.DataFrame\n",
    "        The training rows and the test rows, with their original index labels.\n",
    "    \"\"\"\n",
    "    splitter = StratifiedShuffleSplit(n_splits=1, test_size=ratio, random_state=random_state)\n",
    "    train_index, test_index = next(splitter.split(dataset, dataset['class']))\n",
    "\n",
    "    # The splitter yields positional indices, so select with .iloc; label-based\n",
    "    # .loc is only correct by coincidence when the frame has a default RangeIndex.\n",
    "    return dataset.iloc[train_index], dataset.iloc[test_index]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KNN classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def euclideanDistance(instance1, instance2):\n",
    "    \"\"\"Return the Euclidean distance between two numeric vectors.\n",
    "\n",
    "    The squared differences are summed over the last axis, so a 2-D array of\n",
    "    rows may be passed as either argument to get one distance per row\n",
    "    (NumPy broadcasting applies).\n",
    "    \"\"\"\n",
    "    # Cast explicitly: rows taken from a mixed-type DataFrame arrive as object arrays.\n",
    "    point_a = np.atleast_1d(np.asarray(instance1, dtype=float))\n",
    "    point_b = np.atleast_1d(np.asarray(instance2, dtype=float))\n",
    "    return np.sqrt(np.sum((point_a - point_b) ** 2, axis=-1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getKNeighbors(training_set, test_instance, k, min_distance=1e-12):\n",
    "    \"\"\"Return inverse-distance vote totals from the ``k`` nearest training rows.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    training_set : pandas.DataFrame\n",
    "        Feature columns followed by the ``class`` label as the last column.\n",
    "    test_instance : sequence\n",
    "        Feature values followed by one trailing element (the label), which is ignored.\n",
    "    k : int\n",
    "        Number of nearest neighbours that vote.\n",
    "    min_distance : float, optional\n",
    "        Floor applied to each distance before inverting it, so a neighbour at\n",
    "        distance 0 contributes a large finite weight instead of ``inf``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Maps every class present in ``training_set`` to the sum of\n",
    "        ``1 / distance`` over its rows among the ``k`` nearest neighbours\n",
    "        (0 for classes with no such row).\n",
    "    \"\"\"\n",
    "    rows = training_set.values\n",
    "    labels = rows[:, -1]\n",
    "    features = rows[:, :-1].astype(float)\n",
    "\n",
    "    # One vectorised pass over the whole training matrix instead of a Python loop.\n",
    "    distances = euclideanDistance(features, test_instance[:-1])\n",
    "\n",
    "    # A stable sort keeps the original row order among equidistant neighbours.\n",
    "    nearest = np.argsort(distances, kind='stable')[:k]\n",
    "\n",
    "    inv_class_freq = {cls: 0 for cls in training_set['class'].unique()}\n",
    "    for idx in nearest:\n",
    "        inv_class_freq[labels[idx]] += 1.0 / max(distances[idx], min_distance)\n",
    "    return inv_class_freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getResponse(inv_freq):\n",
    "    \"\"\"Return the class with the largest accumulated vote in ``inv_freq``.\n",
    "\n",
    "    Ties go to the class that appears first in the dictionary.\n",
    "    \"\"\"\n",
    "    return max(inv_freq, key=inv_freq.get)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getAccuracy(testSet, predictions):\n",
    "    \"\"\"Return the percentage (0-100) of ``testSet`` rows whose ``class`` equals the prediction.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    testSet : pandas.DataFrame\n",
    "        Must contain a ``class`` column holding the true labels.\n",
    "    predictions : sequence\n",
    "        One predicted label per row of ``testSet``, in row order.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``testSet`` is empty or the two inputs differ in length.\n",
    "    \"\"\"\n",
    "    total = len(testSet)\n",
    "    if total == 0:\n",
    "        raise ValueError('testSet is empty; accuracy is undefined')\n",
    "    if len(predictions) != total:\n",
    "        raise ValueError(\n",
    "            f'testSet has {total} rows but {len(predictions)} predictions were given'\n",
    "        )\n",
    "\n",
    "    actual = np.asarray(testSet['class'], dtype=object)\n",
    "    predicted = np.asarray(predictions, dtype=object)\n",
    "    correct = int(np.count_nonzero(actual == predicted))\n",
    "    return (correct / total) * 100.0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Accuracy for each `k`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == '__main__':\n",
    "    # Load the data and carve out a stratified hold-out set.\n",
    "    data = getDataset(DATA_PATH)\n",
    "    trainingSet, testSet = (\n",
    "        part.reset_index(drop=True) for part in splitDataset(data, TEST_RATIO)\n",
    "    )\n",
    "\n",
    "    # Score the classifier on the hold-out set for every candidate k.\n",
    "    accuracy_scores = []\n",
    "    for k in K_VALUES:\n",
    "        predictions = [\n",
    "            getResponse(getKNeighbors(trainingSet, test_row, k))\n",
    "            for test_row in testSet.values\n",
    "        ]\n",
    "        accuracy_scores.append(getAccuracy(testSet, predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k_values = list(K_VALUES)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 10))\n",
    "ax.plot(k_values, accuracy_scores)\n",
    "ax.scatter(k_values, accuracy_scores, label='Accuracy')\n",
    "ax.set_title('K vs Accuracy')\n",
    "ax.set_xticks(k_values)\n",
    "ax.set_xlabel('K')\n",
    "ax.set_ylabel('Accuracy', rotation=0)\n",
    "ax.legend();"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}