{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "import matplotlib\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def distance(pt1, pt2):\n",
    "    \"\"\"Return the Euclidean distance between two points, represented as arrays.\"\"\"\n",
    "    # np.sum reduces the array inside NumPy instead of looping in Python.\n",
    "    return np.sqrt(np.sum((pt1 - pt2)**2))\n",
    "\n",
    "def row_distance(row1, row2):\n",
    "    \"\"\"Return the distance between two numerical rows of a table.\n",
    "\n",
    "    Each argument may be a table row or an array of numbers.\n",
    "    \"\"\"\n",
    "    # Wrapping in a list and taking element 0 yields a 1-D array either way.\n",
    "    return distance(np.array([row1])[0], np.array([row2])[0])\n",
    "\n",
    "def distances(training, example):\n",
    "    \"\"\"Compute the distance between example and every row in training.\n",
    "\n",
    "    The 'Class' column is dropped before measuring, so example must hold\n",
    "    only the attribute values. Return training augmented with a\n",
    "    'Distance' column.\n",
    "    \"\"\"\n",
    "    attributes = training.drop('Class')\n",
    "    # Gather the distances in a list and convert once; np.append inside a\n",
    "    # loop copies the whole array on every iteration.\n",
    "    row_distances = [row_distance(row, example) for row in attributes.rows]\n",
    "    return training.with_column('Distance', np.array(row_distances, dtype=float))\n",
    "\n",
    "def closest(training, example, k):\n",
    "    \"\"\"Return a table of the k closest neighbors to example.\n",
    "\n",
    "    If training has fewer than k rows, all of its rows are returned.\n",
    "    \"\"\"\n",
    "    by_distance = distances(training, example).sort('Distance')\n",
    "    return by_distance.take(np.arange(min(k, by_distance.num_rows)))\n",
    "\n",
    "def majority_class(topk):\n",
    "    \"\"\"Return the class with the highest count among the rows of topk.\"\"\"\n",
    "    return topk.group('Class').sort('count', descending=True).column(0).item(0)\n",
    "\n",
    "def classify(training, example, k):\n",
    "    \"\"\"Return the majority class among the k nearest neighbors of example.\"\"\"\n",
    "    return majority_class(closest(training, example, k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Data: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset\n",
    "#Class: Default payment 
(1=yes, 0=no)\n",
    "#LIMIT_BAL: Amount of given credit in NT dollars (i.e., New Taiwan dollar)-includes individual and family/supplementary credit\n",
    "# Scale:(0=pay duly, 1=payment delay for one month, \n",
    "### 2=payment delay for two months, ...\n",
    "### 8=payment delay for eight months, \n",
    "### 9=payment delay for nine months and above)\n",
    "# PAY_0: Repayment status in September, 2005 \n",
    "# PAY_2: Repayment status in August, 2005 (scale same as above)\n",
    "# PAY_3: Repayment status in July, 2005 (scale same as above)\n",
    "# PAY_4: Repayment status in June, 2005 (scale same as above)\n",
    "# PAY_5: Repayment status in May, 2005 (scale same as above)\n",
    "# PAY_6: Repayment status in April, 2005 (scale same as above)\n",
    "credit = Table.read_table('credit.csv')\n",
    "credit.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_payments = credit.drop('LIMIT_BAL')\n",
    "credit_payments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "example123 = credit_payments.drop('Class').row(123)\n",
    "example123"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classify(credit_payments.exclude(123), example123, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_payments.row(123)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_payments.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split in file order: rows 0-499 for training, rows 500-999 for testing.\n",
    "training_set = credit_payments.take(np.arange(500))\n",
    "test_set = credit_payments.take(np.arange(500, 1000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(training_set.num_rows)\n",
    "print(test_set.num_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_accuracy(training, test, k):\n",
    "    \"\"\"Return the proportion of correctly classified examples\n",
    "    in the test set.\n",
    "\n",
    "    Each row of test (without its 'Class' column) is classified by its\n",
    "    k nearest neighbors in training and compared with its actual class.\n",
    "    \"\"\"\n",
    "    test_attributes = test.drop('Class')\n",
    "    # Fetch the actual classes once instead of on every iteration.\n",
    "    actual_classes = test.column('Class')\n",
    "    num_correct = 0\n",
    "    for i in np.arange(test.num_rows):\n",
    "        predicted = classify(training, test_attributes.row(i), k)\n",
    "        num_correct = num_correct + (predicted == actual_classes.item(i))\n",
    "    return num_correct / test.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_accuracy(training_set, test_set, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "credit_payments.column('Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_set.group('Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_set.group('Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed NumPy's global random state, which Table.sample draws from, so the\n",
    "# shuffle and the accuracies below are the same on every Restart & Run All.\n",
    "np.random.seed(0)\n",
    "shuffled = credit_payments.sample(with_replacement=False)\n",
    "training_set = shuffled.take(np.arange(500))\n",
    "test_set = shuffled.take(np.arange(500, 1000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_accuracy(training_set, test_set, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_accuracy(training_set, test_set, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_accuracy(training_set, training_set, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_example = training_set.drop(\"Class\").row(0)\n",
    "new_training = training_set.exclude(0)\n",
    "training_example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "distances(new_training, np.array([training_example])[0]).sort(\"Distance\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}