{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "import matplotlib\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def distance(pt1, pt2):\n",
    "    \"\"\"Return the distance between two points, represented as arrays\"\"\"\n",
    "    return np.sqrt(sum((pt1 - pt2)**2))\n",
    "\n",
    "def row_distance(row1, row2):\n",
    "    \"\"\"Return the distance between two numerical rows of a table\"\"\"\n",
    "    return distance(np.array([row1])[0], np.array([row2])[0])\n",
    "\n",
    "def distances(training, example):\n",
    "    \"\"\"Compute distance between example and every row in training.\n",
    "    Return training augmented with Distance column\"\"\"\n",
    "    distances = make_array()\n",
    "    attributes = training.drop('Class')\n",
    "    for row in attributes.rows:\n",
    "        distances = np.append(distances, row_distance(row, example))\n",
    "    return training.with_column('Distance', distances)\n",
    "\n",
    "def closest(training, example, k):\n",
    "    \"\"\"Return a table of the k closest neighbors to example\"\"\"\n",
    "    return distances(training, example).sort('Distance').take(np.arange(k))\n",
    "\n",
    "def majority_class(topk):\n",
    "    \"\"\"Return the class with the highest count\"\"\"\n",
    "    return topk.group('Class').sort('count', descending=True).column(0).item(0)\n",
    "\n",
    "def classify(training, example, k):\n",
    "    \"Return the majority class among the k nearest neighbors of example\"\n",
    "    return majority_class(closest(training, example, k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Data:  https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset\n",
    "#Class:  Default payment (1=yes, 0=no)\n",
    "#LIMIT_BAL: Amount of given credit in NT dollars (i.e., New Taiwan dollar)-includes individual and family/supplementary credit\n",
    "# Scale:(0=pay duly, 1=payment delay for one month, \n",
    "### 2=payment delay for two months, ...\n",
    "### 8=payment delay for eight months, \n",
    "### 9=payment delay for nine months and above)\n",
    "# PAY_0: Repayment status in September, 2005 \n",
    "# PAY_2: Repayment status in August, 2005 (scale same as above)\n",
    "# PAY_3: Repayment status in July, 2005 (scale same as above)\n",
    "# PAY_4: Repayment status in June, 2005 (scale same as above)\n",
    "# PAY_5: Repayment status in May, 2005 (scale same as above)\n",
    "# PAY_6: Repayment status in April, 2005 (scale same as above)\n",
    "credit = Table.read_table('credit.csv')\n",
    "credit.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_payments = credit.drop('LIMIT_BAL')\n",
    "credit_payments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "example123 = credit_payments.drop('Class').row(123)\n",
    "example123"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classify(credit_payments.exclude(123), example123, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_payments.row(123)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_payments.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_set = credit_payments.take(np.arange(500))\n",
    "test_set = credit_payments.take(np.arange(500, 1000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(training_set.num_rows)\n",
    "print(test_set.num_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_accuracy(training, test, k):\n",
    "    \"\"\"Return the proportion of correctly classified examples \n",
    "    in the test set\"\"\"\n",
    "    test_attributes = test.drop('Class')\n",
    "    num_correct = 0\n",
    "    for i in np.arange(test.num_rows):\n",
    "        c = classify(training, test_attributes.row(i), k)\n",
    "        num_correct = num_correct + (c == test.column('Class').item(i))\n",
    "    return num_correct / test.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_accuracy(training_set, test_set, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "credit_payments.column('Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_set.group('Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_set.group('Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shuffled = credit_payments.sample(with_replacement=False)\n",
    "training_set = shuffled.take(np.arange(500))\n",
    "test_set = shuffled.take(np.arange(500, 1000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_accuracy(training_set, test_set, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_accuracy(training_set, test_set, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_accuracy(training_set, training_set, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_example = training_set.drop(\"Class\").row(0)\n",
    "new_training = training_set.exclude(0)\n",
    "training_example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "distances(new_training, np.array([training_example])[0]).sort(\"Distance\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}