{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "import matplotlib\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Can you tell if a bank note is counterfeit or legitimate?\n",
    "# Variables based on photgraphs of many banknotes (a few numbers for each image calculated)\n",
    "banknotes = Table.read_table('banknote.csv')\n",
    "banknotes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize 'WaveletVar' and 'WaveletCurt'\n",
    "banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize 'WaveletSkew', 'Entropy'\n",
    "banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two attributes have some overlap of classes...what happens with three attributes?\n",
    "fig = plots.figure(figsize=(8,8))\n",
    "ax = Axes3D(fig)\n",
    "ax.scatter(banknotes.column('WaveletSkew'), \n",
    "           banknotes.column('WaveletVar'), \n",
    "           banknotes.column('WaveletCurt'), \n",
    "           c=banknotes.column('Class'),\n",
    "           cmap='viridis',\n",
    "          s=50);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Breast cancer classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Class 1 = malignant (cancer)\n",
    "#Class 0 = benign (not cancer)\n",
    "patients = Table.read_table('breast-cancer.csv').drop('ID')\n",
    "patients.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#A number of points are layered\n",
    "patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Function to \"jitter\" the points (for visualization purposes)\n",
    "def randomize_column(a):\n",
    "    return a + np.random.normal(0.0, 0.09, size=len(a))\n",
    "\n",
    "jittered = Table().with_columns([\n",
    "        'Bland Chromatin (jittered)', \n",
    "        randomize_column(patients.column('Bland Chromatin')),\n",
    "        'Single Epithelial Cell Size (jittered)', \n",
    "        randomize_column(patients.column('Single Epithelial Cell Size')),\n",
    "        'Class',\n",
    "        patients.column('Class')\n",
    "    ])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the data with the points jittered\n",
    "jittered.scatter(0, 1, colors='Class')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get only features of the data (i.e. the attributes) without the class labels\n",
    "features = patients.drop('Class')\n",
    "features.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a function to compute the distance between two arrays\n",
    "def distance(pt1, pt2):\n",
    "    \"\"\"Return the distance between two points, represented as arrays\"\"\"\n",
    "    return np.sqrt(sum((pt1 - pt2)**2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can use np.array(tuple(row)) convert a row to an numpy array\n",
    "row_one_array = np.array(tuple(features.row(1)))\n",
    "row_one_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a function to compute the distance between two rows in a Table\n",
    "def row_distance(row1, row2):\n",
    "    \"\"\"Return the distance between two numerical rows of a table\"\"\"\n",
    "    return distance(np.array(tuple(row1)), np.array(tuple(row2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# distance between the first and second row\n",
    "row_distance(features.row(0), features.row(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sanity check: distance between first row and itself\n",
    "row_distance(features.row(0), features.row(0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification Procedure ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a function to compute the distance between a whole training set and a given example\n",
    "# returns the training set with an additional column that has the distance to the example for each row\n",
    "def distances(training, example):\n",
    "    \"\"\"Compute distance between example and every row in training.\n",
    "    Return training augmented with Distance column\"\"\"\n",
    "    distances = make_array()\n",
    "    attributes = training.drop('Class')\n",
    "    for row in attributes.rows:\n",
    "        distances = np.append(distances, row_distance(row, example))\n",
    "    return training.with_column('Distance', distances)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at patient 15\n",
    "patients.take(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at the features for row 15\n",
    "example = features.row(15)\n",
    "example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at the distance between patient 15 and all other patients\n",
    "distances(patients.exclude(15), example).sort('Distance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function that will return a table with the k closest distances to an example\n",
    "def closest(training, example, k):\n",
    "    \"\"\"Return a table of the k closest neighbors to example\"\"\"\n",
    "    return distances(training, example).sort('Distance').take(np.arange(k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Applying the closest function to example patient 15\n",
    "closest_table = closest(patients.exclude(15), example, 5)\n",
    "closest_table "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function that returns the class label for the class that has the most nearest neighbors\n",
    "def majority_class(topk):\n",
    "    \"\"\"Return the class with the highest count\"\"\"\n",
    "    return topk.group('Class').sort('count', descending=True).column(0).item(0)\n",
    "\n",
    "majority_class(closest_table) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The full k nearest neighbor classification function\n",
    "def classify(training, example, k):\n",
    "    \"Return the majority class among the k nearest neighbors of example\"\n",
    "    return majority_class(closest(training, example, k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# applying the kNN function to patient 15\n",
    "classify(patients.exclude(15), example, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at patient 15 - did we make the correct prediction? \n",
    "patients.take(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try it for patient 10\n",
    "new_example = features.row(10)\n",
    "classify(patients.exclude(10), new_example, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Did we get it correct?\n",
    "patients.take(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate any patient number...\n",
    "patient_to_use = 6\n",
    "new_example = features.row(patient_to_use)\n",
    "classify(patients.exclude(patient_to_use), new_example, 5), patients.take(patient_to_use).column(\"Class\").item(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the total number of rows in the data set\n",
    "patients.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a training and test set\n",
    "shuffled = patients.sample(with_replacement=False) # Randomly permute the rows\n",
    "training_set = shuffled.take(np.arange(342))\n",
    "test_set  = shuffled.take(np.arange(342, 683))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print the number of points in the training and test set\n",
    "print(training_set.num_rows)\n",
    "print(test_set.num_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a function that returns the proportion of points correctly classified in the test set\n",
    "def evaluate_accuracy(training, test, k):\n",
    "    \"\"\"Return the proportion of correctly classified examples \n",
    "    in the test set\"\"\"\n",
    "    test_attributes = test.drop('Class')\n",
    "    num_correct = 0\n",
    "    for i in np.arange(test.num_rows):\n",
    "        c = classify(training, test_attributes.row(i), k)\n",
    "        num_correct = num_correct + (c == test.column('Class').item(i))\n",
    "    return num_correct / test.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 5\n",
    "evaluate_accuracy(training_set, test_set, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 3\n",
    "evaluate_accuracy(training_set, test_set, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 11\n",
    "evaluate_accuracy(training_set, test_set, 11)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 1 both training and testing using only the training set\n",
    "evaluate_accuracy(training_set, training_set, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 1\n",
    "evaluate_accuracy(training_set, test_set, 1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}