{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "import matplotlib\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Can you tell if a bank note is counterfeit or legitimate?\n",
    "# Variables are based on photographs of many banknotes (a few numbers calculated for each image)\n",
    "banknotes = Table.read_table('banknote.csv')\n",
    "banknotes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize 'WaveletVar' and 'WaveletCurt'\n",
    "banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize 'WaveletSkew', 'Entropy'\n",
    "banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two attributes have some overlap of classes...what happens with three attributes?\n",
    "fig = plots.figure(figsize=(8,8))\n",
    "# Ask the figure for a 3D axes instead of constructing Axes3D(fig) directly:\n",
    "# recent Matplotlib releases no longer attach a directly constructed Axes3D\n",
    "# to the figure, which leaves the plot blank. The mpl_toolkits.mplot3d import\n",
    "# in the first cell is kept because older Matplotlib versions need it to\n",
    "# register the '3d' projection.\n",
    "ax = fig.add_subplot(111, projection='3d')\n",
    "# Label the axes in the same order as the columns passed to scatter below\n",
    "ax.set_xlabel('WaveletSkew')\n",
    "ax.set_ylabel('WaveletVar')\n",
    "ax.set_zlabel('WaveletCurt')\n",
    "ax.scatter(banknotes.column('WaveletSkew'),\n",
    "           banknotes.column('WaveletVar'),\n",
    "           banknotes.column('WaveletCurt'),\n",
    "           c=banknotes.column('Class'),\n",
    "           cmap='viridis',\n",
    "           s=50);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Breast cancer classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Class 1 = malignant (cancer)\n",
    "#Class 0 = benign (not cancer)\n",
    "patients = Table.read_table('breast-cancer.csv').drop('ID')\n",
    "patients.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs":
[],
   "source": [
    "#A number of points are layered\n",
    "patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Function to \"jitter\" the points (for visualization purposes)\n",
    "def randomize_column(a):\n",
    "    \"\"\"Return a new array: a plus a small random offset for each element.\n",
    "\n",
    "    The offsets are drawn from a normal distribution with mean 0.0 and\n",
    "    standard deviation 0.09, so points that overlap exactly are nudged apart.\n",
    "    \"\"\"\n",
    "    return a + np.random.normal(0.0, 0.09, size=len(a))\n",
    "\n",
    "jittered = Table().with_columns([\n",
    "        'Bland Chromatin (jittered)',\n",
    "        randomize_column(patients.column('Bland Chromatin')),\n",
    "        'Single Epithelial Cell Size (jittered)',\n",
    "        randomize_column(patients.column('Single Epithelial Cell Size')),\n",
    "        'Class',\n",
    "        patients.column('Class')\n",
    "    ])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the data with the points jittered\n",
    "jittered.scatter(0, 1, colors='Class')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get only features of the data (i.e. the attributes) without the class labels\n",
    "features = patients.drop('Class')\n",
    "features.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a function to compute the distance between two arrays\n",
    "def distance(pt1, pt2):\n",
    "    \"\"\"Return the Euclidean distance between two points, represented as arrays.\n",
    "\n",
    "    pt1 and pt2 are expected to be 1-D numeric arrays of the same length.\n",
    "    \"\"\"\n",
    "    # np.sum adds the squared differences inside NumPy instead of\n",
    "    # iterating over the array element by element in Python\n",
    "    return np.sqrt(np.sum((pt1 - pt2)**2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can use np.array(tuple(row)) to convert a row to a NumPy array\n",
    "row_one_array = np.array(tuple(features.row(1)))\n",
    "row_one_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a function to compute the distance between two rows in a Table\n",
    "def row_distance(row1, row2):\n",
    "    \"\"\"Return the distance between two numerical rows of a table.\n",
    "\n",
    "    Each row is converted to a NumPy array and passed to distance().\n",
    "    \"\"\"\n",
    "    return distance(np.array(tuple(row1)), np.array(tuple(row2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# distance between the first and second row\n",
    "row_distance(features.row(0), features.row(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sanity check: distance between first row and itself\n",
    "row_distance(features.row(0), features.row(0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification Procedure ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a function to compute the distance between a whole training set and a given example\n",
    "# returns the training set with an additional column that has the distance to the example for each row\n",
    "def distances(training, example):\n",
    "    \"\"\"Compute distance between example and every row in training.\n",
    "\n",
    "    training is a table with a 'Class' column plus the attribute columns;\n",
    "    example is a row holding the attribute values only.\n",
    "    Return training augmented with a 'Distance' column.\n",
    "    \"\"\"\n",
    "    # The class label is not an attribute, so leave it out of the distance\n",
    "    attributes = training.drop('Class')\n",
    "    # Build the whole column in one pass; appending to an array inside the\n",
    "    # loop would copy the array on every iteration\n",
    "    example_distances = np.array([row_distance(row, example) for row in attributes.rows])\n",
    "    return training.with_column('Distance', example_distances)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at patient 15\n",
    "patients.take(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at the features for row 15\n",
    "example = features.row(15)\n",
    "example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at the distance between patient 15 and all other patients\n",
    "distances(patients.exclude(15), example).sort('Distance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function that will return a table with the k closest distances to an example\n",
    "def closest(training, example, k):\n",
    "    \"\"\"Return a table of the k closest neighbors to example.\n",
    "\n",
    "    If training has fewer than k rows, all of its rows are returned.\n",
    "    \"\"\"\n",
    "    by_distance = distances(training, example).sort('Distance')\n",
    "    # Never ask for more rows than the table holds\n",
    "    num_neighbors = min(k, by_distance.num_rows)\n",
    "    return by_distance.take(np.arange(num_neighbors))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Applying the closest function to example patient 15\n",
    "closest_table = closest(patients.exclude(15), example, 5)\n",
    "closest_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function that returns the class label for the class that has the most nearest neighbors\n",
    "def majority_class(topk):\n",
    "    \"\"\"Return the class with the highest count among the rows of topk\"\"\"\n",
    "    # Count rows per class, put the largest count first, then read the\n",
    "    # value in the first column of the first row\n",
    "    return topk.group('Class').sort('count', descending=True).column(0).item(0)\n",
    "\n",
    "majority_class(closest_table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The full k nearest neighbor classification function\n",
    "def classify(training, example, k):\n",
    "    \"\"\"Return the majority class among the k nearest neighbors of example\"\"\"\n",
    "    return majority_class(closest(training, example, k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# applying the kNN function to patient 15\n",
    "classify(patients.exclude(15), example, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at patient 15 - did we make the correct prediction?\n",
    "patients.take(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try it for patient 10\n",
    "new_example = features.row(10)\n",
    "classify(patients.exclude(10), new_example, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Did we get it correct?\n",
    "patients.take(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate any patient number...\n",
    "# the cell displays a pair: (predicted class, actual class)\n",
    "patient_to_use = 6\n",
    "new_example = features.row(patient_to_use)\n",
    "classify(patients.exclude(patient_to_use), new_example, 5), patients.take(patient_to_use).column(\"Class\").item(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the total number of rows in the data set\n",
    "patients.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a training and test set\n",
    "shuffled = patients.sample(with_replacement=False) # Randomly permute the rows\n",
    "# Split in half using the actual row count instead of hard-coded indices.\n",
    "# With an odd row count the extra row goes to the training set, which\n",
    "# gives 342 training / 341 test rows when the table has 683 rows.\n",
    "num_training = (shuffled.num_rows + 1) // 2\n",
    "training_set = shuffled.take(np.arange(num_training))\n",
    "test_set = shuffled.take(np.arange(num_training, shuffled.num_rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print the number of points in the training and test set\n",
    "print(training_set.num_rows)\n",
    "print(test_set.num_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
"metadata": {},
   "outputs": [],
   "source": [
    "# create a function that returns the proportion of points correctly classified in the test set\n",
    "def evaluate_accuracy(training, test, k):\n",
    "    \"\"\"Return the proportion of correctly classified examples in the test set.\n",
    "\n",
    "    Each row of test is classified with classify(training, row, k) and the\n",
    "    prediction is compared with that row's 'Class' value. test must contain\n",
    "    at least one row, because the result is divided by test.num_rows.\n",
    "    \"\"\"\n",
    "    # classify() expects the attribute values only, without the label\n",
    "    test_attributes = test.drop('Class')\n",
    "    # Look up the label column once instead of on every iteration\n",
    "    actual_classes = test.column('Class')\n",
    "    num_correct = 0\n",
    "    for i in np.arange(test.num_rows):\n",
    "        predicted = classify(training, test_attributes.row(i), k)\n",
    "        # The comparison is a boolean; adding it counts one per correct prediction\n",
    "        num_correct = num_correct + (predicted == actual_classes.item(i))\n",
    "    return num_correct / test.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 5\n",
    "evaluate_accuracy(training_set, test_set, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 3\n",
    "evaluate_accuracy(training_set, test_set, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 11\n",
    "evaluate_accuracy(training_set, test_set, 11)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 1, with the training set used for both training and testing\n",
    "evaluate_accuracy(training_set, training_set, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the classifier using k = 1\n",
    "evaluate_accuracy(training_set, test_set, 1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}