In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Classification ##

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Parameters:
        pt1, pt2: 1-D numeric arrays, or array-likes such as lists or
            tuples, holding the same number of coordinates.

    Returns:
        The square root of the sum of the squared coordinate differences.
    """
    # Coerce up front so plain Python sequences work as well; the "-"
    # operator is not defined between two lists or two tuples.
    diff = np.asarray(pt1) - np.asarray(pt2)
    # Let NumPy add up the squares along axis 0 rather than looping over
    # the elements one at a time with the built-in sum().
    return np.sqrt(np.sum(diff ** 2, axis=0))

def row_distance(row1, row2):
    """Compute the distance between two table rows of numbers.

    Each row is converted to a NumPy array and the pair is passed on to
    ``distance``.
    """
    # Wrapping a row in a one-element list and taking element 0 of the
    # resulting array gives the row's values as an array.
    first = np.array([row1])[0]
    second = np.array([row2])[0]
    return distance(first, second)

def distances(training, example):
    """Compute the distance between ``example`` and every row in ``training``.

    Parameters:
        training: table with a 'Class' column; all remaining columns are
            used as the attributes that distances are measured on.
        example: the attribute values to compare against each row
            (presumably in the same column order as the attributes --
            verify against the caller).

    Returns:
        ``training`` augmented with a 'Distance' column holding one
        distance per row.
    """
    attributes = training.drop('Class')
    # Gather every distance first and build the array in one step; calling
    # np.append inside the loop would copy the whole array on each pass.
    row_dists = np.array([row_distance(row, example) for row in attributes.rows])
    return training.with_column('Distance', row_dists)

def closest(training, example, k):
    """Return a table holding the k rows of ``training`` nearest to ``example``."""
    with_distances = distances(training, example)
    # Ascending sort puts the nearest rows first.
    nearest_first = with_distances.sort('Distance')
    return nearest_first.take(np.arange(k))

def majority_class(topk):
    """Return the 'Class' value that appears most often in ``topk``."""
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 of the grouped table holds the class values; take the top one.
    return most_common_first.column(0).item(0)

def classify(training, example, k):
    """Predict the class of ``example`` as the majority class among its
    k nearest neighbors in ``training``."""
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

In [None]:
# Data: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset
#
# Columns:
#   Class:     Default payment (1 = yes, 0 = no)
#   LIMIT_BAL: Amount of given credit in NT dollars (i.e., New Taiwan dollars);
#              includes individual and family/supplementary credit
#   PAY_0:     Repayment status in September 2005
#   PAY_2:     Repayment status in August 2005
#   PAY_3:     Repayment status in July 2005
#   PAY_4:     Repayment status in June 2005
#   PAY_5:     Repayment status in May 2005
#   PAY_6:     Repayment status in April 2005
#
# Repayment-status scale (the same for every PAY_* column):
#   0 = pay duly, 1 = payment delay for one month,
#   2 = payment delay for two months, ...,
#   8 = payment delay for eight months,
#   9 = payment delay for nine months and above
#
# Read the CSV from the working directory and preview the first 10 rows.
credit = Table.read_table('credit.csv')
credit.show(10)

In [None]:
# Drop the credit-limit column; the resulting table is the one the cells
# below classify on.
# NOTE(review): presumably LIMIT_BAL is removed because it is measured in NT
# dollars while the PAY_* columns use a 0-9 scale, so it would dominate the
# distance computation -- confirm.
credit_payments = credit.drop('LIMIT_BAL')
credit_payments

In [None]:
# Attributes (every column except 'Class') of row 123, used as the example
# to classify.
example123 = credit_payments.drop('Class').row(123)
example123

In [None]:
# Classify row 123 from its 5 nearest neighbors, with row 123 itself left out
# of the training table so it cannot be its own neighbor.
classify(credit_payments.exclude(123), example123, 5)

In [None]:
# Show the full row, including its actual 'Class', to compare with the
# prediction.
credit_payments.row(123)

## Evaluation ##

In [None]:
# Number of rows available for splitting into training and test sets.
credit_payments.num_rows

In [None]:
# Split by position: rows 0-499 become the training set and rows 500-999
# the test set.
training_set = credit_payments.take(np.arange(500))
test_set = credit_payments.take(np.arange(500, 1000))

In [None]:
# Sanity check: print the size of each set (500 rows each are expected from
# the split above).
print(training_set.num_rows)
print(test_set.num_rows)

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the fraction of rows in ``test`` whose 'Class' is predicted
    correctly by the k-nearest-neighbor classifier built on ``training``."""
    attributes = test.drop('Class')
    hits = 0
    for idx in np.arange(test.num_rows):
        predicted = classify(training, attributes.row(idx), k)
        actual = test.column('Class').item(idx)
        # Adding the boolean comparison counts one per correct prediction.
        hits += (predicted == actual)
    return hits / test.num_rows

In [None]:
# Test-set accuracy using the 3 nearest neighbors.
evaluate_accuracy(training_set, test_set, 3)

In [None]:
# Display the class labels in their original row order.
# NOTE(review): presumably shown to check whether the labels are ordered,
# which would bias the positional split used above -- confirm.
credit_payments.column('Class')

In [None]:
# Number of training rows in each class.
training_set.group('Class')

In [None]:
# Number of test rows in each class.
test_set.group('Class')

In [None]:
# Shuffle the rows (sampling without replacement) before splitting, so the
# training/test split no longer depends on the original row order.
# NOTE(review): no random seed is set in the visible cells, so the split and
# the accuracies computed from it may change from run to run.
shuffled = credit_payments.sample(with_replacement=False)
training_set = shuffled.take(np.arange(500))
test_set = shuffled.take(np.arange(500, 1000))

In [None]:
# Test-set accuracy with k = 3 on the shuffled split.
evaluate_accuracy(training_set, test_set, 3)

In [None]:
# Same evaluation with k = 5.
evaluate_accuracy(training_set, test_set, 5)

In [None]:
# Evaluate on the training set itself with k = 1: every evaluated row is also
# a training row, so it is at distance 0 from itself.
# NOTE(review): a result below 1.0 would presumably mean that rows with
# identical attributes carry different classes -- confirm.
evaluate_accuracy(training_set, training_set, 1)

In [None]:
# Take the attributes of the first training row as an example, and build a
# training table with that row removed.
training_example = training_set.drop("Class").row(0)
new_training = training_set.exclude(0)
training_example

In [None]:
# Distances from training_example to every remaining training row, nearest
# first.
distances(new_training, np.array([training_example])[0]).sort("Distance")