#
#  Nearest Neighbor Classifier (completed "Classify Template")
#
#  Code file for the book Programmer's Guide to Data Mining
#  http://guidetodatamining.com
#
#  Ron Zacharski
#


class Classifier:
    """Nearest-neighbor classifier using Manhattan distance.

    Training data is read from a tab-separated text file. The first line
    names the role of every column: 'num' (numeric feature), 'comment'
    (kept but not used for distance) or 'class' (the label). Columns with
    any other role are skipped.

    Attributes set by the constructor:
        format: list of column roles taken from the first line of the file.
        data: list of (classification, vector, ignore) tuples whose vectors
            have been normalized with the Modified Standard Score.
        rawData: same records as ``data`` but with the un-normalized
            vectors.
        vlen: number of numeric features per record (0 if no records).
        medianAndDeviation: one (median, absolute standard deviation)
            tuple per numeric column, in column order.
    """

    def __init__(self, filename):
        """Load the training set in `filename` and normalize every column."""
        self.medianAndDeviation = []
        # 'with' guarantees the handle is closed even if reading fails
        with open(filename) as f:
            lines = f.readlines()
        self.format = lines[0].strip().split('\t')
        self.data = []
        for line in lines[1:]:
            stripped = line.strip()
            if not stripped:
                # blank (often trailing) lines carry no record
                continue
            classification = None
            vector = []
            ignore = []
            # fields beyond the declared columns are ignored
            for role, field in zip(self.format, stripped.split('\t')):
                if role == 'num':
                    vector.append(int(field))
                elif role == 'comment':
                    ignore.append(field)
                elif role == 'class':
                    classification = field
            self.data.append((classification, vector, ignore))
        # Copy each vector: normalization below mutates the vectors in
        # self.data in place, and rawData must keep the original values.
        self.rawData = [(classification, list(vector), ignore)
                        for (classification, vector, ignore) in self.data]
        # get length of instance vector
        self.vlen = len(self.data[0][1]) if self.data else 0
        # now normalize the data
        for i in range(self.vlen):
            self.normalizeColumn(i)

    ##################################################
    ###
    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE

    def getMedian(self, alist):
        """Return the median of alist.

        For an even number of items the result is the mean of the two
        middle items (a float). An empty list returns [] (kept for
        backward compatibility).
        """
        if alist == []:
            return []
        blist = sorted(alist)
        length = len(blist)
        middle = length // 2
        if length % 2 == 1:
            # length of list is odd so return middle element
            return blist[middle]
        # length of list is even so compute midpoint
        return (blist[middle] + blist[middle - 1]) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Return the mean absolute deviation of alist from median.

        Raises ZeroDivisionError if alist is empty.
        """
        return sum(abs(item - median) for item in alist) / len(alist)

    @staticmethod
    def _standardScore(value, median, asd):
        """Return (value - median) / asd, treating a zero asd as 1.

        A column whose values are all identical has asd == 0; dividing by
        1 instead avoids a ZeroDivisionError and maps every training value
        of that column to 0.
        """
        return (value - median) / (asd if asd else 1.0)

    def normalizeColumn(self, columnNumber):
        """Normalize column `columnNumber` of self.data in place.

        The (median, asd) pair used is appended to
        self.medianAndDeviation so new vectors can be normalized the same
        way later.
        """
        # first extract values to list
        col = [record[1][columnNumber] for record in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        self.medianAndDeviation.append((median, asd))
        for record in self.data:
            record[1][columnNumber] = self._standardScore(
                record[1][columnNumber], median, asd)

    def normalizeVector(self, v):
        """Return a normalized copy of vector v.

        Uses the median and asd stored for each column during training;
        v itself is not modified.
        """
        vector = list(v)
        for i, value in enumerate(vector):
            (median, asd) = self.medianAndDeviation[i]
            vector[i] = self._standardScore(value, median, asd)
        return vector

    ###
    ### END NORMALIZATION
    ##################################################

    def manhattan(self, vector1, vector2):
        """Computes the Manhattan distance.

        If the vectors differ in length the extra trailing items of the
        longer one are not compared.
        """
        return sum(abs(v1 - v2) for v1, v2 in zip(vector1, vector2))

    def nearestNeighbor(self, itemVector):
        """Return the nearest neighbor to itemVector.

        itemVector must already be normalized (see normalizeVector).

        Returns a tuple (distance, record) where record is the
        (classification, vector, ignore) entry of self.data with the
        smallest Manhattan distance; on ties the earliest record wins.

        Raises ValueError if there is no training data.
        """
        if not self.data:
            raise ValueError('no training data to compare against')
        # key on the distance only so that tied records are never compared
        return min(((self.manhattan(itemVector, record[1]), record)
                    for record in self.data),
                   key=lambda pair: pair[0])

    def classify(self, itemVector):
        """Return the class we think the (un-normalized) itemVector is in."""
        return self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]


def unitTest():
    """Check the classifier against 'athletesTrainingSet.txt'.

    The file must be in the current working directory.
    """
    classifier = Classifier('athletesTrainingSet.txt')
    br = ('Basketball', [72, 162], ['Brittainey Raven'])
    nl = ('Gymnastics', [61, 76], ['Viktoria Komova'])
    cl = ("Basketball", [74, 190], ['Crystal Langhorne'])
    # first check normalize function
    brNorm = classifier.normalizeVector(br[1])
    nlNorm = classifier.normalizeVector(nl[1])
    clNorm = classifier.normalizeVector(cl[1])
    assert(brNorm == classifier.data[1][1])
    assert(nlNorm == classifier.data[-1][1])
    print('normalizeVector fn OK')
    # check distance
    assert (round(classifier.manhattan(clNorm, classifier.data[1][1]), 5)
            == 1.16823)
    assert(classifier.manhattan(brNorm, classifier.data[1][1]) == 0)
    assert(classifier.manhattan(nlNorm, classifier.data[-1][1]) == 0)
    print('Manhattan distance fn OK')
    # Brittainey Raven's nearest neighbor should be herself
    result = classifier.nearestNeighbor(brNorm)
    assert(result[1][2] == br[2])
    # Viktoria Komova's nearest neighbor should be herself
    result = classifier.nearestNeighbor(nlNorm)
    assert(result[1][2] == nl[2])
    # Crystal Langhorne's nearest neighbor is Jennifer Lacy
    assert(classifier.nearestNeighbor(clNorm)[1][2][0] == "Jennifer Lacy")
    print("Nearest Neighbor fn OK")
    # Check if classify correctly identifies sports
    assert(classifier.classify(br[1]) == 'Basketball')
    assert(classifier.classify(cl[1]) == 'Basketball')
    assert(classifier.classify(nl[1]) == 'Gymnastics')
    print('Classify fn OK')


# Run the self-test only when executed as a script, so importing this
# module does not require the training file to be present.
if __name__ == "__main__":
    unitTest()