#
#  Naive Bayes Classifier chapter 6
#
# _____________________________________________________________________

import math


class Classifier:
    """Naive Bayes classifier trained from numbered, tab-separated bucket files.

    Categorical ('attr') columns are modelled with per-class relative
    frequencies; numeric ('num') columns are modelled with a per-class normal
    distribution described by its mean and sample standard deviation.
    """

    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        """Build a classifier from the bucket files, holding one bucket out.

        Files named "<bucketPrefix>-01" through "<bucketPrefix>-10" are read,
        excluding the one whose number equals testBucketNumber.

        dataFormat describes how to interpret each column of a data line.
        It is a whitespace-separated (tabs or spaces) list of the tokens
        'num', 'attr', 'class' and 'comment'.  For example, for the iHealth
        data the format is "attr\\tattr\\tattr\\tattr\\tclass".

        Raises ZeroDivisionError when a class that has numeric columns
        occurs in only one training instance (the sample standard deviation
        divides by the instance count minus one).
        """
        total = 0
        classes = {}
        # counts used for attributes that are not numeric
        counts = {}
        # totals used for attributes that are numeric; together with the raw
        # values they give the mean and sample standard deviation for each
        # attribute - class pair.
        totals = {}
        numericValues = {}

        # Split on any whitespace so both tab- and space-separated format
        # strings are accepted (the tokens themselves never contain spaces).
        self.format = dataFormat.split()
        self.prior = {}
        self.conditional = {}

        # for each of the buckets numbered 1 through 10:
        for bucket in range(1, 11):
            # skip the bucket that is held out for testing
            if bucket == testBucketNumber:
                continue
            filename = "%s-%02i" % (bucketPrefix, bucket)
            for line in self._readLines(filename):
                vector, nums, category = self._parseLine(line)
                # now process this instance
                total += 1
                classes[category] = classes.get(category, 0) + 1
                categoryCounts = counts.setdefault(category, {})
                categoryTotals = totals.setdefault(category, {})
                categoryValues = numericValues.setdefault(category, {})
                # non-numeric attributes; columns are numbered from 1
                for col, columnValue in enumerate(vector, start=1):
                    valueCounts = categoryCounts.setdefault(col, {})
                    valueCounts[columnValue] = (
                        valueCounts.get(columnValue, 0) + 1)
                # numeric attributes; columns are numbered from 1
                for col, columnValue in enumerate(nums, start=1):
                    categoryTotals[col] = (
                        categoryTotals.get(col, 0) + columnValue)
                    categoryValues.setdefault(col, []).append(columnValue)

        #
        # ok done counting. now compute probabilities
        #
        # first prior probabilities p(h)
        #
        for category, count in classes.items():
            self.prior[category] = count / total

        #
        # now the conditional probabilities: the fraction of a class's
        # instances that have a given value in a given column
        #
        for category, columns in counts.items():
            self.conditional[category] = {
                col: {attrValue: count / classes[category]
                      for attrValue, count in valueCounts.items()}
                for col, valueCounts in columns.items()
            }
        self.tmp = counts

        #
        # now compute mean and sample standard deviation
        #
        self.means = {}
        self.totals = totals
        for category, columns in totals.items():
            self.means[category] = {
                col: cTotal / classes[category]
                for col, cTotal in columns.items()
            }

        # sample standard deviation (divides by n - 1)
        self.ssd = {}
        for category, columns in numericValues.items():
            self.ssd[category] = {}
            for col, values in columns.items():
                theMean = self.means[category][col]
                sumOfSquareDifferences = sum(
                    (value - theMean) ** 2 for value in values)
                self.ssd[category][col] = math.sqrt(
                    sumOfSquareDifferences / (classes[category] - 1))

    @staticmethod
    def _readLines(filename):
        """Return the non-blank lines of filename, closing the file reliably."""
        with open(filename) as f:
            return [line for line in f if line.strip()]

    def _parseLine(self, line):
        """Split one tab-separated data line according to self.format.

        Returns a tuple (attrs, nums, category): the 'attr' values as
        strings, the 'num' values as floats, and the value of the 'class'
        column.  'comment' columns are skipped.  If the format has no
        'class' column, the last field of the line is used as the category.

        Raises IndexError if the line has fewer fields than the format and
        ValueError if a 'num' field is not a valid float.
        """
        fields = line.strip().split('\t')
        vector = []
        nums = []
        classInColumn = -1
        for i, kind in enumerate(self.format):
            if kind == 'num':
                nums.append(float(fields[i]))
            elif kind == 'attr':
                vector.append(fields[i])
            elif kind == 'class':
                classInColumn = i
        return vector, nums, fields[classInColumn]

    def testBucket(self, bucketPrefix, bucketNumber):
        """Evaluate the classifier with data from the file
        bucketPrefix-bucketNumber.

        Returns a nested dict mapping each real class to a dict that maps
        the class the classifier predicted to the number of instances,
        i.e. totals[realClass][classifiedAs] == count.
        """
        filename = "%s-%02i" % (bucketPrefix, bucketNumber)
        totals = {}
        for line in self._readLines(filename):
            vector, numV, theRealClass = self._parseLine(line)
            classifiedAs = self.classify(vector, numV)
            row = totals.setdefault(theRealClass, {})
            row[classifiedAs] = row.get(classifiedAs, 0) + 1
        return totals

    def classify(self, itemVector, numVector):
        """Return class we think item Vector is in.

        itemVector holds the categorical attribute values and numVector the
        numeric attribute values, each in column order.  For every class the
        prior is multiplied by the conditional probability of each
        categorical value and by the normal density of each numeric value;
        the class with the highest product is returned.
        """
        results = []
        for category, prior in self.prior.items():
            prob = prior
            for col, attrValue in enumerate(itemVector, start=1):
                conditional = self.conditional[category][col]
                if attrValue in conditional:
                    prob = prob * conditional[attrValue]
                else:
                    # we did not find any instances of this attribute value
                    # occurring with this category so prob = 0
                    prob = 0
            for col, x in enumerate(numVector, start=1):
                prob = prob * pdf(self.means[category][col],
                                  self.ssd[category][col], x)
            results.append((prob, category))
        # return the category with the highest probability
        return max(results)[1]


def tenfold(bucketPrefix, dataFormat):
    """Run 10-fold cross validation and print a confusion matrix.

    For each bucket 1 through 10 a Classifier is trained on the other nine
    buckets and tested on the held-out one.  The per-bucket results are
    summed and printed as a table (rows are the real class, columns the
    predicted class), followed by the percentage classified correctly and
    the total number of instances.
    """
    results = {}
    for i in range(1, 11):
        c = Classifier(bucketPrefix, i, dataFormat)
        t = c.testBucket(bucketPrefix, i)
        for key, value in t.items():
            merged = results.setdefault(key, {})
            for ckey, cvalue in value.items():
                merged[ckey] = merged.get(ckey, 0) + cvalue

    # now print results
    categories = sorted(results)
    print("\n Classified as: ")
    header = " "
    subheader = " +"
    for category in categories:
        header += "% 10s " % category
        subheader += "-------+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %10s |" % category
        for c2 in categories:
            count = results[category].get(c2, 0)
            row += " %5i |" % count
            total += count
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)


def pdf(mean, ssd, x):
    """Probability Density Function computing P(x|y)
    input is the mean, sample standard deviation for all the items in y,
    and x.

    Raises ZeroDivisionError when ssd is 0.
    """
    ePart = math.exp(-(x - mean) ** 2 / (2 * ssd ** 2))
    return (1.0 / (math.sqrt(2 * math.pi) * ssd)) * ePart


def main():
    """Run the cross-validation experiments this script was written for."""
    tenfold("pimaSmall/pimaSmall", "num num num num num num num num class")
    tenfold("pima/pima", "num num num num num num num num class")


# Other example invocations:
#
# tenfold("house-votes/hv", "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr")
# c = Classifier("house-votes/hv", 0,
#                "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr")
# c = Classifier("iHealth/i", 10,
#                "attr\tattr\tattr\tattr\tclass")
# print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26]))
# c = Classifier("house-votes-filtered/hv", 5, "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr")
# t = c.testBucket("house-votes-filtered/hv", 5)
# print(t)

# Only run the experiments when executed as a script, so the module can be
# imported without reading data files.
if __name__ == "__main__":
    main()