import random
import math

import numpy
from sklearn import tree, svm, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


def main(task):
    if task == 1:
        print("Bagging + SVM")
    elif task == 2:
        print("Bagging + DTree")
    elif task == 3:
        print("Adaboost + SVM")
    elif task == 4:
        print("Adaboost + DTree")
    elif task == 5:
        print("Bagging + Gaussian Naive Bayes")
    elif task == 6:
        print("Adaboost + Gaussian Naive Bayes")

    dataList = []
    attributesList = []
    X = []
    y = []

    # read data
    # sourceData = open("ContentNewLinkAllSample.csv", 'r')
    sourceData = open("ex1.csv", 'r')    # only transformed link features
    # sourceData = open("ex2.csv", 'r')  # only content features

    # read attribute names (first line of the input file)
    line = sourceData.readline()
    line = line.strip('\r\n')            # remove the line ending
    lineSplit = line.split(',')          # slice the line to separate attributes
    attributesList.append(lineSplit)     # get all attribute names

    # read data (rest of the input file)
    line = sourceData.readline()
    while line:
        line = line.strip('.\r\n')       # remove trailing '.' and the line ending
        lineSplit = line.split(',')      # slice the line to separate attributes
        dataList.append(lineSplit)       # add to dataList
        line = sourceData.readline()     # read next line
    sourceData.close()

    # convert data from string to float (except the class) and separate into X & y
    for data in dataList:
        # print(data)
        if data[-1] == "spam":
            y.append(0)
        else:
            y.append(1)
        X.append([float(key) for key in data[:-1]])
    # print(y)

    # normalize each sample to unit norm
    X = preprocessing.normalize(X)
    # print(attributesList)
    # print(dataList[0])

    # get training set & test set by resampling
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if task == 1 or task == 2 or task == 5:
        # 1 Bagging + SVM / 2 Bagging + DTree / 5 Bagging + Gaussian Naive Bayes
        bootstrapSize = 100
        result = []
        predict = []

        # train one classifier per bootstrap sample
        for i in range(bootstrapSize):
            X_temp = []
            y_temp = []
            # draw a bootstrap sample (with replacement) from the training set
            for j in range(len(X_train)):
                k = random.randint(0, len(X_train) - 1)
                X_temp.append(X_train[k])
                y_temp.append(y_train[k])
            # set classifier
            if task == 1:
                clf = svm.LinearSVC(dual=False)
            if task == 2:
                clf = tree.DecisionTreeClassifier()
            if task == 5:
                clf = GaussianNB()
            clf.fit(X_temp, y_temp)
            # predictions of this single classifier on the test set
            result.append(clf.predict(X_test))

        # predict by bagging: majority vote over the bootstrapSize classifiers
        for i in range(len(y_test)):
            count = 0
            for j in range(bootstrapSize):
                if result[j][i] == 1:
                    count += 1
            if count > bootstrapSize / 2:
                predict.append(1)
            else:
                predict.append(0)

        # calculate the accuracy
        correctRatio = 0.0
        for i in range(len(y_test)):
            if y_test[i] == predict[i]:
                correctRatio += 1
        correctRatio = correctRatio / len(y_test)
        print(correctRatio)

    if task == 3 or task == 4 or task == 6:
        # 3 AdaBoost + SVM / 4 AdaBoost + DTree / 6 AdaBoost + Gaussian Naive Bayes
        bootstrapSize = 100
        result = []
        predict = []
        weight = [1.0 / len(X_train) for x in X_train]
        beta = [0.0 for x in range(bootstrapSize)]

        # train one weak classifier per boosting round
        for i in range(bootstrapSize):
            # set classifier
            if task == 3:
                clf = svm.LinearSVC(dual=False)
            if task == 4:
                clf = tree.DecisionTreeClassifier()
            if task == 6:
                clf = GaussianNB()
            weight = numpy.array(weight)
            clf.fit(X_train, y_train, sample_weight=weight)
            # predict the training set
            result_temp = clf.predict(X_train)
            # update beta from the weighted training error
            errorWeight = sum([0.0 if result_temp[j] == y_train[j] else weight[j]
                               for j in range(len(X_train))])
            # guard against a perfect (or useless) weak learner: clamp the error away
            # from 0 and 1 so beta and log(1 / beta) stay finite
            errorWeight = min(max(errorWeight, 1e-10), 1 - 1e-10)
            beta[i] = errorWeight / (1 - errorWeight)
            # update weight
            weight = [weight[j] * beta[i] if result_temp[j] == y_train[j] else weight[j]
                      for j in range(len(X_train))]
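            # Note on the update above (the AdaBoost.M1 scheme of Freund & Schapire):
            # with weighted error err_i, beta_i = err_i / (1 - err_i) < 1 whenever
            # err_i < 0.5, so multiplying the weights of correctly classified samples
            # by beta_i and renormalizing below shifts relative weight onto the
            # misclassified samples. Each round's vote is later scaled by log(1 / beta_i).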
            weightSum = sum(weight)
            weight = [weight[j] / weightSum for j in range(len(X_train))]
            # predict the test set with this single weak classifier and record a
            # signed, log(1/beta)-weighted vote for each test sample
            result_temp = clf.predict(X_test)
            result.append([math.log(1 / beta[i]) if x == 1 else -math.log(1 / beta[i])
                           for x in result_temp])

        # predict by AdaBoost: take the sign of the weighted vote total
        for i in range(len(y_test)):
            count = 0.0
            for j in range(bootstrapSize):
                count += result[j][i]
            if count > 0:
                predict.append(1)
            else:
                predict.append(0)

        # calculate the accuracy
        correctRatio = 0.0
        for i in range(len(y_test)):
            if y_test[i] == predict[i]:
                correctRatio += 1
        correctRatio = correctRatio / len(y_test)
        print(correctRatio)


for i in range(1, 7):
    main(i)
# main(6)
# main(1)
# main(2)
# main(3)
# main(4)
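

# Optional cross-check (not part of the original tasks): scikit-learn ships its own
# BaggingClassifier and AdaBoostClassifier, which implement the same ideas as the
# hand-rolled loops above. The sketch below is only for comparison; the helper name
# sklearn_ensemble_check is illustrative, it assumes you pass in the same train/test
# split that main() builds internally, it uses only decision trees as base learners,
# and it is not called anywhere by default.
def sklearn_ensemble_check(X_train, X_test, y_train, y_test):
    from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

    ensembles = [
        ("Bagging + DTree (sklearn)",
         BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=100)),
        ("AdaBoost + DTree (sklearn)",
         AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1), n_estimators=100)),
    ]
    for name, clf in ensembles:
        clf.fit(X_train, y_train)               # fit the whole ensemble at once
        print(name, clf.score(X_test, y_test))  # accuracy on the held-out test set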