'''
Random Forest Analysis

This is a regressor random forest that aims to predict the charges
(insurance) based on the variables in the dataset.

The script builds one RandomForestRegressor per bootstrap sample ratio
(``max_samples`` from 10% to 100%), scores each with repeated k-fold
cross-validation (mean absolute error), prints the mean and standard
deviation of the scores, and shows a box plot of the score distributions.
'''

from exploratory_analysis import data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import mean
from numpy import std
from numpy import arange
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor

# Features: every column except the last one.
dataX = data.iloc[:, :-1]
# Target: the column at position 6.
# NOTE(review): this assumes the target ("charges") is both column index 6
# and the last column of `data`; otherwise the target would also be part of
# dataX -- confirm against exploratory_analysis.
dataY = data.iloc[:, 6]

# This part of the program was adapted from a regressor model on
# "Machine Learning Mastery"
# url: https://machinelearningmastery.com/random-forest-ensemble-in-python/
# *** code adapted from the above url ***


def get_models():
    '''
    Build the random forest models to compare.

    Returns a dict mapping the sample ratio formatted with one decimal
    ("0.1" ... "1.0") to a RandomForestRegressor whose ``max_samples`` is
    that ratio. The 100% ratio is expressed as ``max_samples=None``.
    '''
    models = dict()
    # exploring ratios from 10% to 100% in 10% steps
    for ratio in arange(0.1, 1.1, 0.1):
        key = "%.1f" % ratio
        # setting the max samples to None for the 100% ratio; the values come
        # from floating-point accumulation, so compare with a tolerance rather
        # than with exact equality
        max_samples = None if np.isclose(ratio, 1.0) else ratio
        models[key] = RandomForestRegressor(max_samples=max_samples)
    return models


def evaluate_model(model, x, y):
    '''
    Cross-validate ``model`` on the features ``x`` and the target ``y``.

    Uses repeated k-fold cross-validation (10 splits, 3 repeats, fixed
    random_state for the splits) with the "neg_mean_absolute_error" scorer.

    Returns the absolute values of the scores, i.e. one (positive) mean
    absolute error per fold. Fitting/scoring errors are re-raised
    (``error_score="raise"``).
    '''
    # defining the evaluation procedure
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # score on the data passed in by the caller, not on the module globals
    scores = cross_val_score(
        model,
        x,
        y,
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=1,
        error_score="raise",
    )
    # Alternative metric: scoring="neg_mean_squared_error" (take the square
    # root of the absolute scores to report an RMSE).
    # The scorer returns negated errors; report them as positive values.
    return np.absolute(scores)


models = get_models()
results, names = list(), list()

for name, model in models.items():
    # evaluate the model
    scores = evaluate_model(model, dataX, dataY)
    # storing the results
    results.append(scores)
    names.append(name)
    # summarizing the performance
    print("Mean MAE scores and STD", name, mean(scores), std(scores))

# one box per sample ratio, with the mean marked
plt.boxplot(results, labels=names, showmeans=True)
plt.show()