# Package dependencies for the helper functions below: here, stringr, dplyr,
# and magrittr (`%<>%`); the modeling functions further down reload these via pacman
if(require(pacman)==FALSE) install.packages("pacman")
pacman::p_load(here, tidyverse, magrittr)

# Counting variable types: returns a one-row data frame with the number of
# numeric variables (integers are counted as numeric) and the number of factors
nvars <- function(x){
  A <- data.frame(numeric = 0, factor = 0, integer = 0)
  temp <- table(sapply(x, class))
  A[names(temp)] <- temp
  A["numeric"] <- A$numeric + A$integer
  A$integer <- NULL
  A$numeric <- as.numeric(A$numeric)
  return(A)
}

# Suppressing output from cat(), by Hadley Wickham, per
# https://r.789695.n4.nabble.com/Suppressing-output-e-g-from-cat-tp859876p859882.html
quiet = function(x) {
  sink(tempfile())
  on.exit(sink())
  invisible(force(x))
}

# Unzipping a KEEL archive and reading the .dat data file it contains
unzip_read = function(fileName, ext = ".dat"){
  # extract the archive and collect the extracted file names (path stripped)
  path = paste0(here::here("Data/unzipDatasets"), "/")
  all_files = gsub(path, "",
                   as.character(unzip(fileName, exdir = here::here("Data/unzipDatasets"))),
                   fixed = TRUE)
  # find the .dat data file
  find_ext = str_detect(all_files, ext)
  # the following code needs to be modified if the archive contains more than one data file
  if (length(find_ext[find_ext == TRUE]) == 1){
    # read the header lines to locate the start of the data block and the predictor names
    filetext = readLines(unz(fileName, all_files[find_ext == TRUE]))
    lines2Skip = which(str_detect(filetext, "@data"))
    find_predictor_names = str_remove_all(filetext[str_detect(filetext, "@inputs|@input")],
                                          "@inputs |@input ")
    find_predictor_names = str_replace_all(find_predictor_names, " ", "")
    predictor_names = unlist(str_split(find_predictor_names, ","))
    predictor_names = predictor_names[predictor_names != ""]
    # read the dataset
    df = read.csv(unz(fileName, all_files[find_ext == TRUE]),
                  skip = lines2Skip, header = FALSE, stringsAsFactors = TRUE)
    colnames(df) = c(predictor_names, "Class")
    df %<>% mutate_if(is.factor, ~factor(trimws(.)))
    return(df)
  }else{
    return(NA)
  }
}

# Generating a named list of data.frames containing all KEEL datasets
keelData = function(kDataFullLinks, datasetNames, numAttributes,
                    dest = here::here("Data/imbalancedDatasets")){
  # download each zip archive into the destination folder
  Map(function(u, d) download.file(u, d, mode = "wb"),
      kDataFullLinks,
      paste0(dest, "/", datasetNames, ".zip"))
  subfolder_names = list.files(path = dest, pattern = ".zip", full.names = T)
  List_of_dfs = list()
  for (counter in 1:length(subfolder_names)) {
    List_of_dfs[[counter]] = unzip_read(subfolder_names[counter])
    df_name = gsub(".zip", "", gsub(".*/", "", subfolder_names[counter]))
    names(List_of_dfs)[counter] = df_name
  }
  return(List_of_dfs)
}
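# Example usage of keelData()/unzip_read()/nvars() (a minimal sketch; the URLs and
# dataset names below are placeholders, not links taken from this project).
# Wrapped in `if (FALSE)` so it is not run when the script is sourced.
if (FALSE) {
  example_links <- c("https://example.org/keel/glass1.zip",   # placeholder URL
                     "https://example.org/keel/yeast3.zip")   # placeholder URL
  example_names <- c("glass1", "yeast3")
  keel_list <- keelData(example_links, example_names,
                        numAttributes = NA)   # numAttributes is currently unused by keelData()
  str(keel_list, max.level = 1)
  nvars(keel_list[["glass1"]])                # numeric vs. factor variable counts for one dataset
}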
# Applying multiple machine learning models (CART and logistic regression) to one
# dataset/metric/subsampling combination and collecting the 5-fold cross-validation results
mainfunction <- function(i, dataset, cases){
  trainData = dataset$data[[cases$ID[i]]]
  best_metric <- cases$metrics[i]
  subsampling <- cases$subsampling[i]
  if (subsampling == "NULL") subsampling <- NULL

  # * Required Packages and Custom Functions --------------------------------
  # * * Packages ------------------------------------------------------------
  if(require(remotes) == FALSE) install.packages("remotes")
  if(require(DMwR) == FALSE) remotes::install_github("cran/DMwR")
  pacman::p_load(tidyverse, magrittr, janitor, DMwR, caret, caretEnsemble, recipes,
                 yardstick, rpart, glmnet, nnet, PRROC, conflicted)

  # * * Custom Functions ----------------------------------------------------
  # get_all() computes per-resample performance measures from a caret model's saved predictions
  get_all <- function(model){
    Accuracy    <- model$pred %>% group_by(Resample) %>% yardstick::accuracy(obs, pred)
    Sensitivity <- model$pred %>% group_by(Resample) %>% yardstick::sens(obs, pred)
    Specificity <- model$pred %>% group_by(Resample) %>% yardstick::spec(obs, pred)
    AUROC       <- model$pred %>% group_by(Resample) %>% yardstick::roc_auc(obs, positive)
    Precision   <- model$pred %>% group_by(Resample) %>% yardstick::precision(obs, pred)
    F1          <- model$pred %>% group_by(Resample) %>% yardstick::f_meas(obs, pred)
    AUPRC       <- model$pred %>% group_by(Resample) %>% yardstick::pr_auc(obs, positive)
    Resample_result <- cbind.data.frame(Resample = Sensitivity$Resample,
                                        Accuracy = Accuracy$.estimate,
                                        Sensitivity = Sensitivity$.estimate,
                                        Specificity = Specificity$.estimate,
                                        AUROC = AUROC$.estimate,
                                        Precision = Precision$.estimate,
                                        F1 = F1$.estimate,
                                        AUPRC = AUPRC$.estimate)
    colnames(Resample_result)[-1] <- paste0(model$method, "~", colnames(Resample_result)[-1])
    return(Resample_result)
  }

  # prSummary0() is a drop-in replacement for caret::prSummary that computes AUPRC with yardstick
  prSummary0 <- function (data, lev = NULL, model = NULL) {
    caret:::requireNamespaceQuietStop("yardstick")
    if (length(levels(data$obs)) > 2)
      stop(paste("Your outcome has", length(levels(data$obs)),
                 "levels. The `prSummary0` function isn't appropriate."),
           call. = FALSE)
    if (!all(levels(data[, "pred"]) == levels(data[, "obs"])))
      stop("Levels of observed and predicted data do not match.", call. = FALSE)
    auprc <- try(yardstick::pr_auc(data, obs, lev[1]), silent = TRUE)
    if (inherits(auprc, "try-error")){
      auprc <- NA
    }else{
      auprc <- auprc$.estimate
    }
    return(c(AUC = auprc,
             Precision = caret:::precision.default(data = data$pred, reference = data$obs,
                                                   relevant = lev[1]),
             Recall = caret:::recall.default(data = data$pred, reference = data$obs,
                                             relevant = lev[1]),
             F = caret:::F_meas.default(data = data$pred, reference = data$obs,
                                        relevant = lev[1])))
  }

  trainData %<>% mutate_if(is.character, as.factor) %>% clean_names() %>%
    mutate(class = relevel(class, ref = "positive"))
  #table(trainData$class) %>% prop.table()

  # * Data Preprocessing ----------------------------------------------------
  recSteps = recipe(class ~ ., data = trainData) %>%
    step_nzv(all_predictors()) %>%
    step_YeoJohnson(all_numeric()) %>%
    step_normalize(all_numeric()) %>%
    step_dummy(all_nominal(), -all_outcomes())  # create dummy variables for categorical predictors

  if (best_metric == "Accuracy") outputFun = defaultSummary
  if (best_metric == "ROC")      outputFun = twoClassSummary
  if (best_metric == "AUC")      outputFun = prSummary0

  if (cases$subsampling[i] == "NULL")  set.seed((11111 + cases$ID[i]))
  if (cases$subsampling[i] == "down")  set.seed((22222 + cases$ID[i]))
  if (cases$subsampling[i] == "up")    set.seed((33333 + cases$ID[i]))
  if (cases$subsampling[i] == "smote") set.seed((44444 + cases$ID[i]))
  # each vector of seeds must be at least as long as the tuning grid (tuneLength = 30 for rpart below)
  seeds <- vector(mode = "list", length = 5)
  for(j in 1:5) seeds[[j]] <- sample.int(nrow(trainData), 30)
  ## For the last model:
  seeds[[6]] <- sample.int(nrow(trainData), 1)

  # * Cross Validation Setup ------------------------------------------------
  fitControl = trainControl(
    method = "cv",                  # k-fold cross validation
    number = 5,                     # number of folds
    search = "grid",                # default grid search for parameter tuning
    sampling = subsampling,         # if no subsampling is requested, this is NULL
    summaryFunction = outputFun,    # see custom function above
    classProbs = T,                 # return class probabilities
    selectionFunction = "best",     # pick the tuning parameters with the best summary metric
    savePredictions = 'final',
    # note: createResample() draws bootstrap resamples; use createFolds() for true k-fold CV
    index = createResample(trainData$class, 5),
    seeds = seeds)

  models = caretList(recSteps, data = trainData, metric = best_metric,
                     tuneList = list(
                       cart = caretModelSpec(method = "rpart", tuneLength = 30),
                       glm  = caretModelSpec(method = "glm", family = "binomial", maxit = 100)
                     ),
                     trControl = fitControl,
                     continue_on_fail = T)

  # How to extract the cross validation results ----
  temp = cbind(get_all(models$cart), get_all(models$glm)[, -1])
  temp$subsampling = toupper(cases$subsampling[i])  # ideally read automatically from fitControl
  temp$metric = best_metric                         # ideally read automatically from the caretList call
  temp$dataset = dataset$dataset[cases$ID[i]]       # ideally read automatically from the dataset name
  temp$imbalance_ratio = dataset$imbalance_ratio[cases$ID[i]]
  temp2 = temp %>% pivot_longer(cols = contains('~'))
  temp2$method = ifelse(str_detect(temp2$name, 'glm'), 'GLM', 'CART')
  temp2$measure = str_replace(temp2$name, '.*~', '')
  temp2 %<>% select(dataset, imbalance_ratio, subsampling, metric, method, Resample, measure, value)
  return(temp2)
}
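# Example usage of mainfunction() (a minimal sketch; the column layout of `dataset`
# and `cases` below is inferred from how mainfunction() indexes them and is not
# confirmed elsewhere in this script). Not run when the script is sourced.
if (FALSE) {
  # keel_list as constructed in the keelData() example above
  dataset <- tibble::tibble(
    ID              = seq_along(keel_list),
    dataset         = names(keel_list),
    data            = keel_list,                 # list-column of data.frames
    imbalance_ratio = sapply(keel_list, function(d){
      tab <- table(d$Class); max(tab) / min(tab)
    })
  )
  cases <- expand.grid(ID          = dataset$ID,
                       metrics     = c("Accuracy", "ROC", "AUC"),
                       subsampling = c("NULL", "down", "up", "smote"),
                       stringsAsFactors = FALSE)
  cv_results <- purrr::map_dfr(seq_len(nrow(cases)), mainfunction,
                               dataset = dataset, cases = cases)
}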
# allinone() fits a single model ("glm" or "rpart") on one training dataset and either
# returns performance measures computed on the test dataset (index = 0) or the prediction
# results themselves (index = 1). This is our custom function for a list of training
# datasets plus a held-out test dataset.
allinone <- function(i, datasets, testdata, cases, IR, method, index = 0){
  testdata0 <- testdata[[cases$ID[i]]]
  trainData = datasets[[cases$ID[i]]][[cases$iteration[i]]]
  best_metric <- cases$metrics[i]
  subsampling <- cases$subsampling[i]
  if (!(method %in% c("glm", "rpart")))
    return("Error: This algorithm only works with 'glm' or 'rpart'.")
  if (!(subsampling %in% c("NULL", "down", "up", "smote")))
    return("Error: This algorithm only works with subsampling strategies: 'NULL', 'down', 'up', 'smote'.")
  if (subsampling == "NULL") subsampling <- NULL

  # * Required Packages and Custom Functions --------------------------------
  # * * Packages ------------------------------------------------------------
  if(require(remotes) == FALSE) install.packages("remotes")
  if(require(DMwR) == FALSE) remotes::install_github("cran/DMwR")
  pacman::p_load(tidyverse, magrittr, janitor, DMwR, caret, caretEnsemble, recipes,
                 yardstick, rpart, glmnet, nnet, PRROC, conflicted)

  # * Data Preprocessing ----------------------------------------------------
  recSteps = recipe(State ~ ., data = trainData) %>%
    step_nzv(all_predictors()) %>%
    step_YeoJohnson(all_numeric()) %>%
    step_normalize(all_numeric())

  if (best_metric == "Accuracy") outputFun = defaultSummary
  if (best_metric == "ROC")      outputFun = twoClassSummary
  if (best_metric == "AUC")      outputFun = prSummary  # caret's precision-recall summary (requires MLmetrics)

  if (cases$subsampling[i] == "NULL")  set.seed((11111 + cases$ID[i]) + cases$iteration[i])
  if (cases$subsampling[i] == "down")  set.seed((22222 + cases$ID[i]) + cases$iteration[i])
  if (cases$subsampling[i] == "up")    set.seed((33333 + cases$ID[i]) + cases$iteration[i])
  if (cases$subsampling[i] == "smote") set.seed((44444 + cases$ID[i]) + cases$iteration[i])
  seeds <- vector(mode = "list", length = 5)
  for(j in 1:5) seeds[[j]] <- sample.int(nrow(trainData), 100)
  ## For the last model:
  seeds[[6]] <- sample.int(nrow(trainData), 1)

  # * Cross Validation Setup ------------------------------------------------
  fitControl = trainControl(
    method = "cv",                  # k-fold cross validation
    number = 5,                     # number of folds
    search = "grid",                # default grid search for parameter tuning
    sampling = subsampling,         # if no subsampling is requested, this is NULL
    summaryFunction = outputFun,    # see above
    classProbs = T,                 # return class probabilities
    selectionFunction = "best",     # pick the tuning parameters with the best summary metric
    savePredictions = 'final',
    index = createResample(trainData$State, 5),
    seeds = seeds)

  # Model Training ----------------------------------------------------------
  if (method == "glm"){
    model = try(train(recSteps, data = trainData, metric = best_metric,
                      method = "glm", family = "binomial", maxit = 100,
                      trControl = fitControl),
                silent = TRUE)
  }
  if (method == "rpart"){
    model = try(train(recSteps, data = trainData, metric = best_metric,
                      method = "rpart", tuneLength = 30,
                      trControl = fitControl),
                silent = TRUE)
  }
  if (inherits(model, "try-error")){
    final_result <- data.frame(cases[i, 3:4], IR = IR[cases$ID[i]],
                               Accuracy = NA, Sensitivity = NA, Specificity = NA,
                               AUROC = NA, Precision = NA, F1 = NA, AUPRC = NA)
    return(final_result)
  }

  pred <- predict(model, testdata0)
  diseased <- predict(model, testdata0, type = "prob")$diseased
  obs <- testdata0$State
  result <- data.frame(obs, pred, diseased)
  if (index == 1) return(result)
  out1 <- defaultSummary(result, lev = levels(trainData$State))
  out2 <- twoClassSummary(result, lev = levels(trainData$State))
  out3 <- prSummary(result, lev = levels(trainData$State))
  final_result <- data.frame(cases[i, 3:4], IR = IR[cases$ID[i]],
                             Accuracy = unname(out1["Accuracy"]),
                             Sensitivity = unname(out2["Sens"]),
                             Specificity = unname(out2["Spec"]),
                             AUROC = unname(out2["ROC"]),
                             Precision = unname(out3["Precision"]),
                             F1 = unname(out3["F"]),
                             AUPRC = unname(out3["AUC"]))
  return(final_result)
}
# The modeling() function requires the following arguments:
# trainData: a dataset used for training a given model
# testData: a dataset used for testing the performance of the model
# response: a string, the name of the binary response variable
# subsampling: subsampling method (one of "NULL", "down", "up", "smote")
# best_metric: the metric used to select the model ("Accuracy", "ROC", or "AUC")
# method: either "glm" or "rpart" (logistic regression or decision tree)
# index = 0: returns the performance measures based on the test dataset
# index = 1: returns the prediction results based on the test dataset
# index = 2: returns both the performance measures and the prediction results
# seed: random seed
modeling <- function(trainData, testData, response, subsampling, best_metric,
                     method, index = 0, seed = 2021){
  # Required Packages and Custom Functions
  if(require(remotes) == FALSE) install.packages("remotes")
  if(require(DMwR) == FALSE) remotes::install_github("cran/DMwR")
  pacman::p_load(tidyverse, magrittr, janitor, DMwR, caret, caretEnsemble, recipes,
                 yardstick, rpart, glmnet, nnet, PRROC, conflicted)

  if (!(method %in% c("glm", "rpart")))
    return("Error: This algorithm only works with 'glm' or 'rpart'.")
  if (!(subsampling %in% c("NULL", "down", "up", "smote")))
    return("Error: This algorithm only works with subsampling strategies: 'NULL', 'down', 'up', 'smote'.")
  if (subsampling == "NULL") subsampling <- NULL

  # rename the response column to "class" in both datasets
  i1 <- which(colnames(trainData) == response)
  i2 <- which(colnames(testData) == response)
  colnames(trainData)[i1] <- "class"
  colnames(testData)[i2] <- "class"
  trainData %<>% mutate_if(is.character, as.factor) %>%
    mutate(class = relevel(class, ref = levels(class)[1]))

  # * Data Preprocessing ----------------------------------------------------
  recSteps = recipe(class ~ ., data = trainData) %>%
    step_nzv(all_predictors()) %>%
    step_YeoJohnson(all_numeric()) %>%
    step_normalize(all_numeric()) %>%
    step_dummy(all_nominal(), -all_outcomes())  # create dummy variables for categorical predictors

  set.seed(seed)
  # each vector of seeds must be at least as long as the tuning grid (tuneLength = 30 for rpart below)
  seeds <- vector(mode = "list", length = 5)
  for(j in 1:5) seeds[[j]] <- sample.int(nrow(trainData), 30)
  ## For the last model:
  seeds[[6]] <- sample.int(nrow(trainData), 1)

  if (best_metric == "Accuracy") outputFun = defaultSummary
  if (best_metric == "ROC")      outputFun = twoClassSummary
  if (best_metric == "AUC")      outputFun = prSummary

  # * Cross Validation Setup ------------------------------------------------
  fitControl = trainControl(
    method = "cv",                  # k-fold cross validation
    number = 5,                     # number of folds
    search = "grid",                # default grid search for parameter tuning
    sampling = subsampling,         # if no subsampling is requested, this is NULL
    summaryFunction = outputFun,    # see above
    classProbs = T,                 # return class probabilities
    selectionFunction = "best",     # pick the tuning parameters with the best summary metric
    savePredictions = 'final',
    index = createResample(trainData$class, 5),
    seeds = seeds)

  # Model Training ----------------------------------------------------------
  if (method == "glm"){
    model = train(recSteps, data = trainData, metric = best_metric,
                  method = "glm", family = "binomial", maxit = 100,
                  trControl = fitControl)
  }
  if (method == "rpart"){
    model = train(recSteps, data = trainData, metric = best_metric,
                  method = "rpart", tuneLength = 30,
                  trControl = fitControl)
  }

  pred <- predict(model, testData)
  prob <- predict(model, testData, type = "prob")[levels(trainData$class)[1]]
  obs <- testData$class
  result <- data.frame(obs, pred, prob)
  if (index == 1) return(result)
  out1 <- defaultSummary(result, lev = levels(trainData$class))
  out2 <- twoClassSummary(result, lev = levels(trainData$class))
  out3 <- prSummary(result, lev = levels(trainData$class))
  final_result <- data.frame(Accuracy = unname(out1["Accuracy"]),
                             Sensitivity = unname(out2["Sens"]),
                             Specificity = unname(out2["Spec"]),
                             AUROC = unname(out2["ROC"]),
                             Precision = unname(out3["Precision"]),
                             F1 = unname(out3["F"]),
                             AUPRC = unname(out3["AUC"]))
  if (index == 0) return(final_result)
  if (index == 2) return(list(Performance = final_result, Prediction = result))
}
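# Example usage of modeling() (a minimal sketch on simulated data; `sim_train`,
# `sim_test`, and the "outcome" column are hypothetical and only illustrate the
# documented arguments). Not run when the script is sourced.
if (FALSE) {
  set.seed(123)
  make_sim <- function(n){
    x1 <- rnorm(n); x2 <- rnorm(n)
    outcome <- factor(ifelse(x1 + x2 + rnorm(n) > 1.5, "positive", "negative"))
    data.frame(x1, x2, outcome)
  }
  sim_train <- make_sim(300)
  sim_test  <- make_sim(150)

  # performance measures on the test set (index = 0)
  perf <- modeling(sim_train, sim_test, response = "outcome",
                   subsampling = "up", best_metric = "ROC",
                   method = "rpart", index = 0, seed = 2021)

  # both performance measures and raw predictions (index = 2)
  both <- modeling(sim_train, sim_test, response = "outcome",
                   subsampling = "NULL", best_metric = "Accuracy",
                   method = "glm", index = 2)
  both$Performance
  head(both$Prediction)
}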