# Functions to accompany our analysis
#_________________________________________________________________________

# Determining and Plotting the Data Types for all columns
# Source:
# https://stackoverflow.com/questions/21125222/determine-the-data-types-of-a-data-frames-columns

col_missing_function <- function(input_data) {
  # col_missing_function(): a function that finds the % of missing values in each column of the data
  # input_data: a data frame
  na_count_col <- apply(input_data, 2, function(y) length(which(is.na(y)))/nrow(input_data)) # counting NAs in each column
  na_count_col <- data.frame(na_count_col) # transforming this into a data frame
  return(na_count_col)
}

dataTypes <- function(input_data) {
  # dataTypes(): a function that finds the data type of each variable in the data
  # input_data: a data frame
  col.classes <- lapply(input_data, class)
  col.classes.input_data <- unlist(col.classes) %>% data.frame()
  temp.data <- table(col.classes.input_data) %>% sort(decreasing=T) %>% data.frame()
  return(temp.data)
}

detect_terms <- function(x, term){
  # detect_terms(): a function that detects a term in each row of the data
  # x: a row in the data
  # term: the term to detect
  result <- sapply(x, function(y) any(gsub(" ", "", strsplit(y, ",")[[1]])==term))
  if (all(is.na(result))) return(NA)
  result <- ifelse(is.na(result), FALSE, result)
  if (any(result==TRUE)) return("Y")
  return("N")
}

DOE_function <- function(ii, design_scenario, all_data, y, all_ID, holdout_ID_index,
                         all_features, return.prediction=FALSE, seed) {
  # DOE_function(): a function that applies the data to the model specified in the design_scenario
  # ii: index used to navigate to a given scenario
  # design_scenario: a data frame that contains all scenarios
  # all_data: a list of all datasets
  # y: the response variable that we want to keep in the data
  # all_ID: a list of all sample IDs for the datasets
  # holdout_ID_index: a list that allows finding the corresponding holdout data
  # all_features: a list of features based on the feature selection methods
  # return.prediction: TRUE returns the predicted probabilities for both training and holdout data
  # seed: random seed

  # since this function is usually used with a parallel algorithm, we include all
  # necessary packages
  if(require(pacman)==FALSE) install.packages("pacman") # needs to be installed first
  # p_load is equivalent to combining both install.packages() and library()
  pacman::p_load(AUC, caret, conflicted, DMwR, dplyr, e1071, gbm, kernlab, MASS,
                 naivebayes, ranger, ROSE, xgboost)
  conflict_prefer("select", "dplyr")
  scenario <- design_scenario[ii,]
  set.seed(seed)

  # obtain the necessary information regarding the scenario
  fold <- scenario$fold
  data_scenario <- paste(scenario$imputation_num, scenario$imputation_cat, scenario$encoding, sep="_")
  features <- all_features[[scenario$feature_selection]][[fold]][[data_scenario]]
  algorithm <- scenario$algorithm
  subsampling <- scenario$subsampling
  ID_index <- holdout_ID_index[[fold]]
  input_data <- all_data[[data_scenario]]
  Holdout_ID <- all_ID[[data_scenario]][ID_index[[data_scenario]][1]:ID_index[[data_scenario]][2]]

  # obtain the training and holdout datasets
  training_data <- input_data[!input_data$ID%in%Holdout_ID,] %>% select(c(features, y))
  hold_out <- input_data[input_data$ID%in%Holdout_ID,] %>% select(c(features, y))

  # make sure the response variable is categorical
  training_data[[y]] <- as.factor(training_data[[y]])
  levels(training_data[[y]]) <- list(Survived="Survived", Died="Died")
  hold_out[[y]] <- as.factor(hold_out[[y]])
  levels(hold_out[[y]]) <- list(Survived="Survived", Died="Died")

  # define the formula that we will use in the modeling
  formul <- as.formula(paste0(as.character(y), "~."))

  # decide the subsampling method used in the modeling
  if (subsampling=="none"){
    sampling.method <- NULL
  }else{
    sampling.method <- subsampling
  }

  # define the g-mean function
  gmeanfunction <- function(data, lev = NULL, model = NULL) {
    sub_sens <- caret::sensitivity(data$pred, data$obs)
    sub_spec <- caret::specificity(data$pred, data$obs)
    return(c(gmean = sqrt(sub_sens*sub_spec)))
  }

  # 5-fold cross-validation is used
  control_setting <- caret::trainControl(method = "cv", number = 5,
                                         sampling = sampling.method,
                                         verboseIter = TRUE, search = "random",
                                         classProbs = TRUE,
                                         selectionFunction = "best",
                                         summaryFunction = gmeanfunction)

  # decide the method used in the train function
  find_train_method <- function(input_algorithm){
    All_algorithms <- c("ANN", "Bagging", "DT", "ElasticNet", "KPLSR", "LDA",
                        "LR", "NB", "RF", "SVM", "XGB")
    All_methods <- c("nnet", "treebag", "rpart", "glmnet", "kernelpls", "lda",
                     "glm", "naive_bayes", "ranger", "svmRadial", "xgbDART")
    method <- All_methods[which(All_algorithms==input_algorithm)]
    return(method)
  }
  alg_method <- find_train_method(algorithm)
  scenario$encoding <- ifelse(scenario$encoding=="OneHot", "One-Hot", scenario$encoding)

  # train the model
  Success <- 1
  if (alg_method %in% c("glm", "kernelpls", "nnet", "svmRadial")){
    if((class(try(
      result_model <- train(formul, data=training_data, method=alg_method,
                            family="binomial", trControl = control_setting,
                            metric="gmean"), silent = TRUE))=="try-error")[1]){Success <- 0}
  }else if (alg_method %in% c("earth", "gbm", "glmnet", "naive_bayes", "ranger", "rpart", "xgbDART", "xgbTree")){
    if((class(try(
      result_model <- train(formul, data=training_data, method=alg_method,
                            trControl = control_setting, tuneLength=10,
                            metric="gmean"), silent = TRUE))=="try-error")[1]){Success <- 0}
  }else if(alg_method=="lda"){
    if((class(try(
      result_model <- train(formul, data=training_data, method=alg_method,
                            preProcess="pca", preProcOptions = list(method="BoxCox"),
                            trControl = control_setting,
                            metric="gmean"), silent = TRUE))=="try-error")[1]){Success <- 0}
  }else if(alg_method=="treebag"){
    if((class(try(
      result_model <- train(formul, data=training_data, method=alg_method,
                            family="binomial", trControl = control_setting, tuneLength=10,
                            metric="gmean"), silent = TRUE))=="try-error")[1]){Success <- 0}
  }

  if (Success==1){
    Predicted_Probability <- rep(list(NA), 2)
    names(Predicted_Probability) <- c("training_data", "holdout_data")
    Predicted_Probability[["holdout_data"]] <- as.data.frame(matrix(NA, nrow=nrow(hold_out), ncol=3))
    colnames(Predicted_Probability[["holdout_data"]]) <- c(y, algorithm, "probability")
    Predicted_Probability[["holdout_data"]][,algorithm] <- predict(result_model, newdata=hold_out, type="raw")
    Predicted_Probability[["holdout_data"]][,"probability"] <- predict(result_model, newdata=hold_out, type="prob")[,2]

    # to simplify the code, P denotes the "probability" column of the holdout predictions
    # and R denotes the predicted values for the response variable
    P <- Predicted_Probability[["holdout_data"]][,"probability"]
    R <- Predicted_Probability[["holdout_data"]][,algorithm]
    Auc <- AUC::auc(roc(P, hold_out[[y]]))
    Sen <- caret::sensitivity(R, hold_out[[y]])
    Spec <- caret::specificity(R, hold_out[[y]])
    Accu <- (as.data.frame(confusionMatrix(R, hold_out[[y]])$overall))[1,]
    G_mean <- sqrt(Sen*Spec)
    Result <- unname(c(unlist(scenario), Auc, Sen, Spec, Accu, G_mean))
    if (return.prediction==FALSE){
      return(Result)
    }else{
      Predicted_Probability[["training_data"]] <- as.data.frame(matrix(NA, nrow=nrow(training_data), ncol=3))
      colnames(Predicted_Probability[["training_data"]]) <- c(y, algorithm, "probability")
      Predicted_Probability[["training_data"]][,y] <- training_data[,y]
      Predicted_Probability[["training_data"]][,algorithm] <- predict(result_model, newdata=training_data, type="raw")
      Predicted_Probability[["training_data"]][,"probability"] <- predict(result_model, newdata=training_data, type="prob")[,2]
      Predicted_Probability[["holdout_data"]][,y] <- hold_out[,y]
      return(list(Result, Predicted_Probability))
    }
  }else{
    return(c(unname(unlist(scenario)), NA, NA, NA, NA, NA))
  }
}

dropLowVar <- function(input_data, percent) {
  # dropLowVar(): a function that drops categorical variables with low variation
  # input_data: a data frame
  # percent: a categorical variable is dropped when the relative frequency (in %) of its most
  #          common level, computed over non-missing values, is greater than or equal to this value
  charColsToKeep <- input_data[, sapply(input_data, class)=="character"] %>%
    discard(~ max(table(.x))/sum(!is.na(.x)) * 100 >= percent) %>%
    colnames()
  nonCharColsToKeep <- input_data[, sapply(input_data, class)!="character"] %>% names()
  input_data %<>%
    select(c(all_of(charColsToKeep), all_of(nonCharColsToKeep))) %>%
    select(sort(names(.)))
  return(input_data)
}

dropNumericMissing <- function(input_data, percent) {
  # dropNumericMissing(): a function that drops numerical variables with too many missing values
  # input_data: a data frame
  # percent: a numerical variable is dropped when its % of missing values is greater than or equal to this value
  numColsToKeep <- input_data[, sapply(input_data, class)=="numeric"] %>%
    discard(~ sum(is.na(.x))/length(.x) * 100 >= percent) %>%
    colnames()
  nonNumColsToKeep <- input_data[, sapply(input_data, class)!="numeric"] %>% names()
  input_data %<>%
    select(c(all_of(numColsToKeep), all_of(nonNumColsToKeep))) %>%
    select(sort(names(.)))
  return(input_data)
}

encode_category <- function(input_data, method) {
  # encode_category(): a function that encodes categorical variables
  # input_data: a data frame
  # method: "Label" or "Onehot"
  input_data %<>% mutate_all(as.factor)
  if (method=="Label"){
    input_data %<>% mutate_all(as.numeric) %>% mutate_all(as.factor) %>% mutate_all(droplevels)
  }else if (method=="Onehot"){
    numLevels <- apply(input_data, 2, n_distinct)
    dependence <- cumsum(unname(numLevels))
    input_data <- one_hot(data.table(input_data), dropCols = TRUE)
    input_data %<>% select(!all_of(dependence)) %>% mutate_all(as.factor)
  }
  return(input_data)
}

Get_holdout_index <- function(ID) {
  # Get_holdout_index(): a function that returns a list of index intervals (a, b) such that
  #                      the holdout data correspond to ID[a:b]
  # ID: a list that contains the IDs for several datasets
  k <- unlist(lapply(ID, length))
  I <- sapply(round(k/5), function(x) x*seq(0,4))
  ID_data <- data.frame(rbind(I, k))
  row.names(ID_data) <- NULL
  ID_index <- rep(list(NULL), 5)
  for (i in 1:5){
    ID_index[[i]] <- lapply(ID_data, function(x) c((x[i]+1), x[(i+1)]))
  }
  return(ID_index)
}

recodeGSTATUS <- function(gStatus, gTime, targetYear = 1) {
  # recodeGSTATUS(): a function that decides the survival status at targetYear
  # gStatus: a binary vector
  # gTime: a continuous vector regarding time
  # targetYear: target year
  gStatus %<>% as.numeric()
  gTime %<>% as.numeric()
  response <- ifelse(gTime > 365*targetYear, 'Survived',
                     ifelse(gStatus==0, NA, 'Died'))
  return(response)
}
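# A worked example of recodeGSTATUS() (illustrative values only): with targetYear = 1,
# a time above 365 days is labeled 'Survived' regardless of status, while shorter times
# are 'Died' when the event occurred (gStatus = 1) and NA (censored) when gStatus = 0.
# recodeGSTATUS(gStatus = c(1, 0, 1), gTime = c(400, 200, 100), targetYear = 1)
# #> [1] "Survived" NA         "Died"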
row_missing_function <- function(input_data) {
  # row_missing_function(): a function that finds the % of missing values in each row of the data
  # input_data: a data frame
  na_count_row <- apply(input_data, 1, function(y) length(which(is.na(y)))/ncol(input_data)) # counting NAs in each row
  na_count_row <- data.frame(na_count_row) # transforming this into a data frame
  return(na_count_row)
}

search_complete <- function(input_data, max.iter=100000) {
  # search_complete(): a heuristic algorithm that repeatedly drops the rows and columns
  #                    with the highest percentage of missing values until only
  #                    non-empty cells remain
  # input_data: a data frame
  # max.iter: the maximum number of iterations
  iter <- 0
  data_temp <- input_data
  while (iter<=max.iter){
    count_col <- col_missing_function(input_data)
    count_row <- row_missing_function(input_data)
    max_col <- max(count_col)
    max_row <- max(count_row)
    max_emp <- max(max_col, max_row)
    if (max_col==0){break()}
    if (max_row==0){break()}
    if (max_emp==max_row){
      if((nrow(input_data))>3){
        a <- which(count_row$na_count_row==max_row)
        data_temp <- input_data[-a,]
      }else{
        print("Data have less than 4 rows! The algorithm fails.")
        break()
      }
    }
    if (max_col>=max_row){
      b <- names(input_data[which(count_col$na_count_col==max_col)])
      if (max_emp==max_col){
        if ((ncol(input_data))>2){
          # in the next lines, we find the emptiest columns and remove them
          f <- which(count_col$na_count_col==max_col)
          data_temp <- input_data[, -f]
        }else{
          print("Data have less than 3 columns! The algorithm fails.")
          break()
        }
      }
    }else{
      if ((nrow(input_data))>3){
        a <- which(count_row$na_count_row==max_row)
        data_temp <- input_data[-a,]
      }else{
        print("Data have less than 4 rows! The algorithm fails.")
        break()
      }
    }
    iter <- iter + 1
    input_data <- data_temp
  }
  if (iter>max.iter){
    print("The algorithm fails. More iterations needed!")
  }else{
    return(data_temp)
  }
}

select_variables <- function(ii, all_data, y, method, ID_all, ID_index, seed=201905) {
  # select_variables(): a function for variable selection
  # ii: index used to navigate to different data sets
  # all_data: a list of all datasets
  # y: the response variable that we want to keep in the data
  # method: FFS, LASSO, or RF
  # ID_all: a list of all sample IDs for the datasets
  # ID_index: a list that allows finding the corresponding holdout data
  # seed: random seed
  if(require(pacman)==FALSE) install.packages("pacman")
  pacman::p_load(Biocomb, glmnet, party, ranger)
  set.seed(seed)
  input_data <- as.data.frame(all_data[[ii]])
  all_types <- unlist(lapply(input_data, class))
  char.index <- which(all_types=="factor")
  ID <- ID_all[[ii]][ID_index[[ii]][1]:ID_index[[ii]][2]]
  input_data <- input_data[!input_data$ID%in%ID,]
  input_data[,char.index] <- lapply(input_data[,char.index], droplevels)
  input_data <- input_data[,colnames(input_data)!="ID"]
  input_data[,y] <- as.factor(input_data[,y])
  X <- input_data[,colnames(input_data)!=y]
  if (method=="FFS"){
    X$y <- input_data[,y] # append the response as the last column
    disc <- "MDL"
    threshold <- 0.001
    attrs.nominal <- numeric()
    result_temp <- Biocomb::select.fast.filter(X, disc.method=disc, threshold=threshold,
                                               attrs.nominal=attrs.nominal)
    Imp_vars <- as.character(result_temp$Biomarker)
  }else if (method=="LASSO"){
    '%ni%' <- Negate('%in%')
    X <- data.matrix(apply(X, 2, as.numeric))
    glmnet1 <- glmnet::cv.glmnet(x=X, y=as.factor(input_data[,y]), type.measure='auc',
                                 family="binomial")
    co <- coef(glmnet1, s = "lambda.1se")
    co <- as.matrix(co[-1,])
    Imp_vars <- row.names(co)[which((co[,1]!=0))]
    Imp_vars <- Imp_vars[Imp_vars %ni% '(Intercept)']
  }else if (method=="RF"){
    Success <- 0
    iteration <- 0
    # refit a permutation-importance random forest until at least one importance score is
    # negative; variables whose importance exceeds |most negative score| are retained
    while (Success==0 & iteration <=10){
      rf <- ranger(formula(paste0(y, "~.")), data=input_data,
                   num.trees = 500, importance = "permutation",
                   scale.permutation.importance = FALSE,
                   mtry = floor(0.2 * ncol(X)), min.node.size = floor(0.1 * nrow(X)),
                   num.threads = 1, write.forest = TRUE)
      if (min(importance(rf))<0){
        Imp_values <- importance(rf)/abs(min(importance(rf)))
        Imp_vars <- names(which(Imp_values>1))
        Success <- 1
      }else{
        iteration <- iteration + 1
      }
    }
    if (iteration>10){
      print("At least 10 runs with no negative importance score!")
      Imp_vars <- NA
    }
  }
  output <- list(Imp_vars)
  names(output) <- names(ID_all)[ii]
  return(output)
}

train_iso <- function(iii, t_data, h_data, features, folds=5, resampl_meth="up", alg_used, seed0=2019){
  # train_iso(): a function that trains a desired machine learning algorithm over multiple time stamps
  # iii: index used to navigate to different time stamps
  # t_data: the training datasets
  # h_data: the holdout datasets
  # features: independent features
  # folds: number of folds for cross-validation
  # resampl_meth: the resampling algorithm for balancing the dependent variable
  # alg_used: the selected machine learning algorithm
  # seed0: the random seed

  # since this function is usually used with a parallel algorithm, we include all
  # necessary packages
  gc()
  if(!"pacman" %in% rownames(installed.packages())){
    install.packages(pkgs = "pacman", repos = "http://cran.us.r-project.org")
  }
  # p_load is equivalent to combining both install.packages() and library()
  pacman::p_load(caret, AUC, MASS, ROSE, DMwR, snow, ranger, parallel, xgboost, gbm,
                 naivebayes, e1071, kernlab, pls)
  set.seed(seed0)
  fold_no <- folds
  alg_fact <- alg_used
  resample_fact <- resampl_meth
  coef <- "NO"
  var_imp <- "NO"
  df <- t_data[[iii]]
  hold_out_ <- h_data[[iii]]
  traindata <- df[c(as.character(features[[paste0("year",iii-1)]][,]), paste0("year",iii-1))]
  hold_out <- hold_out_[c(as.character(features[[paste0("year",iii-1)]][,]), paste0("year",iii-1), "ID")]
  TARGET0 <- paste0("year",iii-1)
  traindata[[TARGET0]] <- as.factor(traindata[[TARGET0]])
  levels(traindata[[TARGET0]]) <- list(Survival="Survival", Death="Death")
  hold_out[[TARGET0]] <- as.factor(hold_out[[TARGET0]])
  levels(hold_out[[TARGET0]]) <- list(Survival="Survival", Death="Death")
  formul <- as.formula(paste0(as.character(TARGET0), "~."))
  run.code <- "YES"
  sampling.method <- resample_fact
  if(resample_fact=="none"){sampling.method <- NULL}

  # Reference for the geometric mean:
  # Kim, Myoung-Jong, Dae-Ki Kang, and Hong Bae Kim. "Geometric mean based boosting
  # algorithm with over-sampling to resolve data imbalance problem for bankruptcy prediction."
  # Expert Systems with Applications 42.3 (2015): 1074-1082.
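  # The summary function below computes the geometric mean of sensitivity and specificity,
  # g-mean = sqrt(sensitivity * specificity), which caret then uses as the cross-validation
  # selection metric (metric = "gmean" in the train() calls further down).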
  gmeanfunction <- function(data, lev = NULL, model = NULL) {
    sub_sens <- caret::sensitivity(data$pred, data$obs)
    sub_spec <- caret::specificity(data$pred, data$obs)
    c(gmean = sqrt(sub_sens*sub_spec))
  }

  # 5-fold cross-validation is used
  control_setting <- caret::trainControl(method = "cv", number = fold_no,
                                         sampling = sampling.method,
                                         #summaryFunction = twoClassSummary,
                                         # the next one is for saving the predictions
                                         savePredictions = "all",
                                         search = "random", classProbs = TRUE,
                                         selectionFunction = "best",
                                         summaryFunction = gmeanfunction)

  if (alg_fact %in% c("glm", "nnet", "svmRadial")){
    if((class(try(
      result_model <- train(formul, data=traindata, method=alg_fact,
                            family="binomial", trControl = control_setting,
                            metric="gmean"), silent = TRUE))=="try-error")[1]){run.code <- "NO"}
  }else if (alg_fact %in% c("rf", "gbm", "earth", "rpart", "xgbTree", "naive_bayes", "xgbDART", "ranger", "glmnet")){
    if((class(try(
      result_model <- train(formul, data=traindata, method=alg_fact,
                            trControl = control_setting, tuneLength=10,
                            metric="gmean"), silent = TRUE))=="try-error")[1]){run.code <- "NO"}
  }else if(alg_fact=="lda"){
    if((class(try(
      result_model <- train(formul, data=traindata, method=alg_fact,
                            preProcess="pca", preProcOptions = list(method="BoxCox"),
                            trControl = control_setting,
                            metric="gmean"), silent = TRUE))=="try-error")[1]){run.code <- "NO"}
  }else if(alg_fact=="treebag"){
    if((class(try(
      result_model <- train(formul, data=traindata, method=alg_fact,
                            family="binomial", trControl = control_setting, tuneLength=10,
                            metric="gmean"), silent = TRUE))=="try-error")[1]){run.code <- "NO"}
  }

  if(run.code=="YES"){
    coef <- as.data.frame(summary(result_model)$coefficients)
    coef$vars <- rownames(coef)
    coef$vars <- as.character(sapply(coef$vars, function(x) gsub("1","",x)))
    coef$vars[which(coef$vars=="(Intercept)")] <- "INTERCEPT"
    coef$vars[match("DAYS_STAT1", rownames(coef))] <- "DAYS_STAT1"
    coef$vars[match("DAYS_STAT1A", rownames(coef))] <- "DAYS_STAT1A"
    coef$vars[match("DAYS_STAT1B", rownames(coef))] <- "DAYS_STAT1B"
    if(alg_fact=="glm"){
      var_imp <- as.data.frame(caret::varImp(result_model)$importance)
      var_imp$vars <- rownames(var_imp)
      var_imp$vars <- as.character(sapply(var_imp$vars, function(x) gsub("1","",x)))
      var_imp$vars[match("DAYS_STAT1", rownames(var_imp))] <- "DAYS_STAT1"
      var_imp$vars[match("DAYS_STAT1A", rownames(var_imp))] <- "DAYS_STAT1A"
      var_imp$vars[match("DAYS_STAT1B", rownames(var_imp))] <- "DAYS_STAT1B"
    }
    resul_raw <- as.data.frame(matrix(NA, ncol = 4, nrow = nrow(hold_out)))
    colnames(resul_raw) <- c("TARGET_raw", alg_fact, "Probability", "ID")
    resul_raw$TARGET_raw <- hold_out[as.character(TARGET0)]
    resul_raw$ID <- hold_out$ID
    #train_raw <- as.data.frame(matrix(NA, ncol = 3, nrow = nrow(traindata)))
    #colnames(train_raw) <- c("TARGET", alg_fact, "Probability")
    #train_raw$TARGET <- traindata[as.character(TARGET0)]
    resul_pred_perf <- as.data.frame(matrix(NA, ncol = 1, nrow = 5))
    colnames(resul_pred_perf) <- c(alg_fact)
    rownames(resul_pred_perf) <- c("auc","sen","spec","accu","gmean")
    resul_pred_perf_folds <- as.data.frame(matrix(NA, ncol = fold_no, nrow = 5))
    colnames(resul_pred_perf_folds) <- paste0("fold", 1:fold_no)
    rownames(resul_pred_perf_folds) <- c("auc","sen","spec","accu","gmean")
    fold_res <- list()
    for (j in 1:fold_no) {
      fold_res[[j]] <- result_model$pred[which(result_model$pred$Resample==paste0("Fold",j)),]
      resul_pred_perf_folds["auc",j] <- AUC::auc(roc(fold_res[[j]]$Survival, fold_res[[j]]$obs))
      resul_pred_perf_folds["sen",j] <- caret::sensitivity(fold_res[[j]]$pred, fold_res[[j]]$obs)
      resul_pred_perf_folds["spec",j] <- caret::specificity(fold_res[[j]]$pred, fold_res[[j]]$obs)
resul_pred_perf_folds["accu",j]<-(as.data.frame(confusionMatrix(fold_res[[j]]$pred, fold_res[[j]]$obs)$overall))[1,] resul_pred_perf_folds["gmean",j]<-sqrt(resul_pred_perf_folds["sen",j]*resul_pred_perf_folds["spec",j]) } fold_res[["performance"]]<-resul_pred_perf_folds resul_raw[alg_fact] <- predict(result_model, newdata=hold_out, type="raw") resul_raw$Probability <- predict(result_model, newdata=hold_out, type="prob")[,1] resul_pred_perf[1,1] <- AUC::auc(roc(resul_raw$Probability,as.factor(as.data.frame(hold_out[,TARGET0])[,1]))) resul_pred_perf[2,1] <- caret::sensitivity(resul_raw[,alg_fact],as.factor(as.data.frame(hold_out[,TARGET0])[,1])) resul_pred_perf[3,1] <- caret::specificity(resul_raw[,alg_fact],as.factor(as.data.frame(hold_out[,TARGET0])[,1])) resul_pred_perf[4,1] <- (as.data.frame(confusionMatrix(resul_raw[,alg_fact], as.factor(as.data.frame(hold_out[,TARGET0])[,1]))$overall))[1,] resul_pred_perf[5,1] <- sqrt(resul_pred_perf[2,1]*resul_pred_perf[3,1]) gmean_overal<-as.data.frame(result_model$results) gmean_folds<-as.data.frame(result_model$resample) equat<-as.data.frame(result_model$finalModel$coefficients) names(equat)<-"coef" equat$vars<-rownames(equat) equat$vars[which(equat$vars==c("(Intercept)"))]<-c("INTERCEPT") max_gmean<-max(result_model$resample$gmean) fold_pick<-result_model$resample[which(result_model$resample$gmean==max_gmean),c("Resample")] foldbest<-result_model$pred[which(result_model$pred$Resample==fold_pick),c("obs","pred","Survival")] names(foldbest)<-c(paste0("year",iii-1),"log","Probability") dir.create(paste0(getwd(),"/data_pile")) # the next line remove everything except for the mentioned data franes rm(list=setdiff(ls(), c("resul_pred_perf","resul_raw","alg_fact","iii","coef","var_imp", "gmean_overal","gmean_folds","equat","foldbest","fold_res"))) saveRDS(list(Performance=resul_pred_perf, Predicted=resul_raw,coef=coef,var_imp=var_imp,gmean_overal=gmean_overal,gmean_folds=gmean_folds, equat=equat,foldbest=foldbest ), paste0(getwd(),"/data_pile/",alg_fact,"-year-",iii-1,".rds")) return(list(Performance=resul_pred_perf, Predicted=resul_raw,coef=coef,var_imp=var_imp,gmean_overal=gmean_overal,gmean_folds=gmean_folds, equat=equat,foldbest=foldbest,folds_res=fold_res)) }else{ dir.create(paste0(getwd(),"/data_pile")) rm(list=setdiff(ls(), c("experiment.summary","alg_fact","iii","coef","var_imp"))) save.image(paste0(getwd(),"/data_pile/NOT-",alg_fact,"-",iii,".RData")) saveRDS(list(Performance="NOT", Predicted="NOT",coef=coef,var_imp=var_imp,gmean_overal="NOT",gmean_folds="NOT", equat="NOT",foldbest="NOT"), paste0(getwd(),"/data_pile/NOT-",alg_fact,"-year-",iii-1,".rds")) return(list(Performance="NOT", Predicted="NOT",coef=coef,var_imp=var_imp,gmean_overal="NOT",gmean_folds="NOT", equat="NOT",foldbest="NOT",folds_res="NO")) } } select_variables_iso<-function(ii,data,method,exclud="ID",folds=5,alpha=1,seed=110){ # select_variables_iso(): a function for the variable selection for mutiple time stamps # ii: index used to navigate to different data sets # data: a list of all datasets # exclud: the variable(s) that should be excluded from the dataserts # method: Fast Feature Correlation Based Selection (FFS), Least absolute shrinkage and selection operator (LASSO), Random Forest (RF) # seed: the random seed set.seed(seed) dependent<-paste0("year",ii-1) excluds<-c(exclud,dependent) dataset<-as.data.frame(data[[ii]]) X<-dataset[!names(dataset)%in%excluds] input_data<-X input_data[,dependent] <- as.factor(dataset[,dependent]) if (method=="FFS"){ X$y <- 
    disc <- "MDL"
    threshold <- 0.001
    attrs.nominal <- numeric()
    result_temp <- Biocomb::select.fast.filter(X, disc.method=disc, threshold=threshold,
                                               attrs.nominal=attrs.nominal)
    Imp_vars <- as.data.frame(as.character(result_temp$Biomarker))
    colnames(Imp_vars) <- c("variables")
    Imp_vars <- list(Imp_vars)
    names(Imp_vars) <- paste0("year", ii-1)
  }else if (method=="LASSO"){
    '%ni%' <- Negate('%in%')
    X <- data.matrix(apply(X, 2, as.numeric))
    glmnet1 <- glmnet::cv.glmnet(x=X, y=as.factor(dataset[,dependent]), type.measure='auc',
                                 nfolds=folds, alpha=alpha, family="binomial")
    co <- coef(glmnet1, s = "lambda.1se")
    co <- as.matrix(co[-1,])
    Imp_vars <- row.names(co)[which((co[,1]!=0))]
    Imp_vars <- as.data.frame(Imp_vars[Imp_vars %ni% '(Intercept)'])
    colnames(Imp_vars) <- c("variables")
    Imp_vars <- list(Imp_vars)
    names(Imp_vars) <- paste0("year", ii-1)
  }else if (method=="RF"){
    Success <- 0
    iteration <- 0
    while (Success==0 & iteration <=10){
      rf <- ranger(formula(paste0(dependent, "~.")), data=input_data,
                   num.trees = 500, importance = "permutation",
                   scale.permutation.importance = FALSE,
                   mtry = floor(0.2 * ncol(X)), min.node.size = floor(0.1 * nrow(X)),
                   num.threads = 1, write.forest = TRUE)
      if (min(importance(rf))<0){
        Imp_values <- importance(rf)/abs(min(importance(rf)))
        Imp_vars <- names(which(Imp_values>1))
        Success <- 1
      }else{
        iteration <- iteration + 1
      }
    }
    if (iteration>10){
      print("At least 10 runs with no negative importance score!")
      Imp_vars <- NA
    }
    Imp_vars <- as.data.frame(Imp_vars)
    colnames(Imp_vars) <- c("variables")
    Imp_vars <- list(Imp_vars)
    names(Imp_vars) <- paste0("year", ii-1)
  }
  return(Imp_vars)
}
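
#_________________________________________________________________________
# Illustrative usage (not run): a minimal sketch of how DOE_function() might be driven
# from a parallel driver script. The objects design_scenarios, datasets, IDs,
# holdout_index, and selected_features, as well as the response name "Survival_Status",
# are placeholders for inputs assembled elsewhere in the analysis, not objects defined
# in this file.
if (FALSE) {
  library(parallel)

  cl <- makeCluster(detectCores() - 1)

  # run every experimental scenario; each call returns one vector of scenario settings
  # plus AUC, sensitivity, specificity, accuracy, and g-mean on the holdout data
  results <- parLapply(cl, seq_len(nrow(design_scenarios)), DOE_function,
                       design_scenario = design_scenarios,
                       all_data = datasets,
                       y = "Survival_Status",
                       all_ID = IDs,
                       holdout_ID_index = holdout_index,
                       all_features = selected_features,
                       return.prediction = FALSE,
                       seed = 201905)
  results <- do.call(rbind, results)

  stopCluster(cl)
}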