# Notes: This script includes functions that are useful for machine learning
# and other modeling and analysis projects
# Author: John Curtin (jjcurtin@wisc.edu)


# Set conflict rules to resolve conflicts between tidymodels and tidyverse
tidymodels_conflictRules <- function(){
  conflictRules("scales", mask.ok = c("discard"), exclude = c("col_factor"))
  conflictRules("recipes", mask.ok = c("fixed"))
  conflictRules("yardstick", mask.ok = c("spec"))
  # these next conflicts are between foreach and purrr. when() is deprecated and we
  # don't use accumulate() in our workflow currently
  # foreach is loaded by doParallel so is often used with tidymodels
  # Not sure if this is needed if we don't fully load doParallel
  # conflictRules("foreach", mask.ok = c("when", "accumulate"))
}


# pull out a coefficient from a fitted parametric model
get_estimate <- function(the_fit, the_term){
  the_fit %>%
    broom::tidy() %>%
    dplyr::filter(term == the_term) %>%
    dplyr::pull(estimate)
}


# prep a recipe and bake some data
# By default, this function will glimpse your new features
# Set glimpse_it to FALSE to suppress this
make_features <- function(rec, data_trn, data_new = NULL, glimpse_it = TRUE){
  features <- rec %>%
    recipes::prep(training = data_trn, strings_as_factors = FALSE) %>%
    recipes::bake(new_data = data_new)

  if (glimpse_it){
    features %>% dplyr::glimpse()
  }

  return(features)
}


# makes a plot of observed (truth) vs. estimate (predicted) values
plot_truth <- function(truth, estimate) {
  ggplot2::ggplot(mapping = ggplot2::aes(x = truth, y = estimate)) +
    ggplot2::geom_abline(lty = 2, color = "red") +
    ggplot2::geom_point(alpha = 0.5) +
    ggplot2::labs(y = "predicted outcome", x = "outcome") +
    tune::coord_obs_pred()  # scale axes uniformly
}


# makes a hyperparameter plot for a model with up to 2 hyperparameters
plot_hyperparameters <- function(tune_fit, hp1, hp2 = NULL, metric = NULL,
                                 log_hp1 = FALSE) {
  data <- tune::collect_metrics(tune_fit)

  metric_scores <- data %>%
    dplyr::filter(.metric == metric) %>%
    dplyr::pull(mean)

  x1 <- data[[hp1]]
  if (log_hp1) x1 <- log(x1)

  if (is.null(hp2)) {
    ggplot2::ggplot(mapping = ggplot2::aes(x = x1, y = metric_scores)) +
      ggplot2::geom_line() +
      ggplot2::xlab(hp1) +
      ggplot2::ylab(metric)
  } else {
    x2 <- factor(data[[hp2]], ordered = TRUE)
    ggplot2::ggplot(mapping = ggplot2::aes(x = x1, y = metric_scores,
                                           group = x2, color = x2)) +
      ggplot2::geom_line() +
      ggplot2::xlab(hp1) +
      ggplot2::ylab(metric) +
      ggplot2::scale_color_discrete(name = hp2)
  }
}


# modified code from Kuhn described here: https://github.com/topepo/caret/issues/116
get_lambdas <- function(x, y, len = 50, model = "LASSO") {
  numLev <- if (is.character(y) || is.factor(y)) length(levels(y)) else NA

  if (!is.na(numLev)) {
    fam <- ifelse(numLev > 2, "multinomial", "binomial")
  } else fam <- "gaussian"

  if (toupper(model) == "LASSO") {
    alpha <- 1
  } else alpha <- 0

  init <- glmnet::glmnet(as.matrix(x), y,
                         family = fam,
                         nlambda = len + 2,
                         alpha = alpha)

  lambda <- unique(init$lambda)
  lambda <- lambda[-c(1, length(lambda))]  # drop the endpoints of the path
  lambda[1:min(length(lambda), len)]
}


# Nadeau and Bengio (2003) correlated t-test
nb_correlated_t_test <- function(cv_fits_full, cv_fits_compact, k = 10){
  cv_metrics_full <- tune::collect_metrics(cv_fits_full, summarize = FALSE)$.estimate
  cv_metrics_compact <- tune::collect_metrics(cv_fits_compact, summarize = FALSE)$.estimate
  diffs <- cv_metrics_full - cv_metrics_compact

  n <- length(diffs)
  mean_diff <- mean(diffs)
  var_diffs <- var(diffs)

  # correct the variance for the overlap among CV training sets
  proportion_test <- 1 / k
  proportion_train <- 1 - proportion_test
  correction <- (1 / n) + (proportion_test / proportion_train)
  se <- sqrt(correction * var_diffs)

  t <- abs(mean_diff / se)
  p_value <- 2 * pt(t, n - 1, lower.tail = FALSE)

  tibble::tibble(mean_diff = mean_diff,
                 se = se,
                 t = t,
                 df = n - 1,
                 p_value = p_value)
}
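
# A hedged usage sketch (not from the original script): compare full vs. compact
# model performance across the same 10-fold CV splits. `wf_full`, `wf_compact`,
# `data_trn`, and `splits` are hypothetical; k must match the number of CV folds
# so the Nadeau-Bengio correction uses the right test/train proportion.
# splits <- rsample::vfold_cv(data_trn, v = 10)
# fits_full <- tune::fit_resamples(wf_full, resamples = splits,
#                                  metrics = yardstick::metric_set(yardstick::rmse))
# fits_compact <- tune::fit_resamples(wf_compact, resamples = splits,
#                                     metrics = yardstick::metric_set(yardstick::rmse))
# nb_correlated_t_test(fits_full, fits_compact, k = 10)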
# rope_min and rope_max are the lower and upper bounds of the region of
# practical equivalence (ROPE)
# if plot_min and plot_max are not NULL, the posterior pdf is also returned
# over this range of metric differences for plotting
bayesian_correlated_t_test <- function(cv_fits_full, cv_fits_compact,
                                       rope_min, rope_max, k = 10,
                                       plot_min = NULL, plot_max = NULL,
                                       plot_n = 1000){
  if (rope_max < rope_min) {
    stop("rope_max should be larger than rope_min")
  }

  cv_metrics_full <- tune::collect_metrics(cv_fits_full, summarize = FALSE)$.estimate
  cv_metrics_compact <- tune::collect_metrics(cv_fits_compact, summarize = FALSE)$.estimate
  diffs <- cv_metrics_full - cv_metrics_compact

  delta <- mean(diffs)
  n <- length(diffs)
  df <- n - 1
  rho <- 1 / k  # correlation induced by overlapping CV training sets
  sp <- sd(diffs) * sqrt(1 / n + rho / (1 - rho))

  # posterior probabilities that the difference falls below, inside,
  # and above the ROPE
  p.left <- pt((rope_min - delta) / sp, df)
  p.rope <- pt((rope_max - delta) / sp, df) - p.left
  results <- list(left = p.left, rope = p.rope, right = 1 - p.left - p.rope)

  if (!is.null(plot_min) && !is.null(plot_max)) {
    plot_diffs <- seq(plot_min, plot_max, length.out = plot_n)
    ts <- (plot_diffs - delta) / sp
    pdf <- dt(ts, df)
    results$plot_diffs <- plot_diffs
    results$pdf <- pdf
  }

  return(results)
}


# Calculate variable importance for one variable or set of variables
# model is a parsnip model
# x are features
# y is the outcome (factor for classification)
# var_string is used to match column names using contains(var_string)
# fun_metric is any metric function with the same form as the _vec functions
#   in yardstick, e.g., roc_auc_vec, rmse_vec, accuracy_vec
# fun_pred is a prediction function that provides the appropriate estimate to
#   use with fun_metric. See the example below for roc_auc_vec
# n_reps is the number of permutations
# compare sets the order of the contrast: diff = metric - metrics_perm;
#   diff_rev = metrics_perm - metric
#
# sample prediction function to use with yardstick::roc_auc_vec
# predictor <- function(model, x){
#   predict(model, x, type = "prob") %>%
#     pull(.pred_yes)
# }
get_vip <- function(model, x, y, var_string, fun_metric, fun_pred,
                    n_reps = 20, compare = "diff"){
  if (!compare %in% c("diff", "diff_rev")) {
    stop('Valid values for compare are "diff", "diff_rev"')
  }

  metric <- fun_metric(truth = y, estimate = fun_pred(model, x))

  # requires the foreach package attached in the calling script (for %do%)
  metrics_perm <- foreach::foreach(rep = 1:n_reps, .combine = "c") %do% {
    x %>%
      dplyr::mutate(dplyr::across(dplyr::contains(var_string), sample)) %>%
      fun_pred(model, .) %>%
      fun_metric(truth = y, estimate = .)
  }

  if (compare == "diff") {
    metrics_final <- metric - metrics_perm
  } else {
    metrics_final <- metrics_perm - metric
  }

  vars <- x %>%
    dplyr::select(dplyr::contains(var_string)) %>%
    names()

  vip <- tibble::tibble(var_string,
                        vars = list(vars),
                        metric_mean = mean(metrics_final),
                        metric_median = median(metrics_final),
                        metric_05 = quantile(metrics_final, .05),
                        metric_95 = quantile(metrics_final, .95))

  return(vip)
}
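
# A hedged usage sketch (not from the original script): permutation importance
# for all features whose names contain "income". `fit_best`, `feat_x`, and
# `feat_y` are hypothetical; predictor() is the sample prediction function
# sketched above get_vip(). Attach foreach first so %do% is available.
# library(foreach)
# predictor <- function(model, x){
#   predict(model, x, type = "prob") %>%
#     dplyr::pull(.pred_yes)
# }
# vip_income <- get_vip(model = fit_best, x = feat_x, y = feat_y,
#                       var_string = "income",
#                       fun_metric = yardstick::roc_auc_vec,
#                       fun_pred = predictor,
#                       n_reps = 20, compare = "diff")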