# Tree-Based Models Exercise

# setwd()
# getwd()
# .libPaths(c("C:/R/library", "C:/Program Files/R/R-3.5.1/library"))
# .libPaths()

# load required packages ----
library(ggplot2)      # graphing
library(sf)           # spatial data
library(mapview)      # interactive maps
library(corrplot)     # graphical display of correlation matrices
library(rpart)        # recursive partitioning
library(rpart.plot)   # plots for rpart models
library(randomForest) # random forest
library(ranger)       # faster random forest implementation
library(caret)        # classification and regression training
library(ipred)        # resampled error predictions
library(caTools)      # utility functions and model evaluation
library(raster)       # raster data analysis tools

# Part 1 - Explore data ----

# get data from West Virginia and plot
url <- 'https://raw.githubusercontent.com/ncss-tech/stats_for_soil_survey/master/data/logistic/wv_transect_editedforR.csv'
download.file(url, "soildata.csv")
soildata <- read.csv("soildata.csv", header = TRUE, sep = ",")

soildata <- st_as_sf(
  soildata,
  # set the coordinates
  coords = c("x", "y"),
  # set the coordinate reference system (WGS84)
  crs = 4326
)

# plot points
mapview(soildata)

# write a shapefile to your working directory for viewing in a GIS
write_sf(soildata, dsn = "soildata.shp")

# Create a map of the data with a GIS

## EDA for WV dataset ----

# view the data
View(soildata)

# re-import as a plain data frame and examine the internal data structure
soildata <- read.csv("soildata.csv", header = TRUE, sep = ",")
str(soildata)

soildata$spodint  <- as.factor(soildata$spodint)
soildata$spodint  <- ordered(soildata$spodint)
soildata$tipmound <- as.factor(soildata$tipmound)
soildata$tipmound <- ordered(soildata$tipmound)

# does solar radiation affect spodic intensity?
ggplot(soildata, aes(x = spodint, y = solar)) +
  geom_boxplot() +
  xlab("spodic intensity") +
  ylab("solar")

# how about aspect?
ggplot(soildata, aes(x = spodint, y = northwestn)) +
  geom_boxplot() +
  xlab("spodic intensity") +
  ylab("northwestness")

# distribution of O horizon thickness among soil orders
ggplot(soildata, aes(x = Otot)) +
  geom_density() +
  facet_wrap(~ order)

# combine the numeric columns into a new data frame
numeric <- soildata |> dplyr::select(where(is.numeric))
names(numeric)

# calculate the correlation matrix
cormatrix <- cor(numeric)

# plot the correlation matrix
corrplot(cormatrix, method = "circle")

## Questions ----
# 1. What does the correlation matrix tell you about the environmental covariate data?
# 2. Which predictors are highly correlated?
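# One way to answer question 2 programmatically (a sketch, not part of the
# original exercise): caret::findCorrelation() flags columns whose pairwise
# correlation exceeds a chosen cutoff; the 0.75 cutoff below is an assumption.
high_cor <- caret::findCorrelation(cormatrix, cutoff = 0.75, names = TRUE)
high_cor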
# Part 2 - Build a regression tree model, prune using CP, and compare model accuracy ----

# split the data into calibration (85%) and validation (15%) sets
idx <- createDataPartition(soildata$depth_cm, p = 0.85, list = FALSE)
val <- soildata[-idx, ]
cal <- soildata[idx, ]
nrow(cal)
nrow(val)

# build a regression tree model for soil depth
tree_reg_model <- rpart(
  depth_cm ~ rainfall + dem10m + downslpgra + eastness + greenrefl +
    landsatb1 + landsatb2 + landsatb3 + landsatb7 + maxc100 + maxent +
    minc100 + mirref + ndvi + northeastn + northness + northwestn +
    planc100 + proc100 + protection + relpos11 + slp50 + solar + tanc75,
  data = cal,
  method = "anova"
)

# examine model outputs
tree_reg_model

# plot the regression tree model
par(mar = c(3, 6, 3, 6))
plot(tree_reg_model)
text(tree_reg_model, cex = 0.85)
prp(tree_reg_model, type = 1, extra = 1, branch = 1)

# inspect the CP values
print(tree_reg_model$cptable)
plotcp(tree_reg_model)

# prune the model at the CP value with the lowest cross-validated error
best_cp <- tree_reg_model$cptable[which.min(tree_reg_model$cptable[, "xerror"]), "CP"]
tree_prune <- prune(tree_reg_model, cp = best_cp)
printcp(tree_prune)
prp(tree_prune, type = 1, extra = 1, branch = 1)

# predict soil depth for the validation dataset using the full regression tree model
tree_prediction <- predict(object = tree_reg_model, newdata = val)

# compute the RMSE
RMSE(pred = tree_prediction, obs = val$depth_cm)

# predict soil depth for the validation dataset using the pruned regression tree model
tree_prediction <- predict(object = tree_prune, newdata = val)

# compute the RMSE
RMSE(pred = tree_prediction, obs = val$depth_cm)

# Part 3 - Build a classification tree model and evaluate model accuracy ----

# import the training data
soildata <- read.csv("soildata.csv", header = TRUE, sep = ",")

# wrangle the training data
soildata$spodint  <- as.factor(soildata$spodint)
soildata$spodint  <- ordered(soildata$spodint)
soildata$tipmound <- as.factor(soildata$tipmound)
soildata$tipmound <- ordered(soildata$tipmound)

# index for the lookup table
index <- c(0, 0.5, 1, 1.5, 2)

# assign the corresponding categories to the lookup values
values <- c("nonspodic", "nonspodic", "spodic", "spodic", "spodic")

# match spodint to index and assign values
soildata$spo_index <- values[match(soildata$spodint, index)]

# convert the new column from character to factor
soildata$spo_index <- as.factor(soildata$spo_index)

# split into calibration (85%) and validation (15%) sets
idx <- createDataPartition(soildata$spo_index, p = 0.85, list = FALSE)
val <- soildata[-idx, ]
cal <- soildata[idx, ]
nrow(cal)
nrow(val)

# build a classification tree model using the Gini split criterion
tree_class_model_gini <- rpart(
  spo_index ~ rainfall + geology + aachn + dem10m + downslpgra + eastness +
    greenrefl + landsatb1 + landsatb2 + landsatb3 + landsatb7 + maxc100 +
    maxent + minc100 + mirref + ndvi + northeastn + northness + northwestn +
    planc100 + proc100 + protection + relpos11 + slp50 + solar + tanc75,
  data = cal,
  method = "class",
  parms = list(split = "gini")
)

# inspect model outputs
tree_class_model_gini

plot(tree_class_model_gini)
text(tree_class_model_gini, cex = 0.8) # cex is text size

par(mar = c(3, 6, 3, 6))
plot(tree_class_model_gini)
text(tree_class_model_gini, cex = 0.6)

rpart.plot(tree_class_model_gini, extra = 3)
rpart.plot(tree_class_model_gini, extra = 103)

# predict spodic class for the validation dataset
tree_pred_gini <- predict(tree_class_model_gini, newdata = val, type = "class")

# evaluate model results using a confusion matrix
results <- data.frame(val$spo_index, tree_pred_gini)
cm1 <- confusionMatrix(tree_pred_gini, val$spo_index, mode = "everything")
cm1
print(cm1$overall[1])
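# A quick sketch (not part of the original exercise) for seeing which covariates
# the Gini tree relies on: rpart stores scaled importance in $variable.importance.
imp_gini <- tree_class_model_gini$variable.importance
barplot(sort(imp_gini, decreasing = TRUE), las = 2, cex.names = 0.6,
        main = "rpart variable importance (Gini tree)")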
# re-import and wrangle the training data to rebuild the spodic class, then
# refit with the information split criterion for comparison
soildata <- read.csv("soildata.csv", header = TRUE, sep = ",")
soildata$spodint  <- as.factor(soildata$spodint)
soildata$spodint  <- ordered(soildata$spodint)
soildata$tipmound <- as.factor(soildata$tipmound)
soildata$tipmound <- ordered(soildata$tipmound)

# index for the lookup table
index <- c(0, 0.5, 1, 1.5, 2)

# assign the corresponding categories to the lookup values
values <- c("nonspodic", "nonspodic", "spodic", "spodic", "spodic")

# match spodint to index and assign values
soildata$spo_index <- values[match(soildata$spodint, index)]

# convert the new column from character to factor
soildata$spo_index <- as.factor(soildata$spo_index)

# split into calibration (85%) and validation (15%) sets
idx <- createDataPartition(soildata$spo_index, p = 0.85, list = FALSE)
val <- soildata[-idx, ]
cal <- soildata[idx, ]
nrow(cal)
nrow(val)

# build a classification tree model using the information split criterion
tree_class_model_info <- rpart(
  spo_index ~ rainfall + geology + aachn + dem10m + downslpgra + eastness +
    greenrefl + landsatb1 + landsatb2 + landsatb3 + landsatb7 + maxc100 +
    maxent + minc100 + mirref + ndvi + northeastn + northness + northwestn +
    planc100 + proc100 + protection + relpos11 + slp50 + solar + tanc75,
  data = cal,
  method = "class",
  parms = list(split = "information")
)

# inspect model outputs
tree_class_model_info

# predict spodic class for the validation dataset
tree_pred_info <- predict(tree_class_model_info, newdata = val, type = "class")

# evaluate model results using a confusion matrix
results <- data.frame(val$spo_index, tree_pred_info)
cm2 <- confusionMatrix(tree_pred_info, val$spo_index, mode = "everything")
cm2
cm2$overall[1]

## Question ----
# Which split criterion (Gini or information) had a higher overall accuracy? Kappa?

# Part 4 - Build a random forest model for classification with hyperparameter tuning via grid search ----

# download the csv from GitHub
url <- 'https://raw.githubusercontent.com/ncss-tech/stats_for_soil_survey/master/archive/chapters/8_Tree_models/IAD16_train.csv'
download.file(url, "LCdata.csv")

# import and wrangle the training data
LCtrain <- read.csv("LCdata.csv", header = TRUE)
names(LCtrain)
train <- LCtrain[, c(3:7, 10)]
str(train)
train$Observed <- as.factor(train$Observed)
table(train$Observed)

## build a random forest model from the training data
rf <- randomForest(
  Observed ~ .,
  data = train,
  ntree = 500,
  mtry = 2,
  importance = TRUE,
  proximity = TRUE,
  keep.forest = TRUE
)
rf

# grab the OOB error matrix and take a look
err <- rf$err.rate
head(err)

# examine model outputs
plot(rf)
legend(x = "right", legend = colnames(err), fill = 1:ncol(err))

# plot variable importance
varImpPlot(rf)

# look at the final OOB error rate (last row in the err matrix)
oob_err <- err[nrow(err), "OOB"]
print(oob_err)

## Questions ----
# 1. What is the OOB error?
# 2. Which predictor had the greatest variable importance?
# 3. What random forest parameters were used? ntree? mtry?
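# A minimal sketch (not in the original exercise) for pulling the answers out of
# the fitted model: rank predictors by mean decrease in accuracy and report the
# parameters actually used.
imp <- importance(rf, type = 1)        # type = 1: mean decrease in accuracy
rownames(imp)[which.max(imp)]          # predictor with the greatest importance
c(ntree = rf$ntree, mtry = rf$mtry)    # parameters used in the fit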
# tune with a grid search: establish possible values for mtry, nodesize, and ntree
mtry     <- seq(2, 4, 1)
nodesize <- seq(1, 8, 2)
ntree    <- seq(200, 800, 100)

# create a data frame containing all combinations
hyper_grid <- expand.grid(mtry = mtry, nodesize = nodesize, ntree = ntree)

# create an empty vector to store OOB error values
oob_err <- c()

# loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {

  # train a random forest model
  model <- randomForest(
    formula = Observed ~ .,
    data = train,
    mtry = hyper_grid$mtry[i],
    nodesize = hyper_grid$nodesize[i],
    ntree = hyper_grid$ntree[i]
  )

  # store the OOB error for the model
  oob_err[i] <- model$err.rate[nrow(model$err.rate), "OOB"]
}

# identify the optimal set of hyperparameters based on OOB error
opt_i <- which.min(oob_err)
print(hyper_grid[opt_i, ])

# use this information to build a second random forest model and compare OOB error
# complete the call below with the values from your grid search
rf.opt <- randomForest(
  Observed ~ .,
  data = train,
  ntree = 100,
  mtry = 1,
  nodesize = 1,
  importance = TRUE,
  proximity = TRUE,
  keep.forest = TRUE
)
rf
rf.opt

## Questions ----
# 1. How many models were tested by the grid search?
# 2. What are the optimal values for ntree, mtry, and nodesize?
# 3. What was the net effect on the out-of-bag error?

# Part 5 - fast random forests with ranger ----

# use system.time() for benchmarking to compare modeling speed

# benchmark the randomForest modeling process
system.time({
  rf.opt <- randomForest(
    Observed ~ .,
    data = train,
    ntree = 500,
    mtry = 4,
    nodesize = 5,
    importance = TRUE,
    proximity = TRUE,
    keep.forest = TRUE
  )
})

# benchmark the ranger implementation of the random forest modeling process
system.time({
  ranger.rf <- ranger(Observed ~ ., data = train)
})

# benchmark the randomForest modeling process with a grid search
system.time({
  for (i in 1:nrow(hyper_grid)) {

    # train a random forest model
    model <- randomForest(
      formula = Observed ~ .,
      data = train,
      mtry = hyper_grid$mtry[i],
      nodesize = hyper_grid$nodesize[i],
      ntree = hyper_grid$ntree[i]
    )

    # store the OOB error for the model
    oob_err[i] <- model$err.rate[nrow(model$err.rate), "OOB"]
  }
})

# benchmark the ranger implementation with a grid search

# establish possible values for mtry, min.node.size, and num.trees
mtry          <- seq(2, 4, 1)
min.node.size <- seq(1, 8, 2)
num.trees     <- seq(100, 500, 100)

# create a data frame containing all combinations
hyper_grid <- expand.grid(mtry = mtry, min.node.size = min.node.size, num.trees = num.trees)

# create an empty vector to store OOB error values
oob_err <- c()

system.time({
  for (i in 1:nrow(hyper_grid)) {

    # train a ranger random forest model
    model <- ranger(
      formula = Observed ~ .,
      data = train,
      mtry = hyper_grid$mtry[i],
      min.node.size = hyper_grid$min.node.size[i],
      num.trees = hyper_grid$num.trees[i]
    )

    # store the OOB error for the model (ranger reports it as prediction.error)
    oob_err[i] <- model$prediction.error
  }
})

## Questions ----
# 1. How much faster is the ranger implementation for modeling?
# 2. How much faster is the ranger implementation for tuning with a grid search?
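# A minimal sketch (an assumed workflow, not part of the original exercise) for
# answering the questions above: assign the system.time() results to objects and
# compare elapsed times directly.
t_rf     <- system.time({ randomForest(Observed ~ ., data = train, ntree = 500) })
t_ranger <- system.time({ ranger(Observed ~ ., data = train, num.trees = 500) })
unname(t_rf["elapsed"] / t_ranger["elapsed"])  # approximate speed-up factor for ranger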
# Part 6 - Predict landcover from imagery using the random forest model ----

# get the sample image from GitHub and unzip it
url <- 'https://raw.githubusercontent.com/ncss-tech/stats_for_soil_survey/master/archive/chapters/8_Tree_models/NAIP_5m.zip'
download.file(url, "NAIP_5m.zip")
unzip("NAIP_5m.zip")

# read and inspect the imagery
x1 <- brick("m_3807706_nw_5m.tif")
names(x1)

# build a function to calculate a normalized difference vegetation index (NDVI)
vi <- function(img, i, k) {
  bi <- img[[i]]
  bk <- img[[k]]
  vi <- (bk - bi) / (bk + bi)
  return(vi)
}

# run NDVI with the new function and plot it
ndvi1 <- vi(x1, 1, 4)
plot(ndvi1)

# add NDVI back to the original image to create a 5-band stack for predicting landcover
x <- stack(x1, ndvi1)
names(x) <- c('red', 'green', 'blue', 'nir', 'ndvi')

# predict landcover for the image using the tuned random forest model from Part 5
LC.class <- predict(x, rf.opt, type = "response", datatype = "INT1U", progress = "text")

# plot the output landcover map and write the output to disk
plot(LC.class)
writeRaster(LC.class, "LC_Class_map.tif", overwrite = TRUE, progress = "text")

# generate a probability surface for the predictions
LC.prob <- predict(x, rf.opt, type = "prob", progress = "text")

# plot the probability surface
plot(LC.prob)

# pull out the raster attribute table for use in ArcGIS
attrib <- as.data.frame(LC.class@data@attributes)
colnames(attrib) <- c("Value", "Class")

# view the RAT
attrib

# write the output table
write.csv(attrib, "LC.class_rat.csv")

## Question ----
# 1. Use the data to make a landcover map and attach a screenshot to the project worksheet.

# Part 7 - Feature selection using the Boruta R package ----
library(Boruta)

yvar  <- "Otot"
xvars <- c("rainfall", "geology", "aachn", "dem10m", "downslpgra", "eastness",
           "greenrefl", "landsatb1", "landsatb2", "landsatb3", "landsatb7",
           "maxc100", "maxent", "minc100", "mirref", "ndvi", "northeastn",
           "northness", "northwestn", "planc100", "proc100", "protection",
           "relpos11", "slp50", "solar", "tanc75")

# run the Boruta algorithm
fs_bor <- Boruta(y = soildata$Otot, x = soildata[, xvars], maxRuns = 35, doTrace = 1)

# plot variable importance and selected features
plot(fs_bor)

# plot the evolution of the feature selection
plotImpHistory(fs_bor)

# extract the selected feature variables
vars <- getSelectedAttributes(fs_bor)

# view a summary of the results
View(attStats(fs_bor))

# Part 8 - Cross-validation of random forests ----
library(caret)

tc <- trainControl(
  method = "cv",
  number = 10,
  returnResamp = "all",
  savePredictions = TRUE,
  search = "grid",
  verboseIter = FALSE
)

# an optional manual tuning grid (caret tunes automatically when tuneGrid is not supplied)
tg <- expand.grid(
  min.node.size = 5,
  splitrule = "variance",
  mtry = 5
)

# fit the model, automatically tune, and cross-validate
dep_tr <- train(
  y = soildata$Otot,
  x = soildata[, xvars],
  method = "ranger",
  importance = "permutation",
  trControl = tc
)

# fit the model, automatically tune, and cross-validate on the Boruta-selected variables
dep_tr_bor <- train(
  y = soildata$Otot,
  x = soildata[, vars],
  method = "ranger",
  importance = "permutation",
  trControl = tc
)

# view a summary of the results
dep_tr
dep_tr_bor

# variable importance plots
vip::vip(dep_tr)
vip::vip(dep_tr_bor)

# extract the final models
dep_tr$finalModel
dep_tr_bor$finalModel

# inspect the resampled (cross-validation) results for one tuning combination
dep_cv <- subset(dep_tr_bor$resample, mtry == 2 & splitrule == "extratrees")
summary(dep_cv)

tidyr::pivot_longer(dep_cv, cols = c("RMSE", "Rsquared", "MAE")) |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 5) +
  facet_wrap(~ name, scales = "free_x")
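# A brief sketch (an assumed addition, not in the original exercise) comparing the
# full-covariate and Boruta-reduced models with caret::resamples(). For a strictly
# paired comparison, use the same resampling indices (e.g. set a seed or pass
# index to trainControl) before fitting both models.
cv_compare <- resamples(list(all_vars = dep_tr, boruta_vars = dep_tr_bor))
summary(cv_compare)
bwplot(cv_compare, metric = "RMSE")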
# Part 9 - Plot the partial effects ----
library(pdp)

test <- ranger(
  Otot ~ .,
  data = soildata[c(yvar, vars)],
  min.node.size = 5,
  mtry = 2,
  splitrule = "extratrees"
)

# plot the effect of landsatb7
partial(test, pred.var = "landsatb7", plot = TRUE, rug = TRUE, smooth = TRUE,
        grid.resolution = 20)

partial(test, pred.var = "landsatb7", plot = TRUE, rug = TRUE, smooth = TRUE,
        grid.resolution = 20, plot.engine = "ggplot2") +
  geom_point(data = soildata, aes(x = landsatb7, y = Otot))

# plot the effect of maxent
partial(test, pred.var = "maxent", plot = TRUE, rug = TRUE, smooth = TRUE,
        grid.resolution = 20)

# plot the combined effect of landsatb7 and maxent
partial(test, pred.var = c("landsatb7", "maxent"), plot = TRUE, rug = TRUE)

# Part 10 - Plot model fit and residuals ----

# 1:1 line vs. regression line
soildata$pred <- predict(test, data = soildata)$predictions

ggplot(soildata, aes(x = pred, y = Otot)) +
  geom_point() +
  geom_abline() +
  geom_smooth(method = "lm")

# residuals
soildata$res <- soildata$Otot - soildata$pred

ggplot(soildata, aes(x = pred, y = res)) +
  geom_point() +
  geom_hline(aes(yintercept = 0)) +
  geom_smooth()

# scale-location style plot (square root of the absolute residuals, so negative
# residuals are not dropped)
ggplot(soildata, aes(x = pred, y = sqrt(abs(res)))) +
  geom_point() +
  geom_hline(aes(yintercept = 0)) +
  geom_smooth()

# normal Q-Q plot of the residuals
ggplot(soildata, aes(sample = res)) +
  geom_qq() +
  geom_qq_line()
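# A minimal sketch (an assumed addition): the predictions above are in-sample, so
# the fit looks optimistic. ranger also stores out-of-bag (OOB) predictions for
# the training rows, which give a more honest estimate of performance.
soildata$pred_oob <- test$predictions   # OOB predictions for the training rows
RMSE(pred = soildata$pred_oob, obs = soildata$Otot)
R2(pred = soildata$pred_oob, obs = soildata$Otot)
test$prediction.error                   # OOB mean squared error reported by ranger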