---
title: "Examples Using the rms Package"
author: "D.E. Beaudette"
date: "`r Sys.Date()`"
output:
  html_document:
    mathjax: null
    jquery: null
    smart: no
---

```{r setup, echo=FALSE, results='hide', warning=FALSE}
# setup
library(knitr, quietly=TRUE)
library(kableExtra, quietly=TRUE)
opts_chunk$set(message=FALSE, warning=FALSE, background='#F7F7F7', fig.align='center', fig.retina=2, dev='png', tidy=FALSE, verbose=FALSE)
options(width=100, stringsAsFactors=FALSE)
```

# CA790 MAST Model

**yosemite-mast-model.png**: Predicted mean annual soil temperature (MAST) at 50 cm depth, Yosemite Valley, CA; warmer colors represent higher values, cooler colors lower values. Predictions are based on 39 long-term soil temperature sensor installations. This model explains 81% of the variance in observed MAST.

**yosemite-mast-model-effects.png**: Effect sizes (model coefficients) associated with the Yosemite MAST model can be interpreted as:

1. an approximate decrease in MAST of 5 deg. C when moving from 1500 m to 2400 m elevation
2. an approximate increase in MAST of 0.5 deg. C when moving from an "average" cool slope to a warm slope
3. an approximate decrease in MAST of 3 deg. C when moving into cold-air drainage zones
4. an approximate decrease in MAST when adding 5 cm of O horizon material to a site that would otherwise have no O horizon

**TODO**

* compare sampled vs. exhaustive GIS data
* check for redundancy
* more documentation of the model
* compare multiple models

## Load and Prep Data for Model Fitting

```{r load-prep-data, results='hide'}
library(car)
library(rms)
library(Hmisc)
library(RColorBrewer)
library(latticeExtra)
library(stringr)
library(sp)
library(corrplot)
library(visreg)

# default colors
cols <- brewer.pal(n=8, name='Set1')

## re-make cached data
# source('prepare-data.R')

# load cached data
load('cached-data.rda')

# down-grade to data.frame
m <- as.data.frame(s)

# add factor for O horizon presence
m$o.hz <- factor((m$o.hz.thick > 1))

# important!! reset row names
row.names(m) <- 1:nrow(m)
```

## Initial Data Exploration

```{r data-exploration, fig.height=8, fig.width=8}
vars <- c('MAST', 'Winter', 'Summer', 'elev', 'solar', 'tci', 'o.hz.thick')

# univariate summaries
summary(m[, vars])

# scatter plot matrix
scatterplotMatrix(
  m[, vars], col='black',
  regLine = list(col='orange'),
  smooth = list(col.spread='royalblue', col.smooth='royalblue', lwd.smooth=2, lty.smooth=1, lty.spread=3, lwd.spread=1)
)

# Spearman rank correlation matrix
cor.mat <- round(cor(m[, vars], use='complete', method = 'spearman'), 2)
corrplot.mixed(cor.mat)
```

```{r fig.height=5, fig.width=6}
# variable clustering, based on Spearman rank correlation
vc <- varclus(as.matrix(m[, vars]), similarity = 'spearman', method='complete')
par(mar=c(1,4.5,1,1), las=1)
plot(vc)
```

```{r, fig.width=6, fig.height=3}
# O horizon presence vs. other vars
bwplot(o.hz ~ elev, data=m, par.settings=tactile::tactile.theme())
bwplot(o.hz ~ solar, data=m, par.settings=tactile::tactile.theme())
bwplot(o.hz ~ tci, data=m, par.settings=tactile::tactile.theme())
```

## Investigate Influential Observations

```{r find-outliers, fig.height=8, fig.width=8}
# build preliminary model, plot diagnostics
l <- lm(MAST ~ elev + solar + tci + o.hz, data=m)
summary(l)
leveragePlots(l, las=1)

# standardized dfbeta
dfbetasPlots(l, id.n=2, las=1)

# ID outliers based on common metrics, and remove
# note: row names are not the same as row indexes in the presence of missing data!
out.row.names <- as.integer(attr(outlierTest(l, cutoff=0.05)$rstudent, 'names'))
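
# (illustrative aside, not part of the original workflow)
# Cook's distance is another common influence metric; the 4/n cutoff used
# below is only a rule-of-thumb threshold
plot(cooks.distance(l), type='h', las=1, ylab="Cook's distance")
abline(h=4 / nrow(m), lty=2, col='red')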
out.idx <- which(row.names(m) %in% out.row.names)

# manually mark for removal from model
# out.idx <- c(18, 22)

# check
m[out.idx, ]

# save a copy and leave these observations out of the original data
m.outliers <- m[out.idx, ]
m.sub <- m[-out.idx, ]
```

## Fit Model

```{r fit-model, fig.height=4, fig.width=14}
# setup labels for rms plotting functions
label(m.sub$elev) <- 'Elevation (m)'
label(m.sub$solar) <- 'Ann. Beam Radiance (MJ/sq.m)'
label(m.sub$tci) <- 'Saga Wetness Index'
label(m.sub$o.hz.thick) <- 'O horizon thickness (cm)'
label(m.sub$o.hz) <- 'O horizon thickness > 1 cm'
label(m.sub$MAST) <- 'Mean Annual Soil Temperature (deg. C)'

# model with rms functions
dd <- datadist(m.sub)
options(datadist="dd")

# fit model, MAST is in deg C
# observations are weighted by the number of complete years of record
(l <- ols(MAST ~ rcs(elev, 3) + solar + o.hz + tci, data=m.sub, x=TRUE, y=TRUE, weights = m.sub$complete.yrs / max(m.sub$complete.yrs)))

# visual check: partial effects make sense
plot(Predict(l), as.table=TRUE, layout=c(4,1))

# better visualization of partial effects + residuals
par(mfcol=c(1, 4))
visreg(l)

## see
# Hmisc:::Function(l.ols)
```

## Examination of Residuals and Influential Observations

From the `residuals.ols` manual page:

* "ordinary" refers to the usual residual
* "score" is the matrix of score residuals (contributions to the first derivative of the log likelihood)
* "dfbeta" and "dfbetas" mean respectively the raw and normalized matrix of changes in regression coefficients after deleting in turn each observation; the coefficients are normalized by their standard errors
* "hat" contains the leverages, diagonals of the "hat" matrix
* "dffit" and "dffits" contain respectively the difference and normalized difference in predicted values when each observation is omitted; the S `lm.influence` function is used
* "hscore": the ordinary residuals are divided by one minus the corresponding hat matrix diagonal element, to make residuals have equal variance

```{r}
# assumption of homogeneity: constant variance
r <- residuals(l, type = 'ordinary')
plot(r ~ fitted(l), las=1)
abline(h=0, lty=2, col='red')

# cutoff: change of x-fraction standard errors after removal of an observation
show.influence(which.influence(l, cutoff = 0.5), m.sub)

# how can I interpret these?
dffits <- residuals(l, type = 'dffit')
hist(dffits)

# dfbetas: then what?
dfb <- residuals(l, type = 'dfbetas')
head(dfb)

# from section 7.4 Checking Distributional Assumptions (Harrell, 2001)
# what's up with the axis labels?
r <- resid(l)
xYplot(r ~ fitted(l), method='quantile')

# QQ plots, stratified by O horizon presence
qqmath(~ r | m.sub$o.hz)
```

## Visual Evaluation of Model Fit

```{r vis-eval-model, fig.height=8, fig.width=8}
# view slices of the elevation partial effect at fixed levels of other predictors
plot(Predict(l, elev=NA, o.hz=c('FALSE', 'TRUE')))
plot(Predict(l, elev=NA, tci=c(4, 6, 15)))
plot(Predict(l, elev=NA, solar=c(63000, 74450)))
```

```{r vis-eval-model-2, fig.height=5, fig.width=10}
# view effects over specified ranges (rather than the default IQR)
plot(
  summary(l, vnames='labels', elev=c(1500, 2400), solar=c(63000, 74450), tci=c(9, 15), o.hz=c('FALSE', 'TRUE')),
  cex=1, main='Effect sizes, over specified ranges, adjusted to median values.'
)
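
# (illustrative aside, not part of the original workflow)
# contrast() reports the same style of effect estimate as summary() above,
# e.g. the predicted change in MAST when moving from 1500 m to 2400 m elevation,
# with the remaining predictors held at their datadist adjust-to values
contrast(l, list(elev=2400), list(elev=1500))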

# cross-validation: see the sketch at the end of this document

# diagnostics
anova(l)

# since we are using both the rms and car packages, be explicit about which vif()
rms::vif(l)

# evaluate mean absolute error (original units of the data)
(mast.MAE <- round(mean(abs(m.sub$MAST - predict(l, m.sub)), na.rm=TRUE), 2))

# save fitted values / residuals to point data
m.sub$resid <- resid(l)
m.sub$fitted <- fitted(l)
m.sub$r_f <- with(m.sub, abs(resid) / fitted)

## prepare point data spatial output with model diagnostics
# s <- h
# s@data <- s@data[, 'site_name', drop=FALSE]
# s@data <- join(s@data, m, by='site_name', type='left')
```

```{r eval-model, fig.height=6, fig.width=6}
# measured vs. predicted MAST, with outliers highlighted
plot(predict(l, m.sub) ~ m.sub$MAST, ylab='Predicted MAST (deg C)', xlab='Measured MAST (deg C)', xlim=c(0, 20), ylim=c(0, 20), las=1, cex=sqrt(m.sub$complete.yrs / 2), sub='Symbol Size Proportional to Weight')
points(predict(l, m.outliers) ~ m.outliers$MAST, col='red', pch=16)
text(m.outliers$MAST, predict(l, m.outliers), labels=m.outliers$name, pos=3, cex=0.75)
abline(0, 1, col='blue')
abline(v=c(8, 15), h=c(8, 15), lty=2, col='grey')
legend('topleft', legend=c('1:1 Line', 'Outliers'), lty=c(1, NA), pch=c(NA, 16), col=c('blue', 'red'), bty='n')
legend('bottomright', legend=paste('mean abs. error:', mast.MAE, 'deg. C'), cex=0.75, bty='n', inset=0.01)
```

## Estimate MAST at Each Grid Cell

```{r apply-model, results='hide', eval=FALSE}
# raster prediction and shapefile export require these packages
library(raster)
library(rgdal)

# model SE requires a helper function
predfun <- function(model, data) {
  cbind(se=predict(model, data, se.fit=TRUE)$se.fit)
}

# since we don't have a raster depicting O horizon thickness, we have to simulate it:
# the simplest approach is to make several predictions at fixed O horizon thickness

# iterate over a possible set of O horizon thickness values, taken from the summary stats
o.hz.thick <- c(0, 2, 4)

for(i in o.hz.thick) {
  # make a fake O horizon thickness raster
  r.o.hz <- raster(r.stack)
  r.o.hz <- setValues(r.o.hz, values = i)
  names(r.o.hz) <- 'o.hz.thick'

  # make new stack
  r.stack.pred <- stack(r.stack, r.o.hz)

  # make file names
  f.pred <- paste0('spatial_data/mast-model-', i)
  f.pred.se <- paste0('spatial_data/mast-model-se-', i)

  # MAST model: estimates and SE
  predict(r.stack.pred, l, progress='text', filename=f.pred, datatype='FLT4S', format='GTiff', overwrite=TRUE)
  predict(r.stack.pred, l, fun=predfun, progress='text', filename=f.pred.se, datatype='FLT4S', format='GTiff', overwrite=TRUE)
}

# save model diagnostics as SHP
# can't convert labeled columns, have to do it manually
s$elev <- as.numeric(s$elev)
s$solar <- as.numeric(s$solar)
s$tci <- as.numeric(s$tci)
s$o.hz.thick <- as.numeric(s$o.hz.thick)
s$MAST <- as.numeric(s$MAST)
s$resid <- as.numeric(s$resid)
s$fitted <- as.numeric(s$fitted)
s$r_f <- as.numeric(s$r_f)

writeOGR(s, dsn='spatial_data', layer='point-data', driver='ESRI Shapefile', overwrite_layer=TRUE)
```
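
## Cross-Validation (Sketch)

The `# cross-validation` placeholder above has not been filled in yet. The following is a minimal sketch, not part of the original workflow, assuming the fitted `ols` object `l` (created with `x=TRUE, y=TRUE`) is still available; the number of bootstrap replicates is arbitrary.

```{r cross-validation-sketch, eval=FALSE}
# optimism-corrected (bootstrap) estimates of R^2, MSE, intercept and slope
set.seed(10101)
validate(l, method='boot', B=100)

# optional: bias-corrected calibration curve for the fitted ols model
plot(calibrate(l, method='boot', B=100))
```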