---
title: "Lab 6"
output: html_notebook
---

### Discrete predictive simulation

```{r}
# We know that the probability that a baby is
# born a girl is 48.8%
# We can simulate 400 birth events using a binomial
# distribution:
n.girls = rbinom(1, 400, .488)
n.girls

# We could repeat this process 1000 times to
# get a sense of what this distribution would
# look like:
hist(rbinom(1000, 400, .488))

# Things become more complicated when we take into
# account that there are different types of births:
# - There is a 1/125 chance that a birth event
#   results in fraternal twins, of which each has
#   an approximate 49.5% chance of being a girl
# - There is a 1/300 chance of identical twins,
#   which have an approximate 49.5% chance of
#   being girls
birth.type = sample(c("fraternal twin", "identical twin", "single birth"),
                    size = 400, replace = TRUE,
                    prob = c(1/125, 1/300, 1 - 1/125 - 1/300))

girls = rep(NA, 400) # empty vector for girls
for (i in 1:400){
  if (birth.type[i] == "single birth"){
    girls[i] = rbinom(1, 1, .488)
  } else if (birth.type[i] == "identical twin"){
    # Multiplied by 2, because if the twins are
    # identical and girls then we have to add 2 girls
    girls[i] = 2 * rbinom(1, 1, .495)
  } else if (birth.type[i] == "fraternal twin"){
    # Number of trials is 2 because we could get a boy
    # and a girl fraternal twin
    girls[i] = rbinom(1, 2, .495)
  }
}
n.girls = sum(girls)
n.girls

# Now, we can repeat this process 1000 times to get a
# sense of the uncertainty in this estimate
# (resampling the birth types inside the loop, so that
# their variability is part of the simulation):
n.sims = 1000
n.girls = rep(NA, n.sims)
for (s in 1:n.sims){
  birth.type = sample(c("fraternal twin", "identical twin", "single birth"),
                      size = 400, replace = TRUE,
                      prob = c(1/125, 1/300, 1 - 1/125 - 1/300))
  girls = rep(NA, 400)
  for (i in 1:400){
    if (birth.type[i] == "single birth"){
      girls[i] = rbinom(1, 1, .488)
    } else if (birth.type[i] == "identical twin"){
      girls[i] = 2 * rbinom(1, 1, .495)
    } else if (birth.type[i] == "fraternal twin"){
      girls[i] = rbinom(1, 2, .495)
    }
  }
  n.girls[s] <- sum(girls)
}
summary(n.girls)
sd(n.girls)
hist(n.girls)
```

### Continuous predictive simulation

```{r}
## Continuous Predictive Simulation
###################################

# We can do the same thing with continuous outcomes
sex = rbinom(10, 1, .52) # Selecting 10 adults at random;
                         # 52% are female
sex

# Heights of men are distributed ~N(69.1, 2.9) in inches
# Heights of women are distributed ~N(63.7, 2.7) in inches
height = ifelse(sex == 0,
                rnorm(10, 69.1, 2.9),
                rnorm(10, 63.7, 2.7))
avg.height = mean(height)
avg.height

# Now we can create a distribution of average heights
# by repeating this 1000 times
n.sims = 1000
avg.height = rep(NA, n.sims)
for (s in 1:n.sims){
  sex = rbinom(10, 1, .52)
  height = ifelse(sex == 0,
                  rnorm(10, 69.1, 2.9),
                  rnorm(10, 63.7, 2.7))
  avg.height[s] = mean(height)
}
hist(avg.height, main = "Average height of 10 adults")

# Could also simulate maximum heights:
max.height = rep(NA, n.sims)
for (s in 1:n.sims){
  sex = rbinom(10, 1, .52)
  height = ifelse(sex == 0,
                  rnorm(10, 69.1, 2.9),
                  rnorm(10, 63.7, 2.7))
  max.height[s] = max(height)
}
hist(max.height)
```
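Since each simulated distribution is just a vector of draws, we can summarize it numerically as well as graphically. As a minimal sketch (reusing the `max.height` draws from the chunk above), `quantile()` gives an approximate 95% simulation interval:

```{r}
# Approximate 95% simulation interval for the maximum
# height among 10 randomly sampled adults
quantile(max.height, probs = c(.025, .975))
```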
It's generally cleaner to use functions in R, so we can wrap this simulation in one:

```{r}
# Create a function to do the average-height simulation:
Height.sim = function(n.adults, m.m, sd.m, m.f, sd.f){
  sex = rbinom(n.adults, 1, .52)
  height = ifelse(sex == 0,
                  rnorm(n.adults, m.m, sd.m),
                  rnorm(n.adults, m.f, sd.f))
  mean(height)
}

avg.height = replicate(1000, Height.sim(10, 69.1, 2.9, 63.7, 2.7))
hist(avg.height)
```

## Multilevel linear models in R

```{r}
wisc = read.csv('../../data/wisc3raw.csv', header = T)
head(wisc)
```

Number of samples and columns

```{r}
dim(wisc)
```

Summary of our variables

```{r}
summary(wisc)
```

Data preparation

```{r}
## Data Preparation
###################

# For lmer, data need to be read in long format,
# so we will start by reshaping the data:
long = reshape(wisc,
               varying = list(c('verb1', 'verb2', 'verb4', 'verb6'),
                              c('perfo1', 'perfo2', 'perfo4', 'perfo6')),
               direction = 'long',
               timevar = 'time',
               idvar = 'id',
               v.names = c('verbal', 'perform'))
long = long[order(long$id), ]
head(long)
```

Because our time intervals are not equally spaced, I am also reassigning values for the time variable.

```{r}
long$time = rep(c(1, 2, 4, 6), nrow(long)/4)
long$time_c = long$time - 1
head(long)
```

Plot the data

```{r}
# For this example, we will consider the performance
# factor of the WISC
# Start by plotting the raw data:
interaction.plot(long$time,    # x-axis
                 long$id,      # grouping variable
                 long$perform, # y-axis
                 ylab = 'Performance',
                 xlab = 'Grade',
                 col = c(1:10),
                 legend = F)
```

Computing the intraclass correlation coefficient

```{r}
# We will be using lmer() from the lme4 package.
# The model is specified similarly to the lm()
# function: we start by defining the outcome, followed
# by the tilde (~). Next, we define predictors
# separated by "+". Random effects are provided in
# parentheses ( ): the random predictors go inside
# the parentheses, followed by the | symbol, then
# the grouping variable.

# Make sure lme4 is loaded into your R library
library(lme4)

# Begin by fitting an intercept-only model to the
# data:
nullmod = lmer(perform ~ 1 + (1 | id), data = long)
summary(nullmod)
```

Intraclass correlation coefficient (ICC):

$\rho_{IC} = \frac{d_{11}}{d_{11}+\sigma^2}$

The ICC ranges from 0 to 1. A high ICC would indicate "high similarity between values from the same group".

```{r}
# ICC
# Variance for individuals divided by total variance
# (id variance + residual variance)
29.65 / (29.65 + 230.92)  # About 11% of the variance in performance
                          # is due to differences between individuals
230.92 / (29.65 + 230.92) # About 89% of the variance is due to
                          # differences within individuals
```

```{r}
library(psych)
ICC(wisc[, c('perfo1', 'perfo2', 'perfo4', 'perfo6')])
```

Plot subjects 1-10

```{r}
# dplyr supplies %>% and filter(); ggplot2 does the plotting
library(dplyr)
library(ggplot2)

long %>%
  filter(id < 11) %>%
  ggplot(aes(y = perform, x = factor(id), colour = factor(id))) +
  geom_point() +
  theme(legend.position = "none") +
  xlab("Subject ID") +
  ylab("Performance")
```

```{r}
# Next, fit the unconditional linear growth model.

## WHAT ARE THE LEVEL 1 AND LEVEL 2 EQUATIONS FOR
## THE UNCONDITIONAL LINEAR GROWTH MODEL?

mod0 = lmer(perform ~ time_c + (time_c | id), data = long)
summary(mod0)
```

```{r}
# Adding a time-invariant, individual-level covariate

# If we were to leave mother's education in its raw
# form, our interpretation of the fixed effects would
# be for individuals with mother's education equal
# to 0.
# To make this interpretation more reasonable, we can
# instead grand-mean center mother's education:
long$momed_gm = long$momed - mean(long$momed)

# Now our fixed effects apply at the average mothers'
# education in the sample.
```
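As a quick sanity check on the centering step (a minimal sketch; nothing later depends on it), the grand-mean-centered covariate should average essentially zero while keeping the original spread:

```{r}
# The centered covariate should have mean (essentially) zero;
# centering shifts the location but not the spread
round(mean(long$momed_gm), 12)
sd(long$momed); sd(long$momed_gm)
```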
```{r}
# Things become more complicated for time-varying
# predictors!

# Let's fit the model, adding the covariate

## WHAT ARE THE LEVEL 1 AND LEVEL 2 EQUATIONS HERE?

# Random slope for time only
mod1 = lmer(perform ~ time + momed_gm + (time | id), data = long)
summary(mod1)
```

```{r}
# Random slopes for time and momed_gm
mod2 = lmer(perform ~ time + momed_gm + (time + momed_gm | id),
            data = long)
summary(mod2)
```

```{r}
# Compare AICs:
AIC(nullmod); AIC(mod0); AIC(mod1); AIC(mod2)
```

```{r}
# You could then consider adding additional predictors
# to your model to help further reduce the residual
# variance.

# Set up an empty plot, then overlay the individual
# trajectories implied by mod1. (The plot call and the
# abline calls need to run in the same chunk so they
# draw on the same graphics device.)
plot(perform ~ time, data = long, type = 'n',
     ylim = c(min(long$perform), max(long$perform)),
     xlim = c(min(long$time_c), max(long$time_c)),
     cex.main = .75,
     xlab = 'Grade',
     ylab = 'WISC Performance',
     main = 'Variability in Slopes and Intercepts - Individual')

fix = fixef(mod1)
rand = ranef(mod1)
parmsIndiv = cbind((rand$id[1] + fix[1]),
                   (rand$id[2] + fix[2]))

# Plot individual trajectories
for (i in 1:length(unique(long$id))){
  abline(a = parmsIndiv[i, 1], b = parmsIndiv[i, 2], col = 'lightblue')
}

# Adding the mean trajectory
abline(a = fix[1], b = fix[2], lwd = 2, col = 'red')
```
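Beyond comparing AICs, the nested random-effect structures of mod1 and mod2 can also be compared with a likelihood-ratio test. A minimal sketch using lme4's anova() method; refit = FALSE keeps the REML fits, which is reasonable here because the two models share the same fixed effects:

```{r}
# Likelihood-ratio comparison of mod1 (random slope for time)
# vs. mod2 (random slopes for time and momed_gm);
# refit = FALSE keeps the REML estimates because the fixed
# effects are identical in both models
anova(mod1, mod2, refit = FALSE)
```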