# Load packages
library(tidyverse)
library(readr)
library(broom)
library(flextable)
library(ggpubr)

# Load data
full_data <- read_csv("../Data/Cross-sec_full.csv", na = c(".", "", " "))

# Rename variables to remove spaces and commas from names
names(full_data) <- gsub(pattern=" |,", replacement="_", names(full_data))

# Visualize data first for relationships of interest
ggplot(data=full_data, aes(x=V12_mullen_composite_standard_score, y=V12_aosi_total_score_1_18))+
  geom_point()+
  labs(x="Mullen Composite Standard Score", 
       y="AOSI Total Score", 
       title="Scatterplot of Mullen Composite Standard Score \nand AOSI Total Score at Month 12")+
  theme_classic()

# Calc correlation
cor(x=full_data$V12_mullen_composite_standard_score, y=full_data$V12_aosi_total_score_1_18)

# Missing values cause error by default
cor(x=full_data$V12_mullen_composite_standard_score, y=full_data$V12_aosi_total_score_1_18, 
    use="pairwise.complete")

# Test corr/compute 95% CI
cor.test(x=full_data$V12_mullen_composite_standard_score, y=full_data$V12_aosi_total_score_1_18)

# Model AOSI TS at 12 months as a linear fn of Mullen composite at 12 months and age at 12 month visit
lm(V12_aosi_total_score_1_18~V12_mullen_composite_standard_score+V12_mullen_Candidate_Age, data=full_data)

full_data %>%
  filter(is.na(V12_aosi_total_score_1_18)==0&is.na(V12_mullen_composite_standard_score)==0)

# Save results to access later if needed; use sumary fn to get usual results table
lm_fit <- lm(V12_aosi_total_score_1_18~V12_mullen_composite_standard_score+
               V12_mullen_Candidate_Age, data=full_data)
summary(lm_fit)

# Results from R in ugly format.  Easy way to reformat: tidy() from broom package
lm_fit_results_table <- tidy(lm_fit)

# Then use flextable to create output in format for Word for a manuscript
lm_fit_results_table <- lm_fit_results_table %>%
  mutate(p.value=ifelse(p.value<0.005, "<0.005", 
                        as.character(round(p.value, 3))),
         term=fct_recode(factor(term),
                         "Intercept"="(Intercept)",
                         "12 Month MSEL Composite SS"="V12_mullen_composite_standard_score",
                         "12 Month Visit Age"="V12_mullen_Candidate_Age"),
         model="MSEL Composite")

flextable(data=lm_fit_results_table) %>%
  set_header_labels(term="Variable",
                    "estimate"="Estimate",
                    "std.error"="Std. Error",
                    "statistic"="T Statistic",
                    "p.value"="P-value") %>%
  autofit()

# Let's look at all the output from the model in detail
lm_fit
summary(lm_fit)

# Let's look at another model and compare the results in one table
lm_fit_2 <-
  lm(V12_aosi_total_score_1_18~V12_mullen_cognitive_t_score_sum+V12_mullen_Candidate_Age, data=full_data)

lm_fit_results_table_2 <- tidy(lm_fit_2) %>%
  mutate(p.value=ifelse(p.value<0.005, "<0.005", 
                        as.character(round(p.value, 3))),
         term=fct_recode(factor(term),
                         "Intercept"="(Intercept)",
                         "12 Month MSEL Cognitive T-Score"="V12_mullen_cognitive_t_score_sum",
                         "12 Month Visit Age"="V12_mullen_Candidate_Age"),
         model="MSEL Cognitive")

lm_table_all <- rbind(lm_fit_results_table %>% select(model, everything()), 
                      lm_fit_results_table_2 %>% select(model, everything()))

# Use the fn colformat_double() to round to 2 places
# Use merge_v to remove duplicate model labels
# Use valign to vertically align the cells
# What happens if you remove merge_v, valign, fix_border_issues?
# Why were the p values also rounded to 2 places

flextable(data=lm_table_all) %>%
  colformat_double(digits=2) %>%
  set_header_labels("model"="Model",
                    "term"="Variable",
                    "estimate"="Estimate",
                    "std.error"="Std. Error",
                    "statistic"="T Statistic",
                    "p.value"="P-value") %>%
  merge_v(j=1) %>%
  valign(valign = "top") %>%
  autofit() %>%
  fix_border_issues()

# Let's do some diagnostics
# 1. Linear Fit
ggplot(data=full_data, aes(x=V12_mullen_composite_standard_score, y=V12_aosi_total_score_1_18))+
  geom_point()+
  geom_smooth(method="loess", se=FALSE)+
  labs(x="Mullen Composite Standard Score", 
       y="AOSI Total Score", 
       title="Scatterplot of Mullen Composite Standard Score \nand AOSI Total Score at Month 12")

# 2. Homoskedasticity
fit_data <- data.frame(lm_fit$residuals, lm_fit$fitted.values)

ggplot(data=fit_data, aes(y=lm_fit.residuals, x=lm_fit.fitted.values))+
  geom_point()+
  labs(x="Fitted Value", 
       y="Residual", 
       title="Scatterplot of residual by fitted value for AOSI regression model.")

# 3. Normality of residuals
qqnorm(y=fit_data$lm_fit.residuals)
qqline(y=fit_data$lm_fit.residuals, datax = FALSE)

# Noticable deviation, but large sample size?
dim(full_data)
# Doesn't reflect data actually used to FIT model (include missing values)
# Let's extract that data directly instead and look size
View(model.frame(lm_fit))
dim(model.frame(lm_fit))