---
title: "Topic 2. Regression and Linear Models"
author: "EW (Jed) Frees"
date: "16 November 2024"
output: html_document
---
  
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

#  From Chapter 5 Multiple Linear Regression of the Online Tutorial

```{r eval = FALSE}

meps <- read.csv("https://assets.datacamp.com/production/repositories/2610/datasets/7b7dab6d0c528e4cd2f8d0e0fc7824a254429bf8/HealthMeps.csv", header = TRUE)

# Exercise 5.1.2
str(meps)
summary(meps)
table(meps$race)
par(mfrow = c(1, 2))
hist(meps$expendop, main = "", xlab = "outpatient expenditures")
hist(log(meps$expendop), main = "", xlab = "log expenditures")
par(mfrow = c(1, 1))
meps$logexpend <- log(meps$expendop)
boxplot(logexpend ~ phstat, data = meps, main = "boxplot of log expend")
plot(meps$age,meps$logexpend, xlab = "age", ylab = "log expend")
lines(lowess(meps$age, meps$logexpend), col="red")
```


```{r eval = FALSE}
# Exercise 5.1.3
# Split the sample into a `training` and `test` data
n <- nrow(meps)
set.seed(12347)
shuffled_meps <- meps[sample(n), ]
train_indices <- 1:round(0.75 * n)
train_meps    <- shuffled_meps[train_indices, ]
test_indices  <- (round(0.25 * n) + 1):n
test_meps     <- shuffled_meps[test_indices, ]

meps_mlr1 <- lm(expendop ~ gender + age + race + region + educ + phstat + mpoor + anylimit + income + insure + usc + unemploy + managedcare, data = train_meps)
summary(meps_mlr1)
par(mfrow = c(2, 2))
plot(x=meps_mlr1)

meps_mlr2 <- lm(logexpend ~ gender + age + race + region + educ + phstat + mpoor + anylimit + income + insure + usc + unemploy + managedcare, data = train_meps)
summary(meps_mlr2)
plot(meps_mlr2)

```