---
title: "Lab 07"
author: "PPOL 670"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
# Document-wide chunk defaults: show each chunk's source and evaluate it.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE
)

# Optional TinyTeX installation (left disabled):
# install.packages("tinytex")
# tinytex::install_tinytex()
```

## R Markdown Details 
```{r, echo = T, eval = T, message=F, warning=F}
# Store two numbers, then print their sum.
x <- 15
y <- 20
x + y
```

Additionally, there are a few other things we can do in the "text" portion of Markdown. For example, we can add footnotes^[Such as the footnote here!] and hyperlinks. We can link to external websites like [this](https://www.google.com), include a bare website URL such as http://ppol670.alexanderpodkul.com/syllabus.html, and reference other sections of this document, such as [Splitting Data]!

- We can also 
- Create bullet points
- Like this


## Loading Data 
```{r, echo = T, eval = T, message=F, warning=F}
library(dplyr)

# Read the life-expectancy CSV directly from GitHub.
# NOTE(review): the name `url` shadows base::url(); it is kept because the
# imputation chunk later in this document reads the same variable.
url <- "https://raw.githubusercontent.com/apodkul/ppol670_01/main/Data/life_expect.csv"
ctry_data <- read.csv(url)

# Compact overview of the columns and their types.
glimpse(ctry_data)

```

## Creating a Table 
```{r}
# Median life expectancy per continent (one output row per continent).
ctry_data %>%
  group_by(Continent) %>%
  summarise(
    `Median Life Expectancy` = median(life_expectancy)
  )
```

```{r}
# install.packages("kableExtra")
library(kableExtra)
# kableExtra guides:
#   HTML output: https://haozhu233.github.io/kableExtra/awesome_table_in_html.html
#   PDF output:  https://haozhu233.github.io/kableExtra/awesome_table_in_pdf.pdf

# Per-continent median life expectancy, rendered as a formatted table by kbl().
ctry_data %>%
  group_by(Continent) %>%
  summarise(
    `Median Life Expectancy` = median(life_expectancy)
  ) %>%
  kbl()
```

```{r}
# Captioned table of median life expectancy by continent.
# `position = "H"` is a LaTeX float placement, so it only matters for PDF output.
# Striping is requested for both back ends: `bootstrap_options` controls HTML
# output (this document knits to html_document), while `latex_options` controls
# PDF output. With only `latex_options` set, the HTML table is not striped.
ctry_data %>%
  group_by(Continent) %>%
  summarize(`Median Life Expectancy` = median(life_expectancy)) %>%
  kbl(caption = "Median Life Expectancy by Continent", position = "H") %>%
  kable_styling(
    bootstrap_options = "striped",
    latex_options = "striped"
  )
```

## Working with Factors
```{r}
library(forcats)

# Keep the original Continent column and work on a factor copy of it.
ctry_data$Continent2 <- factor(ctry_data$Continent)

# Levels reordered by how often each continent occurs (most frequent first).
fct_infreq(ctry_data$Continent2) %>%
  levels()

# Merge the two American continents into a single level, then list the levels
# by frequency. fct_infreq() must run on the factor itself: once levels() has
# reduced it to a character vector, every level occurs exactly once and there
# is no frequency information left to order by.
fct_collapse(ctry_data$Continent2,
             Americas = c("North America", "South America")) %>%
  fct_infreq() %>%
  levels()

# Add an extra level ("Antarctica") to the factor and list the result.
fct_expand(ctry_data$Continent2, "Antarctica") %>%
  levels()

# Factors are handy when plotting: geom_bar() counts the rows in each category.
library(ggplot2)
ggplot(ctry_data, aes(x = Continent2)) +
  geom_bar()

# Same bar chart after folding North and South America into "Americas".
ctry_data %>%
  mutate(
    Continent2 = fct_collapse(
      Continent2,
      Americas = c("North America", "South America")
    )
  ) %>%
  ggplot(aes(x = Continent2)) +
  geom_bar()

```

## Other Preprocessing Tasks 
### Scaling 
```{r, eval = F}
# Standardize life expectancy (center on the mean, divide by the sd).
# scale() returns a one-column matrix; as.numeric() flattens it so the new
# column is an ordinary numeric vector instead of a matrix column.
ctry_data <- ctry_data %>%
  mutate(life_expect_scaled = as.numeric(scale(life_expectancy)))

library(ggplot2)

# Compare the distribution of the raw and the standardized variable.
ggplot(ctry_data, aes(x = life_expectancy)) +
  geom_histogram()

ggplot(ctry_data, aes(x = life_expect_scaled)) +
  geom_histogram()

```


### Setting Up Dichotomous Variables 
```{r, eval = F, echo = F}
# Numeric indicator: 1 when Continent is "Europe", 0 otherwise
ctry_data %>%
  mutate(Europe = ifelse(Continent == "Europe", 1, 0))

# Same indicator stored as a labelled factor
ctry_data %>%
  mutate(
    Europe = factor(
      ifelse(Continent == "Europe", 1, 0),
      levels = c(0, 1),
      labels = c("Not Europe", "Europe")
    )
  )
```

### Simple Imputation
```{r, eval = F}
# Reload the data and blank out GDP per capita in rows 2-4 so that there is
# something to impute.
ctry_data <- read.csv(url)
ctry_data[["GDP_per_capita"]][2:4] <- NA

# install.packages("caret")
library(caret)

# Build a median-imputation pre-processor from the data, then apply it to the
# same data frame.
model <- preProcess(ctry_data, method = "medianImpute")
predict(model, ctry_data)
```

## Splitting Data 
```{r}
library(caret)

# Fix the RNG seed so the split is reproducible.
set.seed(1789)

# Draw 80% of the row indices for training. list = FALSE returns the indices
# as a matrix rather than a list, which can be used directly for row
# subsetting below.
# NOTE(review): the row index itself is passed as `y`; caret documents `y` as
# the vector of outcomes, so passing the outcome column instead would stratify
# the split on it -- confirm which is intended.
trainIndex <- createDataPartition(seq_len(nrow(ctry_data)),
                                  p = 0.8, list = FALSE,
                                  times = 1)

training_ctry <- ctry_data[trainIndex, ]
testing_ctry <- ctry_data[-trainIndex, ]

# Alternatively, draw the row indices with base R. The size is floored
# explicitly because 80% of the row count need not be a whole number.
sample(x = seq_len(nrow(ctry_data)),
       size = floor(nrow(ctry_data) * 0.8),
       replace = FALSE)

```