---
title: "Lab 07"
author: "PPOL 670"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
# Global chunk defaults: show the code and run it unless a chunk overrides this.
knitr::opts_chunk$set(echo = TRUE, eval = TRUE)
#install.packages("tinytex")
#tinytex::install_tinytex()
```

## R Markdown Details

```{r, echo = TRUE, eval = TRUE, message = FALSE, warning = FALSE}
x <- 15
y <- 20

x + y
```

Additionally, there are a few other things we can do in the "text" portion of Markdown. For example, we can add footnotes^[Such as the footnote here!] and also add hyperlinks. We can add hyperlinks to external websites like [this](https://www.google.com), by including any website url like http://ppol670.alexanderpodkul.com/syllabus.html, and by referencing different parts of this document such as [Splitting Data]!

- We can also
- Create bullet points
- Like this

## Loading Data

```{r, echo = TRUE, eval = TRUE, message = FALSE, warning = FALSE}
library(dplyr)

# Country-level data; later chunks use the Continent, life_expectancy and
# GDP_per_capita columns.
url <- 'https://raw.githubusercontent.com/apodkul/ppol670_01/main/Data/life_expect.csv'
ctry_data <- read.csv(url)

ctry_data %>%
  glimpse()
```

## Creating a Table

```{r}
# Plain tibble output: one row per continent.
ctry_data %>%
  group_by(Continent) %>%
  summarize(`Median Life Expectancy` = median(life_expectancy))
```

```{r}
#install.packages('kableExtra')
library(kableExtra)
# HTML documentation: https://haozhu233.github.io/kableExtra/awesome_table_in_html.html
# PDF documentation: https://haozhu233.github.io/kableExtra/awesome_table_in_pdf.pdf

# Same summary, rendered as a formatted table.
ctry_data %>%
  group_by(Continent) %>%
  summarize(`Median Life Expectancy` = median(life_expectancy)) %>%
  kbl()
```

```{r}
# NOTE(review): `position` and `latex_options` look like LaTeX/PDF-only
# settings in kableExtra -- confirm they are wanted for html_document output.
ctry_data %>%
  group_by(Continent) %>%
  summarize(`Median Life Expectancy` = median(life_expectancy)) %>%
  kbl(caption = 'Median Life Expectancy by Continent',
      position = 'H') %>%
  kable_styling(latex_options = c('striped'))
```

## Working with Factors

```{r}
ctry_data$Continent2 <- factor(ctry_data$Continent)

library(forcats)

# Levels reordered from most to least frequent.
fct_infreq(ctry_data$Continent2) %>%
  levels()

# Collapse two levels into one, reorder by frequency, then inspect the levels.
# fct_infreq() has to run on the factor itself, i.e. before levels().
fct_collapse(ctry_data$Continent2,
             'Americas' = c('North America', 'South America')) %>%
  fct_infreq() %>%
  levels()

# Add a level that does not occur in the data.
fct_expand(ctry_data$Continent2, 'Antarctica') %>%
  levels()

#Useful for a few reasons but...
library(ggplot2)

ctry_data %>%
  ggplot(aes(x = Continent2)) +
  geom_bar()

ctry_data %>%
  mutate(Continent2 = fct_collapse(
    Continent2,
    'Americas' = c('North America', 'South America'))) %>%
  ggplot(aes(x = Continent2)) +
  geom_bar()
```

## Other Preprocessing Tasks

### Scaling

```{r, eval = FALSE}
# scale() returns a one-column matrix; as.numeric() stores a plain numeric
# vector in the data frame instead of a matrix column.
ctry_data <- ctry_data %>%
  mutate(life_expect_scaled = as.numeric(scale(life_expectancy)))

library(ggplot2)

ggplot(ctry_data, aes(x = life_expectancy)) +
  geom_histogram()

ggplot(ctry_data, aes(x = life_expect_scaled)) +
  geom_histogram()
```

### Setting Up Dichotomous Variables

```{r, eval = FALSE, echo = FALSE}
# 0/1 indicator built with ifelse()
ctry_data %>%
  mutate(Europe = ifelse(Continent == 'Europe', 1, 0))

# Same indicator converted to a labelled factor
ctry_data %>%
  mutate(Europe = ifelse(Continent == 'Europe', 1, 0)) %>%
  mutate(Europe = factor(Europe,
                         levels = c(0, 1),
                         labels = c('Not Europe', 'Europe')))
```

### Simple Imputation

```{r, eval = FALSE}
# Reload the data and blank out a few values so there is something to impute.
ctry_data <- read.csv(url)
ctry_data$GDP_per_capita[2:4] <- NA

#install.packages('caret')
library(caret)

model <- preProcess(x = ctry_data, method = 'medianImpute')
predict(model, ctry_data)
```

## Splitting Data

```{r}
library(caret)

# Fix the seed so the split is reproducible.
set.seed(1789)

# 80% of the row indices go to training; list = FALSE returns the indices
# in a form that can be used directly for row subsetting.
trainIndex <- createDataPartition(seq_len(nrow(ctry_data)),
                                  p = .8,
                                  list = FALSE,
                                  times = 1)

training_ctry <- ctry_data[trainIndex, ]
testing_ctry <- ctry_data[-trainIndex, ]

# alternatively...
# floor() makes the truncation of the fractional sample size explicit.
sample(x = seq_len(nrow(ctry_data)),
       size = floor(0.8 * nrow(ctry_data)),
       replace = FALSE)
```