##############################################################
### Title: Bootcamp example code
### Author: Magdalena Bennett
### Date Created: 08/23/2023
### Last edit: [08/23/2023] - Created code
##############################################################

# Clear memory
rm(list = ls())

# Clear the console
cat("\014")

# Turn off scientific notation (turn back on with 0)
options(scipen = 999)

# Load packages
library(tidyverse) # includes dplyr and ggplot2!
# If there is a package you don't have installed, you can use
# install.packages("tidyverse")
# Only run once! (no need to install packages every time you run your code)

# Load data (this is loading data directly from Github)
sales = read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/bootcamp/data/US_Regional_Sales_Data.csv")

## Inspecting your data

# Exercise 1: Let's explore the data. How many variables and observations do
# we have? What type of variables do we have?
## We can see in the environment pane that we have 7991 obs and 16 variables.
## Most of them are character variables (chr), but some are numeric
## (int and num).

# Exercise 2: Install the package vtable, load it, and run the code
# sumtable(sales). What do you get? Use the ?vtable to see the options for
# this function.

# install.packages("vtable") # the package name must be quoted
library(vtable)

# Opens the help page for sumtable() (interactive use)
?sumtable

sumtable(sales)

## Data wrangling

# Exercise 1: Unit cost and unit price should be numeric. Let's change this!
# (hint: you can use the function gsub() to replace "," for "", and
# as.numeric() to transform a variable!).
## Keep the same names for the variables and the dataset.

# Step 1: strip the thousands separator (",") from both columns
sales = sales %>%
  mutate(unit_cost = gsub(",", "", unit_cost),
         unit_price = gsub(",", "", unit_price))

# Step 2: convert the cleaned strings to numeric
sales = sales %>%
  mutate(unit_cost = as.numeric(unit_cost),
         unit_price = as.numeric(unit_price))

# Exercise 2: What are the different values for the sales channel in this
# dataset? Use the function table() to see!
## Create a new dataset for in-store and online sales. Call it "sales_min".
## How many variables do we have?

# Frequency table of the sales channel values
sales %>%
  select(sales_channel) %>%
  table()

# Keep only in-store and online sales
sales_min = sales %>%
  filter(sales_channel == "In-Store" | sales_channel == "Online")

# Exercise 3: Use the original dataset "sales", and create a new variable
# called "minority",
## which takes the value of 1 if the sales channel is in-store or online,
## and 0 in another case.

sales = sales %>%
  mutate(minority = ifelse(sales_channel == "In-Store" |
                             sales_channel == "Online", 1, 0))

# Exercise 4: What is the average price for sales made through a minority
# channel vs a non-minority channel?

sales %>%
  group_by(minority) %>%
  summarize(unit_price = mean(unit_price))

## Plotting data!

# Exercise 1: Create a scatter plot between unit cost (x axis) and unit price
# (y axis)

ggplot(data = sales, aes(x = unit_cost, y = unit_price)) +
  geom_point()

# Exercise 2: Now, let's make that plot pretty. Use theme_minimal() to get rid
# of the grey background. Color the points with the color "deepskyblue3",
## and change the axis titles to something more informative
## (e.g. Unit price ($)). This can be done with xlab() and ylab().

ggplot(data = sales, aes(x = unit_cost, y = unit_price)) +
  geom_point(color = "deepskyblue3") +
  theme_minimal() +
  xlab("Unit Cost ($)") +
  ylab("Unit Price ($)")

# Exercise 3: Using the same code as before, now we want to color observations
# from the minority sales channel in one color, and the non-minority in
# another color.
## Write some code that does that (e.g. you will need to change your
## aesthetics!)

# factor() makes ggplot treat the 0/1 indicator as two discrete groups
ggplot(data = sales,
       aes(x = unit_cost, y = unit_price, color = factor(minority))) +
  geom_point() +
  theme_minimal() +
  xlab("Unit Cost ($)") +
  ylab("Unit Price ($)")

# Exercise 4: Finally, using the same code as in exercise 2, include a
# regression line in this plot using geom_smooth().
ggplot(data = sales, aes(x = unit_cost, y = unit_price)) +
  geom_point(color = "deepskyblue3") +
  theme_minimal() +
  xlab("Unit Cost ($)") +
  ylab("Unit Price ($)") +
  geom_smooth(method = "lm")

## Regressions

# Let's load a new dataset: The Gapminder
gapminder = read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/bootcamp/data/gapminder.csv")

# Exercise 1: What type of data do we have?
## You can see there is numeric data, but also factor (also referred to as
## categorical variables).
## Factors are useful because they enter a regression as individual dummies
## NOTE: with R >= 4.0, read.csv() loads text columns as character (chr)
## rather than factor by default -- check with str(gapminder).

# Exercise 2: Transform population into millions (divide pop by 10^6), and
# then regress life expectancy on gdp per capita and population. What do you
# obtain?

gapminder = gapminder %>%
  mutate(pop = pop/10^6)

lm1 = lm(lifeExp ~ gdpPercap + pop, data = gapminder)

summary(lm1)

# Exercise 3: Include now continent in the previous regression. Do your
# results change? How does it look when you include a factor variable in a
# regression?

lm2 = lm(lifeExp ~ gdpPercap + pop + continent, data = gapminder)

summary(lm2)

## Bringing everything together

# Exercise 1: Create a new variable called gdpPercap_log, which is the
# logarithm of the GDP per capita. Now plot life expectancy against the
# log(GDP per capita),
## and describe the relationship.

gapminder = gapminder %>%
  mutate(gdpPercap_log = log(gdpPercap))

ggplot(data = gapminder, aes(x = gdpPercap_log, y = lifeExp)) +
  geom_point() +
  theme_minimal()

# Exercise 2: Using the same plot as before, now color the points by continent
# and make the size proportional by population (in millions).

ggplot(data = gapminder,
       aes(x = gdpPercap_log, y = lifeExp, size = pop, color = continent)) +
  geom_point() +
  theme_minimal()

# Exercise 3: Do the same thing as before (exercise 2), but only for Europe!
# Restrict to Europe, then pipe the filtered data into ggplot via the "."
# placeholder
gapminder %>%
  filter(continent == "Europe") %>%
  ggplot(data = ., aes(x = gdpPercap_log, y = lifeExp, size = pop)) +
  geom_point() +
  theme_minimal()

# Exercise 4: Finally, run a regression that helps you estimate the
# association between life expectancy and GDP per capita, conditional on
# population,
## for the year 2007 and then, another regression for the year 1982.

# "data = ." sends the filtered data to lm()'s data argument instead of its
# first argument (the formula)
lm3 = gapminder %>%
  filter(year == 2007) %>%
  lm(lifeExp ~ gdpPercap + pop, data = .)

summary(lm3)

lm4 = gapminder %>%
  filter(year == 1982) %>%
  lm(lifeExp ~ gdpPercap + pop, data = .)

summary(lm4)