#====================================================================#
# Title: Day 2                                                       #
# Name:  Your name here                                              #
#====================================================================#

#### Code ------------------------------------------------------------
# Run through this code on your own to practice what we covered in
# the slides. Today I've put it all in one script, so keep track of 
# where you are! You can always restart R and rerun to start from 
# scratch if you've messed anything up.

# Before you do anything, make sure you're starting in a fresh session.
# Restart R by going to Session -> Restart R
# You can make it so that this also clears your workspace (highly recommended)
# by going to Tools -> Global Options -> General -> Basic -> Workspace
# and unclicking "restore .RData" and switching the other option to "never"

# Then we need to reload packages and data (same as last week)
library(tidyverse)
nlsy <- read_csv("nlsy_cc.csv")
colnames(nlsy) <- c("glasses", "eyesight", "sleep_wkdy", "sleep_wknd",
                    "id", "nsibs", "samp", "race_eth", "sex", "region", 
                    "income", "res_1980", "res_2002", "age_bir")

#### mutate -----------------------------------------------------
nlsy <- mutate(nlsy,
  region_factor = factor(region),
  age_bir_cent = age_bir - mean(age_bir),
  dataset = "NLSY"
)

nlsy_new <- mutate(nlsy,
  age_bir_cent = age_bir - mean(age_bir),
  age_bir_stand = age_bir_cent / sd(age_bir_cent)
)

nlsy <- mutate(nlsy, slp_cat_wkdy = case_when(
  sleep_wkdy < 5 ~ "little",
  sleep_wkdy < 7 ~ "some",
  sleep_wkdy < 9 ~ "ideal",
  sleep_wkdy < 12 ~ "lots",
  TRUE ~ NA_character_
))
# note that table doesn't show NAs! can be dangerous!
table(nlsy$slp_cat_wkdy, nlsy$sleep_wkdy)

nlsy <- mutate(nlsy, total_sleep = case_when(
  sleep_wknd > 8 & sleep_wkdy > 8 ~ 1,
  sleep_wknd + sleep_wkdy > 15 ~ 2,
  sleep_wknd - sleep_wkdy > 3 ~ 3,
  TRUE ~ NA_real_
))

#### Exercises 1 -----------------------------------------------------
# Answer the questions below. Remember to include your answers in
# this script so you can save and look back at it!

# 1. Using the NLSY data and `mutate()`, make a standardized (centered at the
# mean, and divided by the standard deviation) version of income.
# Answer:
nlsy <- mutate(nlsy,
  income_cent = income - mean(income),
  income_stand = income_cent / sd(income_cent)
)
summary(nlsy$income_stand)

# 2. Do the same thing, but using income on the log scale. Look at this
# variable using `summary()`. Can you figure out what happened? (Hint: look at
# log(income).)
# Answer:
nlsy <- mutate(nlsy,
  log_income_cent = log(income) - mean(log(income)),
  log_income_stand = log_income_cent / sd(log_income_cent)
)
summary(nlsy$log_income_stand)
# Some of the values of income were 0, so they were transformed to -Inf
# When we tried to take the mean of everything including those values,
# we get -Inf and -Inf - -Inf = NaN (not a number) according to R,
# then we can't take the standard deviation of that 

# 3. Redo question 2, but if you are not able to calculate log(income) for an
# observation, replace it with a missing value (using `case_when()`). This
# time, when you standardize log(income), you'll have to use `na.rm = TRUE` to
# remove missing values both when you take the mean and the standard deviation.
# Answer:

nlsy <- mutate(nlsy,
  log_income = case_when(
    income > 0 ~ log(income),
    TRUE ~ NA_real_
  ),
  log_income_cent = log_income - mean(log_income, na.rm = TRUE),
  log_income_stand = log_income_cent / sd(log_income_cent, na.rm = TRUE)
)
summary(nlsy$log_income_stand)

#### factors -----------------------------------------------------

nlsy <- mutate(nlsy, slp_cat_wkdy_ord = fct_relevel(
  slp_cat_wkdy,
  "little", "some", "ideal", "lots"
))

# misspelling
nlsy <- mutate(nlsy, slp_cat_wkdy_ord2 = fct_relevel(
  slp_cat_wkdy,
  "little", "same", "ideal", "lots"
))

nlsy <- mutate(nlsy,
  region_fact = factor(region),
  region_fact = fct_recode(region_fact,
    "Northeast" = "1",
    "North Central" = "2",
    "South" = "3",
    "West" = "4"
  )
)

nlsy <- mutate(nlsy, region_fact = fct_infreq(region_fact))

nlsy <- mutate(nlsy, region_fact = fct_rev(region_fact))

nlsy <- mutate(nlsy,
  slp_cat_wkdy_out =
    fct_explicit_na(slp_cat_wkdy, na_level = "outlier")
)

nlsy <- mutate(nlsy,
  slp_cat_wkdy_comb =
    fct_collapse(slp_cat_wkdy,
      "less" = c("little", "some"),
      "more" = c("ideal", "lots")
    )
)

nlsy <- mutate(nlsy, slp_cat_wkdy_lump = fct_lump(slp_cat_wkdy, n = 2))

#### Exercises 2 -----------------------------------------------------

# 1. Turn the eyesight variable into a factor variable. The numbers 1-5
# correspond to excellent, very good, good, fair, and poor. Make sure that
# categories are in an appropriate order.
# Answer:
nlsy <- mutate(nlsy,
  eyesight_fact = factor(eyesight),
  eyesight_fact = fct_recode(eyesight_fact,
    "Excellent" = "1",
    "Very Good" = "2",
    "Good" = "3",
    "Fair" = "4",
    "Poor" = "5"
  )
)
summary(nlsy$eyesight_fact)

# 2. Use two different methods to combine the worst two categories of eyesight
# into one category.
# Answer:
nlsy <- mutate(nlsy,
  eyesight_worst = fct_collapse(eyesight_fact,
    "Worst" = c("Fair", "Poor")
  )
)
summary(nlsy$eyesight_worst)
# can also use fct_lump since I know the two worst are the 
# two smallest categories
nlsy <- mutate(nlsy,
  eyesight_worst2 = fct_lump(eyesight_fact,
    n = 3,
    other_level = "Worst"
  )
)
summary(nlsy$eyesight_worst2)

# 3. Make a new categorical income variable with at least 3 levels (you can
# choose the cutoffs). Make a bar graph with this new variable where the bars
# are in the correct order from low to high and are colored increasingly dark
# shades of green. (Hint: http://colorbrewer2.org; `scale_color_brewer()`)
# Answer:

#### select -----------------------------------------------------

nlsy_subs <- select(nlsy, id, income, eyesight, sex, region)
nlsy_subs

select(nlsy_subs, -id, -region)

cols_I_want <- c("age_bir", "nsibs", "region")
select(nlsy, one_of(cols_I_want))

select(nlsy, starts_with("slp"))

select(nlsy, id, everything())

#### Exercises 3 -----------------------------------------------------

# 1. Create mean-centered versions of "age_bir", "nsibs", "income", and the two
# sleep variables. Use the same ending (e.g., "_cent") for all of them. Then
# make a new dataset of just the centered variables using `select()` and a
# helper.
# Answer:
nlsy <- mutate(nlsy,
               age_bir_cent = age_bir - mean(age_bir),
               nsibs_cent = nsibs - mean(nsibs),
               income_cent = income - mean(income),
               sleep_wkdy_cent = sleep_wkdy - mean(sleep_wkdy),
               sleep_wknd_cent = sleep_wknd - mean(sleep_wknd))

cent_vars <- select(nlsy, ends_with("cent"))
cent_vars

# 2. You may have added a lot of variables to the original dataset by now.
# Create a dataset called `nlsy_orig` that contains only the variables we
# started off with, using the vector of names we originally used to name the
# columns and the `one_of()` helper.
# Answer:
colnames_orig <- c("glasses", "eyesight", "sleep_wkdy", "sleep_wknd",
                   "id", "nsibs", "samp", "race_eth", "sex", "region", 
                   "income", "res_1980", "res_2002", "age_bir")
nlsy_orig <- select(nlsy, one_of(colnames_orig))
nlsy_orig

# 3. Look at `help(select)`. You'll notice that `rename()` is a related
# function. Looking at the examples to help, rename "age_bir" to
# "age_1st_birth" without making a new column.
# Answer:
nlsy <- rename(nlsy, age_1st_birth = age_bir)
nlsy

#### filter -----------------------------------------------------

wear_glasses <- filter(nlsy, glasses == 1)

yesno_glasses <- filter(nlsy, glasses == 0, glasses == 1)

glasses_great_eyes <- filter(nlsy, glasses == 1, eyesight == 1)

extreme_eyes <- filter(nlsy, eyesight == 1 | eyesight == 5)

some_regions <- filter(
  nlsy,
  region_fact == "Northeast" |
    region_fact == "South"
)

more_regions <- filter(nlsy, region_fact %in%
  c("South", "West", "Northeast"))

# other examples... play around with it!
7 %in% c(4, 6, 7, 10)
5 %in% c(4, 6, 7, 10)
!7 %in% c(4, 6, 7, 10)
!5 %in% c(4, 6, 7, 10)

northcentralers <- filter(
  nlsy,
  !region_fact %in% c("South", "West", "Northeast")
)

# compare
my_data <- filter(
  nlsy,
  age_bir_cent < 1,
  sex != 1,
  nsibs %in% c(1, 2, 3),
  !is.na(slp_cat_wkdy)
)
summary(select(my_data, age_bir_cent, sex, nsibs, slp_cat_wkdy))

oth_dat <- filter(
  nlsy,
  (age_bir_cent < 1) &
    (sex != 1 | nsibs %in% c(1, 2, 3)) &
    !is.na(slp_cat_wkdy)
)
summary(select(oth_dat, age_bir_cent, sex, nsibs, slp_cat_wkdy))

#### Exercises 4 -----------------------------------------------------

# 1. Create a dataset with all the observations that get over 7 hours of sleep
# on both weekends and weekdays *or* who have an income greater than/equal to
# 20,000 and less than/equal to 50,000.
# Answer:
newdat <- filter(nlsy,
                 (sleep_wkdy > 7 & sleep_wknd > 7) |
                   (income >= 20000 & income <= 50000))


# 2. Create a dataset that consists *only* of the missing values in
# `slp_cat_wkdy`. Check how many rows it has (there should be 3!).
# Answer:
newdat2 <- filter(nlsy, is.na(slp_cat_wkdy))
nrow(newdat2)

# 3. Look up the `between()` function in help. Figure out how to use this to
# answer question 1, when choosing people whose income is between 20,000 and
# 50,000. Check to make sure you get the same number of rows.
# Answer:
newdat3 <- filter(nlsy,
                 (sleep_wkdy > 7 & sleep_wknd > 7) |
                   between(income, 20000, 50000))
nrow(newdat)
nrow(newdat3)

#### Challenge -----------------------------------------------------

# start here!
nlsy_full <- read_rds("nlsy.rds")
colnames(nlsy_full) <- colnames_orig
levels(nlsy_full$res_1980)
levels(nlsy_full$res_2002)

nlsy_full <- mutate(nlsy_full,
  # fct_relabel -> tolower makes all the levels lowercase
  res_1980_new = fct_relabel(res_1980, tolower),
  res_2002_new = fct_relabel(res_2002, tolower),
  # rename all the mismatched labels
  res_2002_new = fct_recode(res_2002_new,
    "aboard ship, barracks" = "open bay or troop barracks, aboard ship",
    "bachelor, officer quarters" = "bachelor enlisted or officer quarters",
    "dorm, fraternity, sorority" = "dormitory, fraternity or sorority",
    "other temporary quarters" = "other temporary individual quarters",
    "on-base mil fam housing" = "on-base military family housing",
    "off-base mil fam housing" = "off-base military family housing",
    "religious institution" = "convent, monastery, other religious institute",
    "r in parental household" = "respondent in parent household"
  ),
  res_1980_new = fct_collapse(res_1980_new,
    "r in parental household" = c("r in parental household", "parental")
  ),
  res_2002_new = fct_expand(res_2002_new,
    "orphanage", "hhi conducted with parent"
  ),
  res_1980_new = fct_infreq(res_1980_new),
  # here fct_relevel will just rearrange the levels in the same order 
  # as the levels of the nlsy_res version
  res_2002_new = fct_relevel(res_2002_new, levels(res_1980_new))
)

nlsy_res <- select(nlsy_full, ends_with("_new"))
nlsy_res <- filter(nlsy_res, !is.na(res_1980_new), !is.na(res_2002_new))
nlsy_res
# still have the same levels
# that means they'll be easy to combine/compare
levels(nlsy_res$res_1980_new)
levels(nlsy_res$res_2002_new)
table(nlsy_res$res_1980_new, nlsy_res$res_2002_new)


# Exercises 1

# here's the code from the first part
nlsy <- mutate(nlsy, slp_cat_wkdy = case_when(
  sleep_wkdy < 5 ~ "little",
  sleep_wkdy < 7 ~ "some",
  sleep_wkdy < 9 ~ "ideal",
  sleep_wkdy < 12 ~ "lots",
  TRUE ~ NA_character_))
missing_sleep <- filter(nlsy, is.na(slp_cat_wkdy))
missing_sleep <- select(missing_sleep, starts_with("slp"), contains("sleep"))
missing_sleep

# 1. Rewrite the code above using pipes. Make sure you get the same dataset when your code runs!
#