---
title: "Salary Survey"
date: 2021-05-18
output: html_output
---

```{r setup, include=FALSE}

knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(tidytuesdayR)
library(scales)
library(lubridate)
theme_set(theme_light())
```

# Load the weekly Data

Dowload the weekly data and make available in the `tt` object.

```{r Load}

tt <- tt_load("2021-05-18")

survey <- tt$survey %>%
  mutate(timestamp = mdy_hms(timestamp),
         age_category = fct_relevel(fct_reorder(how_old_are_you, parse_number(how_old_are_you)), "under 18"),
         experience_overall = str_replace(overall_years_of_professional_experience, " - ", "-"),
         experience_overall = fct_reorder(experience_overall, parse_number(experience_overall)),
         experience_in_field = str_replace(years_of_experience_in_field, " - ", "-"),
         experience_in_field = fct_reorder(experience_in_field, parse_number(experience_in_field))) %>%
  mutate(gender = fct_collapse(coalesce(gender, "Other or prefer not to answer"), "Other or prefer not to answer" = c("Other or prefer not to answer", "Prefer not to answer")),
         race = fct_lump(coalesce(race, "Other"), 4))

survey %>%
  count(years_of_experience_in_field)

survey %>%
  count(industry, sort = TRUE)

survey %>%
  count(job_title, sort = TRUE)

survey %>%
  count(currency, sort = TRUE)
```

```{r}
survey_usd <- survey %>%
  filter(currency == "USD") %>%
  filter(annual_salary >= 5000,
         annual_salary <= 2e6) %>%
  mutate(state = str_remove(state, ", .*"))
```

```{r}
survey_usd %>%
  ggplot(aes(annual_salary)) +
  geom_histogram() +
  scale_x_log10(labels = dollar_format()) +
  labs(x = "Annual")

summarize_salary <- function(tbl) {
  tbl %>%
    summarize(n = n(),
              median_salary = median(annual_salary)) %>%
    arrange(desc(n))
}

plot_categorical <- function(tbl, column, n_levels = 9, reorder = TRUE) {
  lumped_tbl <- tbl %>%
    filter(!is.na({{ column }})) %>%
    mutate({{ column }} := fct_lump({{ column }}, n_levels))
  
  if (reorder) {
    lumped_tbl <- lumped_tbl %>%
      mutate({{ column }} := fct_reorder({{ column }}, annual_salary))
  }

  lumped_tbl %>%
    group_by({{ column }}) %>%
    summarize_salary() %>%
    ggplot(aes(median_salary, {{ column }})) +
    geom_col() +
    scale_x_continuous(labels = dollar_format()) +
    labs(x = "Median salary")
}

survey_usd %>%
  plot_categorical(state)

survey_usd %>%
  plot_categorical(industry)

survey_usd %>%
  plot_categorical(job_title, n_levels = 10)

survey_usd %>%
  plot_categorical(experience_overall, reorder = FALSE)

survey_usd %>%
  plot_categorical(experience_in_field, reorder = FALSE)

survey_usd %>%
  plot_categorical(age_category, reorder = FALSE)

survey_usd %>%
  plot_categorical(gender)

survey_usd %>%
  plot_categorical(race, n_levels = 4)
```


### ANOVA

```{r}
survey_usd %>%
  filter(!is.na(experience_overall)) %>%
  ggplot(aes(annual_salary, experience_overall)) +
  geom_boxplot() +
  scale_x_log10()

library(broom)

lm(log2(annual_salary) ~ experience_overall, data = survey_usd)
lm(log2(annual_salary) ~ experience_in_field, data = survey_usd) %>%
  summary()

survey_usd %>%
  mutate(job_title = fct_lump(job_title, 20)) %>%
  lm(log2(annual_salary) ~ job_title, data = .) %>%
  summary()

survey_usd %>%
  mutate(job_title = fct_lump(job_title, 10),
         state = fct_lump(state, 10),
         industry = fct_lump(industry, 10)) %>%
  lm(log2(annual_salary) ~ job_title + state + city + experience_in_field + gender + race + industry, data = .) %>%
  anova() %>%
  tidy() %>%
  mutate(pct_variation = sumsq / sum(sumsq)) %>%
  arrange(desc(pct_variation))
```


```{r}
survey_usd %>%
  count(overall_years_of_professional_experience, sort = TRUE)
```

### Machine Learning

```{r}
library(tidymodels)

set.seed(2021)

survey_usd_split <- initial_split(survey_usd)

survey_usd_training <- training(survey_usd_split)
survey_usd_testing <- testing(survey_usd_split)

rec <- survey_usd_training %>%
  recipe(annual_salary ~ job_title + state + city + experience_in_field + gender + race + industry +
           highest_level_of_education_completed) %>%
  step_novel(all_nominal()) %>%
  step_unknown(job_title, industry, state, city, highest_level_of_education_completed) %>%
  step_log(annual_salary, base = 2) %>%
  step_other(job_title, industry, state, city, threshold = tune())
```

```{r}
training_cv <- vfold_cv(survey_usd_training)

threshold_grid <- crossing(threshold = c(.001, .003, .01, .03, .1))

linear_model_cv_tune_threshold <- linear_reg() %>%
  set_engine("lm") %>%
  tune_grid(rec, training_cv, grid = threshold_grid)

linear_model_cv_tune_threshold %>%
  autoplot() +
  scale_x_log10()
```

Choose a threshold of .001.
```{r}
rec_with_threshold <- rec %>%
  finalize_recipe(list(threshold = .001))
```

Compare to random forests.

```{r}
doParallel::registerDoParallel(cores = 4)

rf_grid <- crossing(mtry = c(2, 3, 4, 5),
                    trees = c(30, 100, 200, 300))

training_cv5 <- vfold_cv(survey_usd_training, v = 5)

linear_model_cv <- linear_reg() %>%
  set_engine("lm") %>%
  fit_resamples(rec_with_threshold, training_cv5)

random_forest_tune <- rand_forest(mode = "regression", mtry = tune(), trees = tune()) %>%
  set_engine("ranger") %>%
  tune_grid(rec_with_threshold, training_cv5, grid = rf_grid,
            control = control_grid(verbose = TRUE))

linear_model_metrics <- linear_model_cv %>%
  collect_metrics()

linear_model_metrics

random_forest_tune %>%
  collect_metrics() %>%
  filter(.metric == "rsq") %>%
  ggplot(aes(trees, mean, color = factor(mtry))) +
  geom_line() +
  geom_hline(yintercept = linear_model_metrics$mean[2], lty = 2) +
  labs(y = "R Squared",
       color = "# of splits",
       x = "# of trees")
```