# +++++++++++++++++++++++++
# +++ R BOOTCAMP UNIT 6 +++
# +++      Example      +++
# +++++++++++++++++++++++++
# + AUTHOR: David Dobolyi +
# +++++++++++++++++++++++++

# ---------------------
# Set Working Directory
# ---------------------

# setwd("...")

# -------------
# Load Packages
# -------------

library(dplyr)
library(nycflights13)
library(ggplot2)
library(lubridate)
library(gridExtra)

# install.packages(c("dplyr", "nycflights13", "ggplot2", "lubridate", "gridExtra"))

# ---------
# Load Data
# ---------

data(flights)

head(flights)
dim(flights)
str(flights)

# -----------------
# Data Manipulation
# -----------------

# create mean arr and dep delay columns per dplyr vignette example (https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html), making sure to include the year, month, and day columns to invoid a red message about "Adding missing grouping variables"

worstDepartures <- flights %>%
  group_by(year, month, day) %>%
  select(year, month, day, arr_delay, dep_delay) %>%
  summarise(
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)
  ) %>%
  filter(arr > 30 | dep > 30)

worstDepartures # inspect first 10 rows as a tibble

# date conversions (via lubridate)

worstDepartures <- worstDepartures %>%
  mutate(Date = ymd(paste(year, month, day, sep = "-"))) %>% # convert year-month-day into valid Dates
  mutate(MonthFloor = floor_date(Date, unit = "month")) # floor the date to to create month groups

# --------
# Plotting
# --------

# generate plot

p1 <- ggplot(worstDepartures, aes(x = MonthFloor, y = stat(count), fill = stat(count))) + 
  geom_bar(stat = "count") # regarding stat(count), see https://ggplot2.tidyverse.org/reference/stat.html

# further modify existing plot

p2 <- p1 + 
  theme_bw(base_size = 18) + 
  scale_y_continuous(breaks = seq(0, 10, by = 2)) + 
  scale_x_date(date_breaks = "1 months", date_labels = "%b") + # for more on date_labels, see Unit 3
  scale_fill_gradient(low = "grey", high = "red") + 
  labs(title = "Months with Highly Delayed Flights in NYC During 2013",
       x = "Month",
       y = "Number of Highly Delayed Flights") + 
  guides(fill = FALSE) # turn off the fill legend

# view plots individually

p1
p2

# view plots collectively

grid.arrange(p1, p2, ncol = 1)

# save plot to disk

ggsave("WorstDepartureMonths.pdf", width = 10, height = 5)

# ------------------
# Follow-Up Analysis
# ------------------

# see sorted average delay in the worst month (i.e., June)

worstDepartures %>%
  filter(month == 6) %>%
  arrange(desc(arr, dep))

# select the worst day in June

worstDepartures %>%
  filter(month == 6) %>%
  arrange(desc(arr, dep)) %>%
  head(1) %>%
  select(year, month, day)

browseURL("http://lmgtfy.com/?q=june+2013+flight+delays+nyc")

# https://www.usatoday.com/story/todayinthesky/2013/06/13/severe-storms-snarl-flights-across-the-east/2418761/