# load packages
library(tidyverse)
library(rvest)
library(janitor)

# download 538 college major data
collmaj538_file <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv"
download.file(collmaj538_file, "data/collmaj538.csv")

# Social Security Data
birth_file <- "https://www.ssa.gov/oact/babynames/numberUSbirths.html"

ssa <- birth_file |>
  read_html() |>          # read in raw html file
  html_nodes("table") |>  # pull tables only
  pluck(1) |>             # pull first table only
  html_table() |>         # converts to rectangular data
  clean_names()

# trend in % male over time
# convert columns to numeric
# need % male variable
ssa <- ssa |>
  # converting to numeric by first removing commas
  mutate(male = as.numeric(str_remove_all(male, ",")),
         female = as.numeric(str_remove_all(female, ",")),
         total = as.numeric(str_remove_all(total, ","))) |>
  mutate(perc_male = male / total)

ggplot(ssa, aes(x = year_of_birth, y = perc_male)) +
  geom_line()

ggplot(ssa, aes(x = year_of_birth, y = total)) +
  geom_line()

# Wikipedia State Population Data
statepop_file <- "https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population"

statepop <- statepop_file |>
  read_html() |>          # read in raw html file
  html_nodes("table") |>  # pull tables only
  pluck(1) |>             # pull first table only
  html_table() |>         # converts to rectangular data
  clean_names()

pop_totals <- statepop |>
  slice(57:60)            # extract "total" rows at the bottom

statepop <- statepop |>
  slice(1:56) |>
  select(3, 4, 9)

names(statepop) <- c("state", "pop2020", "pop_per_electoral2020")

statepop <- statepop |>
  # strip commas, then convert both columns to numeric
  mutate(across(c(pop2020, pop_per_electoral2020),
                ~ as.numeric(str_remove_all(.x, ","))))

saveRDS(statepop, "data/statepop_2020.RDS")

# PSSA Data
pssa_file <- "https://www.education.pa.gov/DataAndReporting/Assessments/Pages/PSSA-Results.aspx"

# inspect all tables on the page to find the ELA and math results
pssa_file |>
  read_html() |>          # read in raw html file
  html_nodes("table") |>
  html_table()

pssa_ela <- pssa_file |>
  read_html() |>          # read in raw html file
  html_nodes("table") |>  # pull tables only
  pluck(2) |>             # pull second table only (ELA)
  html_table() |>         # converts to rectangular data
  clean_names()

pssa_math <- pssa_file |>
  read_html() |>          # read in raw html file
  html_nodes("table") |>  # pull tables only
  pluck(3) |>             # pull third table only (math)
  html_table() |>         # converts to rectangular data
  clean_names()

pssa_all <- bind_rows(pssa_ela, pssa_math)

saveRDS(pssa_all, "data/pssa_2025.RDS")

# Reddit example
library(httr)
library(jsonlite)
library(dplyr)   # dplyr and tibble are also attached by tidyverse above
library(tibble)

# Reddit API search for "Villanova"
url <- "https://www.reddit.com/search.json?q=Villanova&limit=100"
res <- GET(url, user_agent("vu-class-app"))
txt <- content(res, as = "text", encoding = "UTF-8")
dat <- fromJSON(txt)

# Extract post data
VU_posts <- dat$data$children$data

# Olympics data
olympics_file <- "https://en.wikipedia.org/wiki/100_metres_at_the_Olympics#Olympic_record_progression"

men <- olympics_file |>
  read_html() |>          # read in raw html file
  html_nodes("table") |>
  html_table() |>
  pluck(13) |>            # extract men's 100m record progression
  clean_names()

women <- olympics_file |>
  read_html() |>          # read in raw html file
  html_nodes("table") |>
  html_table() |>
  pluck(14) |>            # extract women's 100m record progression
  clean_names()

# stack the two tables, tagging each row with sex via .id
all <- bind_rows("M" = men, "F" = women, .id = "sex")

saveRDS(all, "data/olympics.RDS")
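
# A minimal sketch, not part of the original workflow: the 538 college-major
# CSV is downloaded above but never read back in. This reads it from the same
# path used by download.file() and standardizes the column names; no specific
# column names are assumed.
collmaj538 <- read_csv("data/collmaj538.csv") |>
  clean_names()
glimpse(collmaj538)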
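
# Another sketch, with assumptions: Reddit listing JSON typically includes
# per-post fields such as title, subreddit, score, num_comments, and
# created_utc, but the exact columns can vary, so treat these names as
# assumptions. any_of() keeps the code from erroring if a field is missing.
VU_posts_tbl <- as_tibble(VU_posts) |>
  select(any_of(c("title", "subreddit", "score", "num_comments", "created_utc")))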