library(tidyverse)

options(stringsAsFactors = FALSE)

# replace or remove this if you're not me
my_path <- "nval/ad-hocs"

# replace or remove this if you're not me with path to downloaded and merged files from repo
ccc_path <- "c:/users/ulfel/documents/nval/ccc/data_clean/ccc_compiled.csv"

ccc <- read.csv(ccc_path) %>%
  mutate(date = lubridate::date(date),
         fips_code = ifelse(nchar(fips_code) == 4, paste0("0", fips_code), fips_code),
         issues = ifelse(issues == "", "other", issues)) %>%
  filter(!is.na(date))

# regular expressions to spot various markers for DJT, Biden, and Harris in 'notes' field
regex_trump <- "f(ea)?t. (?:(former president )?donald (j\\. )?trump(?!,? jr)|president trump)"
regex_biden <- "f(ea)?t. (?:president|jo(?:e|seph)) biden"
regex_harris <- "f(ea)?t. (?:v(ice )?p(resident)?( kamala)?|kamala) harris"

# code I initially used to cast a wide net and tweak regexes until they worked as expected
# ccc$rallytag <- with(ccc, case_when(
# 
#   grepl(regex_trump, notes, perl = TRUE, ignore.case = TRUE) ~ "trump",
#   grepl(regex_biden, notes, perl = TRUE, ignore.case = TRUE) ~ "biden",
#   grepl(regex_harris, notes, perl = TRUE, ignore.case = TRUE) ~ "harris",
#   .default = ""
# 
# ))
# rally_check_set <- filter(ccc, grepl("\\btrump\\b|\\bbiden\\b|\\bharris\\b", notes, ignore.case = TRUE) & date >= "2021-01-01")
# write.csv(rally_check_set, "c:/users/ulfel/documents/nval/ad-hocs/rally-check-set-20240812.csv", row.names = FALSE)

trump <- ccc |>
  filter(grepl("^(campaign )?rally$", type, perl = TRUE, ignore.case = TRUE)) |>
  filter(grepl(regex_trump, notes, perl = TRUE, ignore.case = TRUE)) |>
  mutate(year = lubridate::year(date))

trump |>
  group_by(year) |>
  summarize(n_events = n(),
            n_na = sum(is.na(size_mean)),
            size_avg = mean(size_mean, na.rm = TRUE),
            size_median = median(size_mean, na.rm = TRUE),
            size_min = min(size_mean, na.rm = TRUE),
            size_max = max(size_mean, na.rm = TRUE))

biden <- ccc |>
  filter(grepl("^(campaign )?rally$", type, perl = TRUE, ignore.case = TRUE)) |>
  filter(grepl(regex_biden, notes, perl = TRUE, ignore.case = TRUE)) |>
  filter(date <= "2024-07-24") |>
  mutate(year = lubridate::year(date))

biden |>
  group_by(year) |>
  summarize(n_events = n(),
            n_na = sum(is.na(size_mean)),
            size_avg = mean(size_mean, na.rm = TRUE),
            size_median = median(size_mean, na.rm = TRUE),
            size_min = min(size_mean, na.rm = TRUE),
            size_max = max(size_mean, na.rm = TRUE))

harris <- ccc |>
  filter(grepl("^(campaign )?rally$", type, perl = TRUE, ignore.case = TRUE)) |>
  filter(grepl(regex_harris, notes, perl = TRUE, ignore.case = TRUE)) |>
  filter(date > "2024-07-24") |>
  filter(grepl("harris for president", organizations, ignore.case = TRUE))

harris |>
  summarize(n_events = n(),
            n_na = sum(is.na(size_mean)),
            size_avg = mean(size_mean, na.rm = TRUE),
            size_median = median(size_mean, na.rm = TRUE),
            size_min = min(size_mean, na.rm = TRUE),
            size_max = max(size_mean, na.rm = TRUE))