library(tidyverse) library(DBI) library(arrow) csv_dir <- file.path(Sys.getenv("RAW_DATA_DIR"), "sirca") pq_dir <- file.path(Sys.getenv("DATA_DIR"), "sirca") if (!dir.exists(pq_dir)) dir.create(pq_dir) si_au_ref_names_csv <- file.path(csv_dir, "si_au_ref_names.csv.gz") si_au_ref_names <- read_csv(si_au_ref_names_csv, guess_max = Inf, show_col_types = FALSE) |> mutate(across(c(SeniorSecurity, ListDate_DaysSince, DelistDate_DaysSince, RecordCount, GICSIndustry, SIRCAIndustryClassCode, SIRCASectorCode), as.integer), across(ends_with("Date"), \(x) dmy(if_else(x == "0/01/1900", NA, x)))) |> select(-ends_with("_YMD")) |> distinct() |> filter(!(Gcode == "rgwb1" & is.na(GICSIndustry))) |> rename_with(str_to_lower) |> write_parquet(sink = file.path(pq_dir, "si_au_ref_names.parquet")) si_au_ref_trddays_csv <- file.path(csv_dir, "si_au_ref_trddays.csv.gz") si_au_ref_trddays <- read_csv(si_au_ref_trddays_csv, col_types = "-iDii") |> relocate(date) |> write_parquet(sink = file.path(pq_dir, "si_au_ref_trddays.parquet")) si_au_retn_mkt_csv <- file.path(csv_dir, "si_au_retn_mkt.csv.gz") si_au_retn_mkt <- read_csv(si_au_retn_mkt_csv, col_types = "-iDdddddd", locale = locale(date_format = "%d/%m/%Y"), name_repair = str_to_lower) |> relocate(date) |> write_parquet(sink = file.path(pq_dir, "si_au_retn_mkt.parquet")) si_au_prc_daily_csv <- file.path(csv_dir, "si_au_prc_daily.csv.gz") si_au_prc_daily_pq <- file.path(pq_dir, "si_au_prc_daily.parquet") export_parquet <- function(df, file) { db <- df[["src"]][["con"]] df <- dplyr::collapse(df) sql <- paste0("COPY (", dbplyr::remote_query(df), ") TO '", file, "'") DBI::dbExecute(db, sql) invisible(df) } db <- dbConnect(duckdb::duckdb()) si_au_prc_daily <- tbl(db, str_c("read_csv('", si_au_prc_daily_csv, "', DateFormat = '%Y%m%d', types = {'dateymd': 'DATE', 'dayssince': 'INTEGER', 'weekday': 'INTEGER', 'monthend': 'BOOLEAN', 'seniorsecurity': 'INTEGER'})")) |> select(-date) |> rename(date = dateymd) |> export_parquet(file = si_au_prc_daily_pq) dbDisconnect(db)