library(fs)
library(readr)
library(arrow)
library(dplyr)
library(glue)
library(gt)
library(gtExtras)

# Helpers: file size and import timings ----

#' Size of a file on disk.
#'
#' @param file_path Path passed to `file.info()`.
#' @return The size reported by `file.info()`, wrapped as `fs::fs_bytes`.
file_size <- function(file_path) {
  size_in_bytes <- file.info(file_path)$size
  fs::fs_bytes(size_in_bytes)
}

#' Time how long a reader function takes to import a file.
#'
#' @param path Path handed to `import_fun` as its first argument.
#' @param import_fun Function used to read the file.
#'   NOTE(review): the default `.f` is not defined anywhere in this file, so
#'   calling without `import_fun` only works if `.f` exists in an enclosing
#'   environment -- confirm or pass the reader explicitly.
#' @param ... Further arguments forwarded to `import_fun`.
#' @return A `difftime`: elapsed wall-clock time of the import.
import_time_csv <- function(path, import_fun = .f, ...) {
  start_time <- Sys.time()
  # Only the elapsed time matters; the imported data is discarded.
  import_fun(path, ...)
  Sys.time() - start_time
}

#' Time how long it takes to open and collect an Arrow dataset.
#'
#' @param path Dataset location passed to `arrow::open_dataset()`.
#' @param col_names Optional character vector of columns to keep; names that
#'   are absent are silently ignored (`any_of()`). `NULL` keeps every column.
#' @return A `difftime`: elapsed wall-clock time up to the end of `collect()`.
import_time_parquet <- function(path, col_names = NULL) {
  start_time <- Sys.time()

  parquet_data <- open_dataset(path)
  if (!is.null(col_names)) {
    parquet_data <- parquet_data %>%
      select(any_of(col_names))
  }
  # The query is only materialised here, so this is what is really timed.
  parquet_data <- parquet_data %>%
    collect()

  Sys.time() - start_time
}

#' Time the import of one region from a hive-partitioned dataset.
#'
#' Same as `import_time_parquet()`, except that the dataset is opened with
#' `hive_style = TRUE` and filtered on `region == "24"` before collecting.
#'
#' @param path Dataset location passed to `arrow::open_dataset()`.
#' @param col_names Optional character vector of columns to keep; names that
#'   are absent are silently ignored (`any_of()`). `NULL` keeps every column.
#' @return A `difftime`: elapsed wall-clock time up to the end of `collect()`.
import_time_parquet_partioned <- function(
  path = "./data/RPindividus.parquet",
  col_names = NULL
) {
  start <- Sys.time()

  parquet_data <- open_dataset(path, hive_style = TRUE) %>%
    filter(region == "24")
  if (!is.null(col_names)) {
    parquet_data <- parquet_data %>%
      select(any_of(col_names))
  }
  parquet_data <- parquet_data %>%
    collect()

  Sys.time() - start
}

# Results table ----

#' Assemble the nine benchmark rows into a data frame ready for reporting.
#'
#' @param disk_usage List-like with elements `sample_csv`, `sample_parquet`,
#'   `full_parquet` and `full_csv` (sizes in bytes).
#' @param timings List-like of `difftime` values named `csv_sample`,
#'   `csv_sample_subset`, `parquet_sample`, `parquet_sample_subset`,
#'   `parquet_full`, `parquet_full_subset`, `parquet_partitioned_full`,
#'   `parquet_partitioned_full_subset` and `csv_full`.
#' @param dimensions List-like with elements `complete` and `sample`, each
#'   indexed as `[1]` for the row count and `[2]` for the column count
#'   (presumably `dim()` outputs -- confirm against the caller).
#' @return A data frame with one row per benchmark, the `*_bar` duplicates
#'   used for bar plots, and display columns `emo`, `partitioned` and `cols`.
create_results_df <- function(disk_usage, timings, dimensions) {
  results_df <- data.frame(
    format = c(
      rep(c("CSV", "Parquet", "Parquet", "Parquet partitionné"), each = 2),
      "CSV"
    ),
    cols = c(
      rep(c("Toutes", "Sous-ensemble"), times = 4),
      "Toutes"
    ),
    cols_number = c(
      rep(c(dimensions$complete[2], dimensions$sample[2]), 4),
      dimensions$complete[2]
    ),
    rows_number = c(
      rep(c(dimensions$complete[1], dimensions$sample[1]), 3),
      rep(dimensions$sample[1], 2),
      dimensions$complete[1]
    ),
    disk = c(
      rep(
        c(
          disk_usage$sample_csv,
          disk_usage$sample_parquet,
          disk_usage$full_parquet
        ),
        each = 2
      ),
      rep(disk_usage$full_parquet, 2),
      disk_usage$full_csv
    ),
    # Timings are normalised to seconds whatever unit each difftime carries.
    import = as.numeric(
      c(
        timings$csv_sample,
        timings$csv_sample_subset,
        timings$parquet_sample,
        timings$parquet_sample_subset,
        timings$parquet_full,
        timings$parquet_full_subset,
        timings$parquet_partitioned_full,
        timings$parquet_partitioned_full_subset,
        timings$csv_full
      ),
      units = "secs"
    ),
    sample = c(
      rep(TRUE, 4),
      rep(FALSE, 2),
      rep(TRUE, 2),
      FALSE
    )
  )

  # Turn the "partitionné" suffix into its own marker column, then drop the
  # suffix so both Parquet variants share the same `format` label.
  results_df <- results_df %>%
    mutate(
      partitioned = case_when(
        (sample) & (grepl("partitionné", format)) ~ "✅️",
        (sample) & (format != "CSV") ~ "❌️",
        TRUE ~ ""
      )
    ) %>%
    mutate(
      format = gsub(" partitionné", "", format)
    )

  results_df %>%
    mutate(disk = fs::fs_bytes(disk)) %>%
    mutate(disk_bar = as.numeric(disk)) %>%
    mutate(
      import_bar = import,
      cols_bar = cols_number,
      rows_number_bar = rows_number
    ) %>%
    # Alphabetical column order first; the select() below then pins the
    # leading columns and leaves the rest in that alphabetical order.
    select(order(colnames(.))) %>%
    mutate(cols = glue("_{cols}_ (**{cols_number}** colonnes)")) %>%
    mutate(emo = if_else(format == "Parquet", "🐎", "🐢")) %>%
    select(
      emo, format, partitioned, cols, sample,
      starts_with("rows_"), starts_with("cols_"),
      everything()
    ) %>%
    arrange(desc(sample), desc(cols), format)
}

# Report ----

#' Render the benchmark results as a `gt` table.
#'
#' @param df Data frame shaped like the output of `create_results_df()`.
#' @return A `gt` table object.
#'
#' NOTE(review): `dims_sample` and `dims_complete` are not parameters; they are
#' looked up in the enclosing environment and must exist when this is called.
create_report_table <- function(df) {
  # Build the table from the argument; the previous code read a global
  # `results_df` and ignored `df` entirely.
  tab <- gt(df) %>%
    cols_hide(
      columns = c(
        "sample", "cols_bar", "cols_number", "rows_number", "rows_number_bar"
      )
    ) %>%
    fmt_markdown(columns = c("sample", "cols")) %>%
    fmt_number(columns = "import", decimals = 2) %>%
    gtExtras::gt_plt_bar(
      column = disk_bar,
      color = "#ff562c"
    ) %>%
    gtExtras::gt_plt_bar(
      column = cols_bar,
      color = "#ff562c"
    ) %>%
    gtExtras::gt_plt_bar(
      column = rows_number_bar,
      color = "#ff562c"
    ) %>%
    gtExtras::gt_plt_bar(
      column = import_bar,
      color = "#ff562c"
    ) %>%
    tab_spanner(
      label = md("**Configuration**"),
      columns = c(
        "emo", "format", "partitioned", "cols", "sample",
        starts_with("cols_"), starts_with("rows_")
      )
    ) %>%
    tab_spanner(
      label = md("**Taille sur disque**
_(MiB ou GiB)_"),
      columns = starts_with("disk")
    ) %>%
    tab_spanner(
      label = md("**Vitesse à l'import**
_(secondes)_"),
      columns = starts_with("import")
    ) %>%
    cols_label(
      emo = "",
      format = "*Format du fichier*",
      partitioned = "*Partitionné?*",
      cols = "*Colonnes*",
      sample = "*Echantillon de données ?*",
      disk = "",
      import = "",
      ends_with("_bar") ~ "",
      .fn = md
    ) %>%
    tab_row_group(
      label = md(
        glue(
          "**Seulement le Centre Val de Loire ({nrows} observations)**",
          nrows = format(dims_sample[1], big.mark = " ")
        )
      ),
      rows = (sample == TRUE),
      id = "sample"
    ) %>%
    tab_row_group(
      label = md(
        glue(
          "**Ensemble des données ({nrows} observations)**",
          nrows = format(dims_complete[1], big.mark = " ")
        )
      ),
      rows = (sample == FALSE),
      id = "full"
    ) %>%
    tab_style(
      style = list(
        cell_fill(color = "#4758AB"),
        cell_text(color = "white")
      ),
      locations = cells_row_groups()
    ) %>%
    tab_style(
      style = list(
        cell_borders(
          sides = c("top"),
          color = "#4758AB",
          weight = px(2)
        )
      ),
      locations = list(
        cells_body(
          # NOTE(review): the literal starts with a space and hard-codes 88,
          # whereas `create_results_df()` builds `cols` without a leading
          # space; as written the `cols` test looks always TRUE -- confirm
          # the intended rows before changing it.
          rows = (format == "CSV" & cols != " _Toutes_ (**88** colonnes)")
        )
      )
    ) %>%
    tab_footnote(
      footnote = md(
        "Poids de l'ensemble des données, y compris régions différentes"
      ),
      locations = cells_body(
        rows = (format == "Parquet" & partitioned == "✅️"),
        columns = "disk"
      )
    )

  tab
}