#' Harmonize WCPFC Nominal Catch Datasets
#'
#' geoflow action that harmonizes the WCPFC nominal catch datasets for
#' integration into the Tuna Atlas database.
#'
#' @param action The action context from geoflow (not used by this script).
#' @param entity The geoflow entity being processed. Its first three data
#'   sources are read in this order: the WCPFC nominal catch file, the WCPO
#'   nominal catch file and the "structure" file that is published as the
#'   code lists resource.
#' @param config The geoflow configuration, used to resolve job data
#'   resources and to log progress.
#'
#' @return Called for its side effects (files written next to the first raw
#'   dataset, entity temporal extent and resources updated). The value of the
#'   last \code{entity$addResource()} call is returned as-is.
#'
#' @details
#' Both catch files are read with \code{read.csv()} and stacked with
#' \code{rbind()}, so they must share the same columns. The columns used are
#' \code{YY}, \code{FLAG_CODE}, \code{GEAR_CODE}, \code{SP_CODE} and
#' \code{SP_MT}; \code{SP_NAME} and \code{FLEET_CODE} are dropped.
#'
#' Rows of the first file are kept only for the species codes listed in
#' \code{wcpfc_species} and tagged with area \code{"WCPFC"}; rows of the
#' second file are kept only for the codes listed in \code{wcpo_species} and
#' tagged with area \code{"WCPO"}. Missing and zero catches are removed and
#' the remaining catches are summed per fleet, gear, period, area, school,
#' species, catch type and unit.
#'
#' The harmonized CSV has the columns \code{fishing_fleet}, \code{gear_type},
#' \code{time_start}, \code{time_end}, \code{geographic_identifier},
#' \code{fishing_mode}, \code{species}, \code{measurement_type},
#' \code{measurement_unit}, \code{measurement_value},
#' \code{source_authority}, \code{measurement} and
#' \code{measurement_processing_level}.
#'
#' @importFrom utils read.csv write.csv
#' @importFrom stats aggregate
#' @importFrom dplyr %>% filter select mutate group_by summarise
#' @seealso \code{\link{format_time_db_format}}, sourced at run time to build
#'   the \code{time_start} and \code{time_end} columns.
#' @export
#' @keywords data harmonization, fisheries, WCPFC, tuna
#' @author Paul Taconet, IRD \email{paul.taconet@ird.fr}
#' @author Bastien Grasset, IRD \email{bastien.grasset@ird.fr}
function(action, entity, config) {
  # The time-formatting helper is fetched from the repository at run time.
  source("https://raw.githubusercontent.com/firms-gta/geoflow-tunaatlas/master/R/sardara_functions/format_time_db_format.R")

  # Packages ----
  # Attach each package, installing it first when it is missing. library()
  # is used after the install so that a failed install stops the script
  # instead of silently carrying on without the package.
  # NOTE(review): none of these is called directly below; presumably they
  # are needed by the sourced helper - confirm before trimming the list.
  for (pkg in c("dplyr", "reshape", "lubridate", "data.table", "reshape2")) {
    if (!require(pkg, character.only = TRUE)) {
      install.packages(pkg)
      library(pkg, character.only = TRUE)
    }
  }

  # Inputs ----
  # @geoflow --> with this script 2 objects are pre-loaded
  # config --> the global config of the workflow
  # entity --> the entity you are managing
  filename1 <- entity$data$source[[1]] # WCPFC data
  filename2 <- entity$data$source[[2]] # WCPO data
  filename3 <- entity$data$source[[3]] # structure
  path_to_raw_dataset1 <- entity$getJobDataResource(config, filename1)
  path_to_raw_dataset2 <- entity$getJobDataResource(config, filename2)
  config$logger.info(
    sprintf("Pre-harmonization of dataset '%s'", entity$identifiers[["id"]])
  )

  # Force UTF-8 for the duration of this action only; the caller's options
  # are put back when the function exits, including on error.
  opts <- options(encoding = "UTF-8")
  on.exit(options(opts), add = TRUE)

  # Nominal catches ----
  # Species taken from the WCPFC file.
  wcpfc_species <- c(
    "ALV", "BLM", "BSH", "BTH", "BUM", "FAL", "LMA", "MAK", "OCS", "POR",
    "PTH", "RHN", "SMA", "SPK", "SPL", "SPN", "SPY", "SPZ", "THR"
  )
  NC1 <- read.csv(path_to_raw_dataset1)
  NC1 <- NC1[NC1$SP_CODE %in% wcpfc_species, ]

  # Species taken from the WCPO file.
  wcpo_species <- c("ALB", "BET", "MLS", "PBF", "SKJ", "SWO", "YFT")
  NC2 <- read.csv(path_to_raw_dataset2)
  NC2 <- NC2[NC2$SP_CODE %in% wcpo_species, ]

  NC2$AreaName <- "WCPO"
  NC1$AreaName <- "WCPFC"

  # Bind both sources (requires identical column sets).
  NC <- rbind(NC1, NC2)

  # Rename the raw columns to the names used by the rest of the script.
  renames <- c(
    YY = "Year",
    FLAG_CODE = "FishingFleet",
    GEAR_CODE = "Gear",
    SP_CODE = "Species",
    SP_MT = "Catch"
  )
  for (old_name in names(renames)) {
    colnames(NC)[colnames(NC) == old_name] <- renames[[old_name]]
  }

  # Keep only rows carrying an actual, non-zero catch.
  NC$Catch <- as.numeric(NC$Catch)
  NC <- NC[!is.na(NC$Catch) & NC$Catch != 0, ]

  NC$SP_NAME <- NULL
  NC$FLEET_CODE <- NULL

  # Constant dimensions for this dataset.
  NC$School <- "UNK"
  NC$CatchType <- "NC"
  NC$CatchUnits <- "t"
  NC$RFMO <- "WCPFC"
  NC$Ocean <- "PAC_W"
  # Yearly records: each one starts in month 1 and lasts 12 months
  # (presumably read by format_time_db_format() together with Year).
  NC$MonthStart <- 1
  NC$Period <- 12

  # Format the time to the DB layout: one column time_start, one time_end.
  NC <- format_time_db_format(NC)

  NC <- NC[c(
    "FishingFleet", "Gear", "time_start", "time_end", "AreaName", "School",
    "Species", "CatchType", "CatchUnits", "Catch"
  )]

  # Remove 0 and NA values again, in case the helper altered any row.
  NC <- NC[!is.na(NC$Catch) & NC$Catch != 0, ]

  # Sum the catches over every remaining dimension.
  NC <- aggregate(
    NC$Catch,
    FUN = sum,
    by = list(
      FishingFleet = NC$FishingFleet,
      Gear = NC$Gear,
      time_start = NC$time_start,
      time_end = NC$time_end,
      AreaName = NC$AreaName,
      School = NC$School,
      Species = NC$Species,
      CatchType = NC$CatchType,
      CatchUnits = NC$CatchUnits
    )
  )

  # Positional rename: the nine grouping columns followed by the summed value.
  colnames(NC) <- c(
    "fishing_fleet", "gear_type", "time_start", "time_end",
    "geographic_identifier", "fishing_mode", "species", "measurement_type",
    "measurement_unit", "measurement_value"
  )
  NC$source_authority <- "WCPFC"
  NC$measurement <- "catch"
  NC$measurement_processing_level <- "raised"

  # Temporal coverage ----
  # @eblondel additional formatting for next time support
  NC$time_start <- as.Date(NC$time_start)
  NC$time_end <- as.Date(NC$time_end)
  # The extent runs from 1 January of the earliest start year to 31 December
  # of the latest end year.
  dataset_temporal_extent <- paste(
    paste0(format(min(NC$time_start), "%Y"), "-01-01"),
    paste0(format(max(NC$time_end), "%Y"), "-12-31"),
    sep = "/"
  )
  entity$setTemporalExtent(dataset_temporal_extent)

  # Outputs ----
  # @geoflow -> export as csv, in the same folder as the first raw dataset.
  base1 <- tools::file_path_sans_ext(basename(filename1))
  output_dir <- dirname(path_to_raw_dataset1)
  output_name_dataset <- file.path(output_dir, paste0(base1, "_harmonized.csv"))
  output_name_codelists <- file.path(output_dir, paste0(base1, "_codelists.csv"))

  # The third source is moved, not copied, to the code lists file name.
  file.rename(
    from = entity$getJobDataResource(config, filename3),
    to = output_name_codelists
  )

  write.csv(NC, output_name_dataset, row.names = FALSE)

  entity$addResource("source", c(path_to_raw_dataset1, path_to_raw_dataset2))
  entity$addResource("harmonized", output_name_dataset)
  entity$addResource("codelists", output_name_codelists)
}