###################################
# Aspen Coyle, afcoyle@uw.edu
# Roberts Lab, UW-SAFS
# 2022-07-27
# File containing functions created during Hematodinium modeling project
###################################

#### read_tidbit_data(): Reads in all Tidbit data within a folder 

# Takes the following inputs:
#     data_path: Path to data folder for that specific survey or year. Don't include the last /

read_tidbit_data  <- function(data_path) {
  # Load all required libraries
  require(tidyverse)
  require(readxl)
  require(tools)
  
  # Get full path to all containing files
  tidbits <- list.files(path = data_path, full.names = TRUE, recursive = TRUE)
  
  # See if any non-Tidbit files in there. All non-Tidbit files have "_data" somewhere in the name. If so, remove them.
  # Unfortunately, so does our filepath, so we'll search for either files with "_data." or "_data_"
  # And since our formatting doesn't work if none are TRUE, we gotta do it separately
  if (any(grepl("_data.", tidbits, fixed = TRUE)) == TRUE){
    tidbits <- tidbits[-grep("_data.", tidbits, fixed = TRUE)]
  }
  if (any(grepl("_data_", tidbits, fixed = TRUE)) == TRUE){
    tidbits <- tidbits[-grep("_data_", tidbits, fixed = TRUE)]
  }
  
  
  
  # Extract year of survey
  get_year <- unlist(strsplit(data_path, split = '/', fixed = TRUE))[6]
  
  # Initialize matrix
  tidbit_data <- matrix(nrow = 0, ncol = 4)
  
  for (i in 1:length(tidbits)) {
    
    #ID the file extension
    file_type <- file_ext(tidbits[i])
    
    # If file type is xls or xlsx, read in as excel
    if (file_type == "xls" | file_type == "xlsx") {
      new_tidbit <- read_excel(path = tidbits[i], col_names = TRUE, skip = 1)
      # Remove all columns aside from 2 and 3 (1 is rownames)
      new_tidbit <- new_tidbit %>%
        select(2,3)
      # Rename columns
      names(new_tidbit) <- c("DateTime", "Temp")
      # Split DateTime column
      suppressWarnings(
        new_tidbit <- new_tidbit %>%
          separate(DateTime, into = c("Date", "Time"), sep = " ", extra = "merge")
      )
      # Drop rows with NAs
      new_tidbit <- na.omit(new_tidbit)
      
    } else if (file_type == "txt" | file_type == "TXT") {
      new_tidbit <- read.delim(file = tidbits[i], row.names = NULL)
      
    } else if (file_type == "csv" &  get_year != "2019") {
      # Find max number of columns in file
      max_cols <- max(count.fields(tidbits[i], sep = ","))
      # Read in file, prefilling col names
      new_tidbit <- read.csv(file = tidbits[i], col.names = c("Row", "DateTime", "Temp", rep("Trash", times = max_cols - 3)), skip = 2)
      # Remove all columns aside from 2 and 3 (1 is rownames)
      new_tidbit <- new_tidbit %>%
        select(c("DateTime", "Temp"))
      # Split DateTime column
      suppressWarnings(
        new_tidbit <- new_tidbit %>%
          separate(DateTime, into = c("Date", "Time"), sep = " ", extra = "merge")
      )
      # Drop rows with NAs
      new_tidbit <- na.omit(new_tidbit)
    } else if (file_type == "csv" & get_year == "2019") {
      new_tidbit <- read.csv(file = tidbits[i])
      new_tidbit <- na.omit(new_tidbit)
    } else {
      print("Aidan says unknown file type!")
      break
    }
    
    # If first column was interpreted as the names of a row (thanks to misplaced spacing), edit
    if ("row.names" %in% names(new_tidbit)) {
      names(new_tidbit) <- c("Date", "Time", "Temp")
    } else {
    }
    
    # If last column was interpreted as two separate columns (thanks to misplaced spacing), remove last one
    # Remove all columns with only NAs
    new_tidbit <- new_tidbit[colSums(!is.na(new_tidbit)) > 0]
    # Remove all columns listed as "Junk"
    if ("Junk" %in% names(new_tidbit)) {
      new_tidbit <- new_tidbit %>%
        select(-Junk)
    }
    
    
    # If we have two columns, they're "DateTime" and "Temp"
    # If we have three, they're "Date", "Time", and "Temp"
    # If we have four, they're "Date", "Time", "Temp", and "Fahrenheit temp" (to be deleted)
    # Either way, standardize names of all columns
    if (length(names(new_tidbit)) == 2) {
      names(new_tidbit) <- c("tidbit_datetime", "Temp")
    } else if (length(names(new_tidbit)) == 3) {
      names(new_tidbit) <- c("Date", "Time", "Temp")
    } else if (length(names(new_tidbit)) == 5) {
    } else if (length(names(new_tidbit)) == 4) {
      names(new_tidbit) <- c("Date", "Time", "Temp", "Fahrenheit")
      new_tidbit <- new_tidbit %>%
        select(-Fahrenheit)
    } else {
      print("Aidan says error in columns!")
      break
    }
    
    # Get name of survey
    get_survey <- unlist(strsplit(tidbits[i], split = "/", fixed = TRUE))[7]
    
    # Extract ID from name of Tidbit file, removing ending
    get_id <- tail(strsplit(tidbits[i], split = "/", fixed = TRUE)[[1]], n = 1)
    get_id <- str_remove(get_id, paste0(".", file_type))
    
    # Add year, survey, and ID to the tidbit data column
    new_tidbit$year <- get_year
    new_tidbit$survey <- get_survey
    new_tidbit$tidbit_id <- get_id
    
    # Create datetime column from existing columns
    # Change Date and Time columns to correct object type
    
    # If the tidbit measures data to the second, it parses time differently
    # than those that measure data to the minute
    # Some are also stored as dates + times, which we just need to specify
    
    # First: if it uses datetime
    if ("tidbit_datetime" %in% names(new_tidbit)) {
      new_tidbit$tidbit_datetime <- mdy_hms(new_tidbit$tidbit_datetime, tz = "US/Alaska")
      # Next: if it uses AM/PM
    } else if (any(str_detect(new_tidbit$Time, " AM")) == TRUE) {
      new_tidbit$Date <- mdy(new_tidbit$Date, tz = "US/Alaska")
      new_tidbit$Time <- parse_time(new_tidbit$Time, "%I:%M:%S %p")
      new_tidbit$tidbit_datetime <- paste(new_tidbit$Date, new_tidbit$Time)
      new_tidbit <- new_tidbit %>%
        select(-c(Date, Time))  
      # If time is in minutes
    } else if (max(nchar(new_tidbit$Time)) < 6) {
      new_tidbit$Date <- mdy(new_tidbit$Date, tz = "US/Alaska")
      new_tidbit$Time <- parse_time(new_tidbit$Time, "%H:%M")
      new_tidbit$tidbit_datetime <- paste(new_tidbit$Date, new_tidbit$Time)
      new_tidbit <- new_tidbit %>%
        select(-c(Date, Time))  
      # If time is in seconds
    } else if (max(nchar(new_tidbit$Time)) < 18) {
      new_tidbit$Date <- mdy(new_tidbit$Date, tz = "US/Alaska")
      new_tidbit$Time <- parse_time(new_tidbit$Time, "%H:%M:%S")
      new_tidbit$tidbit_datetime <- paste(new_tidbit$Date, new_tidbit$Time)
      new_tidbit <- new_tidbit %>%
        select(-c(Date, Time))
    } else {
      print("Aidan says time setup broken!")
      break
    }
    
    # Standardize length of all columns
    new_tidbit <- new_tidbit[, 1:5]
    
    
    # Join new Tidbit info to full Tidbit dataframe
    tidbit_data <- rbind(tidbit_data, new_tidbit)
  }
  # Set the temperature column to numeric
  tidbit_data$Temp <- as.numeric(tidbit_data$Temp)
  
  # Reorder data columns for consistency
  tidbit_data <- tidbit_data %>%
    select(year, survey, tidbit_id, tidbit_datetime, Temp)
  
  # Return tidbit data
  tidbit_data
}


#### fix_long_csvs() ---------------------------------

fix_long_csvs <- function(filepath) {
  
  issue <- read.csv(filepath)
  
  # Drop the unneeded column of Fahrenheit values
  issue <- issue %>%
    select(-Temperature....F.)
  
  # Split date and time columns up
  issue <- issue %>%
    separate(Date.Time, c("Date", "Time"), sep = " ")
  
  # Done! Just write it out
  write.table(issue, file = filepath,
              sep = "\t",
              row.names = FALSE)
  
}

#### fix_timetemp_comma() ----------------------------------

# This quick function just splits apart tables
# where format is column 1 = date, column 2 = time,temp

fix_timetemp_comma <- function(filepath) {
  
  issue <- read.delim(file = filepath)
  
  # Split the time/temp column apart using the comma
  issue <- issue %>%
    separate(2, c("Time", "Temp"), sep = ",")
  
  # Done! Write it out
  write.table(issue, file = filepath,
              sep = "\t",
              row.names = FALSE)
}

#### fix_longhead_txt() -----------------------------

# This quick function is meant to fix issues in the 2016-2019 data 
# in which .txt files are reading in with too many columns

fix_longhead_txt <- function(filepath) {
  # Get maximum number of columns in file
  max_cols <- max(count.fields(filepath, sep = "\t"))
  
  # Read in file, prefilling column names
  issue <- read.delim(file = filepath, col.names = c("DateTime", "Temp", rep("Trash", times = max_cols - 2)))
  
  # Remove all columns aside from 1 and 2
  issue <- issue %>%
    select(c("DateTime", "Temp"))
  
  # Split the DateTime column
  suppressWarnings(
    issue <- issue %>%
      separate(DateTime, into = c("Date", "Time"), sep = " ")
  )
  
  # Write out file
  write.table(issue, file = filepath,
              sep = "\t",
              row.names = FALSE)
  
}


#### fix_txt_headers()------------------------

#This quick function is meant to fix issues in the 2018-2019 dataset in which R misreads the columns

fix_txt_headers <- function(filepath) {
  # Read in file, skipping first line
  issue <- read.delim(file = filepath, skip = 1, header = FALSE)
  # Label first three columns
  names(issue) <- c("date", "time", "temp")
  # Select those three columns
  issue <- issue %>%
    select(date, time, temp)
  
  # Done! Write it out
  write.table(issue, file = filepath,
              sep = "\t",
              row.names = FALSE)
}

#### sim_dat()---------------------------------------

# This quick function is meant to repeatedly simulate a model and get the number of bitter crab

sim_dat <- function(model, data, num_sims){
  
  sim_results <- matrix(nrow = num_sims, ncol = 3)
  names(sim_results) <- c("sim_num", "Total_Bitter", "pct_change_from_data")
  num_bitter <- sum(data$Bitter == 1)
  
  for (i in 1:num_sims){
    mod_res <- simulate(model)
    mod_res <- mod_res$sim_1[,1]
    sim_results[i, 1] <- i                     # Simulation number
    sim_results[i, 2] <- sum(mod_res)          # Total number of bitter crab
    sim_results[i, 3] <- (sum(mod_res) / nrow(data)) - (num_bitter / nrow(data))
  }
  colnames(sim_results) <- c("sim_num", "total_bitter", "pct_change_from_data")
  return(as.data.frame(sim_results))
}


#### pred_dat()
# This function is meant to repeatedly predict model output from the variables included
# and get the number of bitter crab. Uses

# model: A specific model you want to simulate
# data: the data with which that model was simulated (or new data that you want to test model accuracy on)
# num_sims: the number of simulations you want to run

pred_dat <- function(model, data, num_sims){
  
  # Set up empty matrix
  pred_results <- matrix(nrow = num_sims, ncol = 3)
  # Extract number of bitter crab in the actual data
  num_bitter <- sum(data$Bitter == 1)
  
  #### See what variables are in the model, pull into vector
  mod_vars <- as.character(model$call)[2]
  # Remove the Bitter part and random effects, since they'll be in all
  mod_vars <- str_split(mod_vars, pattern = fixed(" ~ "))[[1]][2]
  mod_vars <- str_split(mod_vars, pattern = fixed("+ (1 | Site) + (1 | s.Year)"))[[1]][1]
  # Split by the plus sign, remove spaces
  mod_vars <- str_split(mod_vars, pattern = fixed("+"))[[1]]
  mod_vars <- str_remove_all(mod_vars, pattern = " ")
  # Print variables in the model
  print(paste("Model", i, "variables are:"))
  print(mod_vars)
  
  # Choose all variables in model, put in new dataframe
  mod_vars <- names(data)[names(data) %in% mod_vars]
  mod_dat <- data %>%
    dplyr::select(c(Bitter, mod_vars, Site, s.Year))
  
  # Each run of this for loop is a different simulation
  for (i in 1:num_sims){
    mod_dat$Pred <- predict(model, new.data = mod_dat, type = "response")
    pred_results[i, 1] <- i                              # simulation number
    pred_results[i, 2] <- sum(mod_dat$Pred)              # Number of bitter crab in simulation
    pred_results[i, 3] <- sum(mod_dat$Pred) - num_bitter # Difference between simulated and true number of bitter crabs
  }
  colnames(pred_results) <- c("sim_num", "total_bitter", "pct_change_from_data")
  return(as.data.frame(pred_results))
}