data_analysis_adid2010_data_ema <- function(overwrite = FALSE) { set.seed(42) # find root directory root <- rprojroot::is_rstudio_project output <- root$find_file( ".setup", "data-raw", "adid2010-data-ema.Rds" ) raw_data <- root$find_file( ".setup", "data-raw", "adid2010-ema.txt" ) if (!file.exists(output)) { write <- TRUE } else { if (overwrite) { write <- TRUE } else { write <- FALSE } } if (!file.exists(raw_data)) { write <- FALSE } if (write) { cat("\ndata_analysis_adid2010_data_ema\n") Sys.setenv( OMP_NUM_THREADS = "1", MKL_NUM_THREADS = "1", OPENBLAS_NUM_THREADS = "1" ) data <- read.csv( file = root$find_file( ".setup", "data-raw", "adid2010-ema.txt" ) ) # subset the data data <- data[ , c("id", "sdate", "stime", "allneg", "allpos"), drop = FALSE ] # replace -999 with NA is_num <- vapply( X = data, FUN = is.numeric, FUN.VALUE = logical(1) ) data[is_num] <- lapply( X = data[is_num], FUN = function(x) { x[x == -999] <- NA x } ) is_chr <- vapply( X = data, FUN = is.character, FUN.VALUE = logical(1) ) data[is_chr] <- lapply( X = data[is_chr], FUN = function(x) { x[x == "-999"] <- NA x } ) # create clock time # date: "10/29/08" (mm/dd/yy) time: 6.54 (decimal hours since midnight) makeClockTime <- function(date, time, tz = "America/New_York") { date_chr <- trimws(as.character(date)) d <- suppressWarnings( as.Date( date_chr, tryFormats = c("%m/%d/%y", "%m/%d/%Y", "%Y-%m-%d") ) ) t <- suppressWarnings(as.numeric(time)) out <- as.POSIXct(rep(NA_character_, length(d)), tz = tz) ok <- !is.na(d) & !is.na(t) if (any(ok)) { secs <- round(t[ok] * 3600) # decimal hours -> seconds # Build a naive timestamp string, then "attach" tz without shifting it base <- format(d[ok], "%Y-%m-%d") h <- secs %/% 3600 m <- (secs %% 3600) %/% 60 s <- secs %% 60 stamp <- sprintf("%s %02d:%02d:%02d", base, h, m, s) out[ok] <- as.POSIXct(stamp, tz = tz, format = "%Y-%m-%d %H:%M:%S") } out } data$clock_time <- makeClockTime(data$sdate, data$stime) # create discrete time rounded to the nearest hour ct_num <- as.numeric(data$clock_time) data$hour_time <- as.POSIXct( round(ct_num / 3600) * 3600, origin = "1970-01-01", tz = "America/New_York" ) # insert missing hours within each id data <- data[order(data$id, data$hour_time, data$clock_time), ] key <- paste(data$id, data$hour_time) data <- data[!duplicated(key), ] # insert missing hours within each id data <- data[order(data$id, data$hour_time, data$clock_time), ] key <- paste(data$id, data$hour_time) data <- data[!duplicated(key), ] ids <- unique(data$id) completed_list <- vector("list", length(ids)) names(completed_list) <- ids for (i in seq_along(ids)) { this_id <- ids[i] dsub <- data[data$id == this_id, , drop = FALSE] if (all(is.na(dsub$hour_time))) { completed_list[[i]] <- dsub next } tmin <- min(dsub$hour_time, na.rm = TRUE) tmax <- max(dsub$hour_time, na.rm = TRUE) completed_list[[i]] <- data.frame( id = this_id, hour_time = seq(from = tmin, to = tmax, by = "hour"), stringsAsFactors = FALSE ) } grid_all <- do.call(rbind, completed_list) data <- merge( grid_all, data, by = c("id", "hour_time"), all.x = TRUE, sort = FALSE ) # make sure that the first measurement is no missing data <- lapply( X = unique(data[, "id"]), FUN = function(id) { i <- data[which(data[, "id"] == id), , drop = FALSE] test <- any( is.na( i[1, c("allneg", "allpos")] ) ) while (test) { i <- i[-1, ] test <- any( is.na( i[1, c("allneg", "allpos")] ) ) } i <- i[order(i[, "hour_time"]), ] idx <- seq_len(dim(i)[1]) - 1 cbind( i, idx ) } ) # linear detrend DetrendSeries <- function(y, x, keep_mean = TRUE) { ok <- !is.na(y) & !is.na(x) if (sum(ok) < 2L || length(unique(x[ok])) < 2L) { return(y) } fit <- stats::lm(y ~ x, subset = ok) pred <- stats::predict(fit, newdata = data.frame(x = x)) res <- y - pred if (keep_mean) res + mean(y[ok]) else res } data <- lapply( X = data, FUN = function(i) { na_dt <- DetrendSeries( y = i[, "allneg"], x = i[, "idx"] ) pa_dt <- DetrendSeries( y = i[, "allpos"], x = i[, "idx"] ) i <- cbind( i, na_dt = na_dt, pa_dt = pa_dt ) i <- i[, c("id", "idx", "na_dt", "pa_dt")] rownames(i) <- NULL colnames(i) <- c( "id", "time", "na", "pa" ) i } ) # data frame data <- as.data.frame( do.call( what = "rbind", args = data ) ) data <- data[ !( data$id %in% c( 1009, 1051, 1074, 1126, 1127, 1137, 1143, 1173, 1184, 1196, 1203, 1229, 1257, 1273, 1278, 1285, 1288, 1302, 1308, 1328 ) ), ] # save saveRDS( object = data, file = output ) } } data_analysis_adid2010_data_ema() rm(data_analysis_adid2010_data_ema)