#Taller Machine Learning-Modelaje Predictivo#
#Jesús B. Zerpa|Economist & Data Scientist|#
#Código en R#
#Se instalan las siguientes paqueterías#

install.packages("workflows")
install.packages("parsnip")
install.packages("recipes")
install.packages("yardstick")
install.packages("glmnet")
install.packages("tidyverse")
install.packages("tidyquant")
install.packages("timetk")

#Se cargan en la consola de Rstudio"

library(workflows)
library(parsnip)
library(recipes)
library(yardstick)
library(glmnet)
library(tidyverse)
library(tidyquant)
library(timetk)

# Se cargan los datos y generamos un objeto

bikes <- read_csv("C:/Users/Personal/Desktop/Modelo en R/Arima manufactura/Prueba/day.csv")

# Seleccionamos las columas de fecha y valor
bikes_tbl <- bikes %>%
    select(dteday, cnt) %>%
    rename(date  = dteday,
           value = cnt)
                      
#Gráfica de datos y regiones de entrenamiento y prueba#

bikes_tbl %>%
    ggplot(aes(x = date, y = value)) +
    geom_rect(xmin = as.numeric(ymd("2012-07-01")),
              xmax = as.numeric(ymd("2013-01-01")),
              ymin = 0, ymax = 10000,
              fill = palette_light()[[4]], alpha = 0.01) +
    annotate("text", x = ymd("2011-10-01"), y = 7800,
             color = palette_light()[[1]], label = "Región de entrenamiento") +
    annotate("text", x = ymd("2012-10-01"), y = 1550,
             color = palette_light()[[1]], label = "Región de Prueba") +
    geom_point(alpha = 0.5, color = palette_light()[[1]]) +
    labs(title = "Bikes Sharing Dataset: Escala Diaria", x = "") +
    theme_tq()

#Dividir los datos en conjuntos de entenamiento y prueba#

train_tbl <- bikes_tbl %>% filter(date < ymd("2012-07-01"))
test_tbl  <- bikes_tbl %>% filter(date >= ymd("2012-07-01"))

# Conjunto de entrenamiento#

train_tbl

#Pasos de preprocesamiento de datos#

recipe_spec_final <- recipe_spec_timeseries %>%
    step_rm(date) %>%
    step_rm(contains("iso"), contains("minute"), contains("hour"),
            contains("am.pm"), contains("xts")) %>%
    step_normalize(contains("index.num"), date_year) %>%
    step_dummy(contains("lbl"), one_hot = TRUE) %>%
    step_ns(date_index.num, deg_free = 3)

bake(prep(recipe_spec_final), new_data = train_tbl)


#Agregando especificaciones de series de tiempo#

recipe_spec_timeseries <- recipe(value ~ ., data = train_tbl) %>%
    step_timeseries_signature(date) 
    
#especificación del modelo (lineal)#

model_spec_glmnet <- linear_reg(mode = "regression", penalty = 0.001, mixture = 0.5) %>%
    set_engine("glmnet")
    
# Generando flujo de trabajo#

workflow_glmnet <- workflow() %>%
    add_recipe(recipe_spec_final) %>%
    add_model(model_spec_glmnet)

workflow_glmnet

# Entrenamiento función fit() #

workflow_trained <- workflow_glmnet %>% fit(data = train_tbl)

#Prueba (validación)#

prediction_tbl <- workflow_trained %>% 
    predict(test_tbl) %>%
    bind_cols(test_tbl) 

prediction_tbl

#Visualización de resultados con ggplot#

ggplot(aes(x = date), data = bikes_tbl) +
    geom_rect(xmin = as.numeric(ymd("2012-07-01")),
              xmax = as.numeric(ymd("2013-01-01")),
              ymin = 0, ymax = 10000,
              fill = palette_light()[[4]], alpha = 0.01) +
    annotate("text", x = ymd("2011-10-01"), y = 7800,
             color = palette_light()[[1]], label = "Región de entrenamiento") +
    annotate("text", x = ymd("2012-10-01"), y = 1550,
             color = palette_light()[[1]], label = "Región de Prueba") + 
    geom_point(aes(x = date, y = value),  
               alpha = 0.5, color = palette_light()[[1]]) +
    # Add predictions
    geom_point(aes(x = date, y = .pred), data = prediction_tbl, 
               alpha = 0.5, color = palette_light()[[2]]) +
    theme_tq() 
    
# Errores#

prediction_tbl %>% metrics(value, .pred)    

#Gráfica de residuos#

prediction_tbl %>%
    ggplot(aes(x = date, y = value - .pred)) +
    geom_hline(yintercept = 0, color = "red") +
    geom_point(color = palette_light()[[1]], alpha = 0.5) +
    geom_smooth() +
    theme_tq() +
    labs(title = "Test Set: GLM Residuos del Modelo", x = "") +
    scale_y_continuous(limits = c(-5000, 5000))
    
#Extraer índice#
idx <- bikes_tbl %>% tk_index()

#Resumen descriptivo#
bikes_summary <- idx %>% tk_get_timeseries_summary()

#Parámetros#
bikes_summary[1:6]

#Resumen#

bikes_summary[7:12]

#Aplicación#
idx_future <- idx %>% tk_make_future_timeseries(n_future = 180)

future_tbl <- tibble(date = idx_future) 

future_tbl

#Reentrenamiento#
future_predictions_tbl <- workflow_glmnet %>% 
    fit(data = bikes_tbl) %>%
    predict(future_tbl) %>%
    bind_cols(future_tbl)
    
#Gráfica del modelo#

bikes_tbl %>%
    ggplot(aes(x = date, y = value)) +
    geom_rect(xmin = as.numeric(ymd("2012-07-01")),
              xmax = as.numeric(ymd("2013-01-01")),
              ymin = 0, ymax = 10000,
              fill = palette_light()[[4]], alpha = 0.01) +
    geom_rect(xmin = as.numeric(ymd("2013-01-01")),
              xmax = as.numeric(ymd("2013-07-01")),
              ymin = 0, ymax = 10000,
              fill = palette_light()[[3]], alpha = 0.01) +
    annotate("text", x = ymd("2011-10-01"), y = 7800,
             color = palette_light()[[1]], label = "Región entrenamiento") +
    annotate("text", x = ymd("2012-10-01"), y = 1550,
             color = palette_light()[[1]], label = "Región Prueba") +
    annotate("text", x = ymd("2013-4-01"), y = 1550,
             color = palette_light()[[1]], label = "Región Pronóstico") +
    geom_point(alpha = 0.5, color = palette_light()[[1]]) +
    # future data
    geom_point(aes(x = date, y = .pred), data = future_predictions_tbl,
               alpha = 0.5, color = palette_light()[[2]]) +
    geom_smooth(aes(x = date, y = .pred), data = future_predictions_tbl,
                method = 'loess') + 
    labs(title = "Bikes Sharing Dataset: 6 meses de pronóstico", x = "") +
    theme_tq()

# Cálculo de la desviación estándard de los residuos#

test_resid_sd <- prediction_tbl %>%
    summarize(stdev = sd(value - .pred))

future_predictions_tbl <- future_predictions_tbl %>%
    mutate(
        lo.95 = .pred - 1.96 * test_resid_sd$stdev,
        lo.80 = .pred - 1.28 * test_resid_sd$stdev,
        hi.80 = .pred + 1.28 * test_resid_sd$stdev,
        hi.95 = .pred + 1.96 * test_resid_sd$stdev
    )
    
# Pronósticos con intérvalos de predicción#

bikes_tbl %>%
    ggplot(aes(x = date, y = value)) +
    geom_point(alpha = 0.5, color = palette_light()[[1]]) +
    geom_ribbon(aes(y = .pred, ymin = lo.95, ymax = hi.95), 
                data = future_predictions_tbl, 
                fill = "#D5DBFF", color = NA, size = 0) +
    geom_ribbon(aes(y = .pred, ymin = lo.80, ymax = hi.80, fill = key), 
                data = future_predictions_tbl,
                fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
    geom_point(aes(x = date, y = .pred), data = future_predictions_tbl,
               alpha = 0.5, color = palette_light()[[2]]) +
    geom_smooth(aes(x = date, y = .pred), data = future_predictions_tbl,
                method = 'loess', color = "white") + 
    labs(title = "Bikes Sharing Dataset: 6 meses de pronósticos con intérvalos de predicción", x = "") +
    theme_tq()