# Data management: Eurostat Example


# Setup----
# Run this script in a fresh R session (e.g. "Session > Restart R" in RStudio).
# The workspace is deliberately not cleared with rm(list = ls()): that call
# deletes the user's objects but leaves attached packages and options in
# place, so it does not actually provide a clean state.
library(tidyverse)


# Read and explore the data ----

#' Read the data (eurostat_data.csv) and 
#' store them as eurost.



#' We only want to use the year (denoted by the variable 
#' time in the dataset) 2014 (time==2014). 
#' Please, filter accordingly.

# Import the Eurostat file with read_csv2() and keep only the rows whose
# `time` equals 2014; the result is stored as `eurost`.
eurost <- "data/eurostat_data.csv" %>%
  read_csv2() %>%
  filter(time == 2014)



# Reproduce Plots ----


#' Reproduce these three plots. 
#' Below the graphs you can find some information.



## geom_point----

# Scatter plot of GDP growth against youth unemployment.
# x, y and color are set in ggplot() and therefore shared by all layers;
# size is mapped in geom_point() only, so it does not reach geom_smooth().
ggplot(
  eurost,
  aes(unemp_youth_t, gdp_gr, color = emigration_t / immigration_t)
) +
  geom_point(mapping = aes(size = inv_per_empl)) +
  # Linear fit without confidence band, drawn on top of the points.
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  theme_classic() +
  labs(
    title = "GDP growth and youth unemployment in 2014",
    subtitle = "Correlation between lower growth rate and higher youth unemployment",
    caption = "Source: Eurostat",
    x = "Share of Unemployed Youth (15-24) in Pct.",
    y = "Real GDP growth rate (YOY)",
    color = "Ratio of Emigration \n to Immigration",
    size = "Investment p. person\n employed (in Mill. €)"
  )


#' * x = unemp_youth_t
#' * y = gdp_gr
#' * color = emigration_t / immigration_t
#' * size = inv_per_empl
#' * Use theme_classic()
#' * You might realize that the label symbols of size are a bit 
#'    weird as soon as you add the regression line. This is because 
#'    the function to create the regression line interacts 
#'    with the size argument within the ggplot() function. 
#'    Therefore, as soon as you add the regression function, 
#'    move the size argument to geom_point. 
#'    But don't forget that the size argument has to be put within aes()!
#'   + Here is an important learning: aes() can be passed to either ggplot()
#'      or to a specific layer (e.g. geom_). Aesthetics specified to ggplot()
#'      are used as defaults for every layer, while aes() passed to a 
#'      specific layer are used for that layer only. 
#' * Furthermore, add this line 
#'    `+ scale_color_manual(values = c("red"), labels = c(" "))`. 
#'      Do you understand what it does?
#' 



## geom_col ----

# Bar chart of youth unemployment per country with total unemployment
# overlaid as points.
ggplot(eurost, aes(geo_code, unemp_youth_t, fill = unemp_youth_t)) +
  geom_col(width = 0.7) +
  # "red" is mapped inside aes() as a constant, which gives the points their
  # own color legend entry; the manual scale below paints that entry red and
  # replaces its label with a blank.
  geom_point(aes(y = unemp_workagepop_t, color = "red"), size = 3) +
  scale_color_manual(values = "red", labels = " ") +
  ggthemes::theme_economist() +
  labs(
    title = "Unemployment levels of youth and total working age population",
    subtitle = "In most European countries youth unemployment is almost twice as big",
    caption = "Source: Eurostat, Data from 2014",
    x = "Countries",
    y = "Unemployment (in Pct.)",
    color = "Total unemployment",
    fill = "Youth unemployment"
  )

#' * use theme_economist()
#' * x = geo_code
#' * y = unemp_youth_t
#' * fill = unemp_youth_t
#' * Play a bit with the width of the bars.
#' * Crucially, we add points which relate to unemp_workagepop_t. 
#'    You need to write a new aes() in the point function and 
#'    assign the color "red". Outside of the aes() argument, 
#'    adjust the size of the points to 3. 
#' 
#' * Rename the axes accordingly. Note that although 
#'   the y axis is actually just related to youth unemployment, 
#'   we simply rename it as Unemployment because the total 
#'   unemployment (i.e. the points) is on the same scale. 



## geom_violin ----


#' Here, we first do some data management. 

# Second data set: five selected countries, years 1990 to 2015.
# The two *_tod columns carry the unemployment values for 2015 only and are
# NA for every other year, so they can be plotted as single points later.
eurost2 <- "data/eurostat_data.csv" %>%
  read_csv2() %>%
  filter(
    geo_code %in% c("DE", "IT", "EL", "ES", "UK"),
    time >= 1990 & time <= 2015
  ) %>%
  mutate(
    unemp_tod = if_else(time == 2015, unemp_workagepop_t, NA_real_),
    unemp_youth_tod = if_else(time == 2015, unemp_youth_t, NA_real_)
  )


# Violin plot of youth and total unemployment per country over all years in
# eurost2, with the 2015 values overlaid as black points.
ggplot(data = eurost2,
       mapping = aes(x = geo_code)) +
  # The fill strings are constants mapped inside aes() so that a legend is
  # created; they act as category keys, not as literal colours.
  geom_violin(mapping = aes(y = unemp_youth_t, fill = "red"),
              alpha = 0.5) +
  geom_violin(mapping = aes(y = unemp_workagepop_t, fill = "blue"),
              alpha = 0.5) +
  # unemp_tod and unemp_youth_tod are NA for every year except 2015, so
  # na.rm = TRUE drops those rows without a "removed rows" warning.
  geom_point(aes(y = unemp_tod), color = "black", size = 3, na.rm = TRUE) +
  geom_point(aes(y = unemp_youth_tod), color = "black", size = 3,
             na.rm = TRUE) +
  theme_minimal() +
  labs(
    title = "Unemployment levels of youth and total working age population \n (Histogram of values between 1990 and 2015)",
    subtitle = "Germany is the only country (in comparison) where youth unemployment and \n total unemployment have moved within the same corridor historically",
    x = "Countries",
    y = "Unemployment (in Pct.)",
    fill = "Unemployment \n (2015 as point)",
    caption = "Source: Eurostat"
  ) +
  # Unnamed values/labels are matched to the fill keys in alphabetical order:
  # "blue" comes first, so the total-unemployment violin is drawn in red and
  # labelled "Total Unemployment"; the key "red" (youth) is drawn in blue and
  # labelled "Youth Unemployment".
  scale_fill_manual(
    values = c("red", "blue"),
    labels = c("Total Unemployment", "Youth Unemployment")
  )

#' * As aesthetic in the main `ggplot()` function, 
#' just use x = geo_code
#' * What we are doing then is using two separate `geom_violin` 
#'   and two separate `geom_point` functions
#'   + Both violin functions use the argument `alpha = 0.5` 
#'     to increase the transparency of the violin plots
#'   + One violin function has `y = unemp_youth_t` and 
#'     the other `y = unemp_workagepop_t`. Use the appropriate colors
#'   + One point function has `y = unemp_tod` and the other 
#'     `y = unemp_youth_tod`. Use `color = "black", size = 3` 
#'      outside of the aesthetic. 
#' * Use `theme_minimal()`
#' * Add this line: 
#'   `scale_fill_manual(values = c("red", "blue"), labels = c("Total Unemployment", "Youth Unemployment"))`



# New Stuff ----

#' All the exercises will use the first plot. 
#' To make our lives easier, we save this plot as main_plot
# Store the GDP-growth vs. youth-unemployment scatter plot so the following
# sections can extend it with `+` instead of rebuilding it.
main_plot <- ggplot(
  eurost,
  aes(unemp_youth_t, gdp_gr, color = emigration_t / immigration_t)
) +
  # size is mapped here rather than in ggplot() so that it applies to the
  # points only and is not inherited by the regression line.
  geom_point(mapping = aes(size = inv_per_empl)) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  theme_classic() +
  labs(
    title = "GDP growth and youth unemployment in 2014",
    subtitle = "Correlation between lower growth rate and higher youth unemployment",
    caption = "Source: Eurostat",
    x = "Share of Unemployed Youth (15-24) in Pct.",
    y = "Real GDP growth rate (YOY)",
    color = "Ratio of Emigration \n to Immigration",
    size = "Investment p. person\n employed (in Mill. €)"
  )


# Display the stored plot.
main_plot



## Adjusting colors ----

# Diverging color scale centred on a ratio of 1: values below the midpoint
# shade towards blue, values above it towards red.
main_plot +
  scale_color_gradient2(
    low = "blue",
    mid = "lightgrey",
    high = "red",
    midpoint = 1,
    space = "Lab"
  )

#' * Remember, we just saved the first plot as main_plot. 
#'   So you do not need to rewrite everything from this plot but 
#'   only add the new scale to main_plot with `+`.
#' * In this case, use the function `scale_color_gradient2()` 
#'   to get the colors. 
#'   + Use `?scale_color_gradient2()` to understand the 
#'     arguments you need to use to replicate the plot



## Adding labels to points ----

# Label every point with its country code; ggrepel is called via `::`, so the
# package does not need to be attached.
main_plot +
  ggrepel::geom_text_repel(aes(label = geo_code))

#' * Again, use main_plot as the base
#' * The function from ggrepel we want to use is `geom_text_repel`.
#'   Use geo_code as the label within the `aes()` argument of `geom_text_repel`.



## Facets and more ----

# One panel per value of `location`; the labs() call replaces the caption
# that main_plot already carries.
main_plot +
  labs(caption = "Source: Eurostat + location is manually defined by the site's creator") +
  facet_wrap(facets = ~location)

#' * You can add facets (i.e. the same plotting relationship 
#'  shown in several panels, one per value of a variable such as 
#'  the year) with the function 
#'  `+ facet_wrap(~FACETS_VARIABLE_NAME)`. Make facets using 
#'  our main graph, using location as the facet variable. 
#' * Use location as the facet variable.



# Interactive Graph ----
 
#' With the plotly package, we can actually build interactive graphs.
#' The easiest way is to simply use the `ggplotly()` function and 
#' pass a ggplot object to the `p` argument:

# Attach plotly for ggplotly().
library(plotly)

# Hand the stored ggplot object to ggplotly(); the pipe supplies it as the
# first argument, `p`.
main_plot %>%
  ggplotly()