# Setup the script

library(tidyverse)
library(eurostat)


# Employment rate by educational attainment levels ----

## Countries: **lfsa_ergaed** Employment rates by sex, age and educational attainment level ----
data_a <-
  get_eurostat(id = "lfsa_ergaed", # the relevant id to be downloaded
               time_format = "num") # retrieve time column as.numeric right away (bc we only have yearly data)

data_a <- label_eurostat(data_a,
                           code = c("geo",
                                    "isced11",
                                    "sex"), # we want to keep the "geo" plus relevant identifier items to be able to merge data easily
                           lang = "en") # English is the default, line included for transparency

data_a <- data_a %>%
  filter(age == "From 15 to 64 years",
         # filter for appropriate values --> if more variables (for example sex or age) are needed, make changes here!
         isced11_code %in% c("ED0-2", "ED3_4", "ED5-8")) %>%
  mutate(
    variab = "emp", # Add variable identifier
    isced11_code = str_replace(isced11_code, "-", "_"),
    # R doesn't like '-'(minus) in column names, change that to '_'
    key = paste(variab, isced11_code, sex_code, sep = "_")
  ) %>% # Merge sex and isced to get different data vars
  select(-c(unit, sex, isced11, sex_code, isced11_code, age, geo)) %>% # remove unnecessary rows (which would be in the way of spread)
  spread(key = key, value = values) %>% # spread out the column whose individual levels are needed as columns
  rename_all(tolower) # all column names in lowercase


## Belgium: **lfst_r_lfe2emprtn** Employment rates by sex, age, educational attainment level, citizenship and NUTS 2 regions ----
data_b <-
  get_eurostat(id = "lfst_r_lfe2emprtn", # the relevant id to be downloaded
               time_format = "num") # retrieve time column as.numeric right away (bc we only have yearly data)


data_b <- label_eurostat(
  data_b,
  code = c("geo",
           "isced11",
           "sex"),
  # we want to keep the "geo" and "na_item" column to be able to merge data easily
  lang = "en",
  # English is the default, line included for transparency
  fix_duplicated = TRUE
)

data_b <- data_b %>%
  filter(
    geo_code %in% c("BE1", "BE2", "BE3"),
    citizen == "Total", # geo_code and citizen are the only two differences in Belgium compared to Country level for the filter
    age == "From 15 to 64 years",
    # filter for appropriate values --> if more variables (for example sex or age) are needed, make changes here!
    isced11_code %in% c("ED0-2", "ED3_4", "ED5-8")
  ) %>%
  mutate(
    variab = "emp", # Add variable identifier
    isced11_code = str_replace(isced11_code, "-", "_"),
    # R doesn't like '-'(minus) in column names, change that to '_'
    key = paste(variab, isced11_code, sex_code, sep = "_")
  ) %>% # Merge sex and isced to get different data vars
  select(-c(unit, sex, isced11, sex_code, isced11_code, age, geo, citizen)) %>% # remove unnecessary rows (which would be in the way of spread) + Do not forget also to unfilter citizen (extra variable in the Belgium data)
  spread(key = key, value = values) %>% # spread out the column whose individual levels are needed as columns
  rename_all(tolower) # all column names in lowercase

## Rbind a and b ----
empl_educ <- bind_rows(data_a, data_b)


# A. Employment rate by sex and age ----

## Countries: **lfsa_ergaed** Employment rates by sex, age and educational attainment level ---- 

## Belgium: **lfst_r_lfe2emprtn** Employment rates by sex, age, educational attainment level, citizenship and NUTS 2 regions ----

## Rbind a and b: empl_educ ----


# B. Unemployment rates ----

## Unemployment rates by sex, age and citizenship (%) [lfsa_urgan] ----

## Unemployment rates by sex, age, ((, country of birth)) and NUTS 2 regions (%) [lfst_r_lfu3rt AND lfst_r_lfur2gac] ----

#' There is only one single data management task for you here: 
#' * Somewhat randomly, Unemployment data for Youth and working age population exist 
#' but in different datasets:
#'   + lfst_r_lfur2gac has 15-64
#*   + lfst_r_lfu3rt has 15-24
#' * Accordingly, we load and manipulate both data seperately and merge them


## Rbind a and b: unempl ----


# C. Nominal GDP per capita ----

## Gross domestic product at market prices tec00001 ----

## Gross domestic product (GDP) at current market prices by NUTS 2 regions [nama_10r_2gdp] ----

## Rbind a and b gdp.pc


# D. Investment Rate ----

## Annual enterprise statistics for special aggregates of activities (NACE Rev. 2) [sbs_na_sca_r2] ----
#' * Only Data for country level
#' * getting data for the variables:
#'   + Investment rate (investment/value added at factors cost) - percentage
#'   + Investment per person employed - milliers deuros

## Only data for country level available

## Rbind a and b: Just save data_a as inv ---- 


# Difficult: Employment Growth  ----

## Employment and activity by sex and age  - annual data  [lfsi_emp_a] ----

## Employment by sex, age and NUTS 2 regions (1 000) [lfst_r_lfe2emp] ----

## Rbind a and b:empl_growth


# F. Merge all df together and save as csv ----
df_final <-
  Reduce(
    function(x, y)
      full_join(x, y, by = c("geo_code", "time")),
    list(empl_educ, unempl, gdp_gr, empl_growth)
  )

df_final <- df_final %>%
  group_by(geo_code, time) %>%
  rename_all(tolower)

write_excel_csv2(df_final, "data_uptodate.csv")