# Setup the script

library(tidyverse)
library(eurostat)


# Employment rate by educational attainment levels ----

## Countries: **lfsa_ergaed** Employment rates by sex, age and educational attainment level ----
data_a <-
  get_eurostat(id = "lfsa_ergaed", # the relevant id to be downloaded
               time_format = "num") # retrieve time column as.numeric right away (bc we only have yearly data)

data_a <- label_eurostat(data_a,
                           code = c("geo",
                                    "isced11",
                                    "sex"), # we want to keep the "geo" plus relevant identifier items to be able to merge data easily
                           lang = "en") # English is the default, line included for transparency

data_a <- data_a %>%
  filter(age == "From 15 to 64 years",
         # filter for appropriate values --> if more variables (for example sex or age) are needed, make changes here!
         isced11_code %in% c("ED0-2", "ED3_4", "ED5-8")) %>%
  mutate(
    variab = "emp", # Add variable identifier
    isced11_code = str_replace(isced11_code, "-", "_"),
    # R doesn't like '-'(minus) in column names, change that to '_'
    key = paste(variab, isced11_code, sex_code, sep = "_")
  ) %>% # Merge sex and isced to get different data vars
  select(-c(unit, sex, isced11, sex_code, isced11_code, age, geo)) %>% # remove unnecessary rows (which would be in the way of spread)
  spread(key = key, value = values) %>% # spread out the column whose individual levels are needed as columns
  rename_all(tolower) # all column names in lowercase



## Belgium: **lfst_r_lfe2emprtn** Employment rates by sex, age, educational attainment level, citizenship and NUTS 2 regions ----
data_b <-
  get_eurostat(id = "lfst_r_lfe2emprtn", # the relevant id to be downloaded
               time_format = "num") # retrieve time column as.numeric right away (bc we only have yearly data)


data_b <- label_eurostat(
  data_b,
  code = c("geo",
           "isced11",
           "sex"),
  # we want to keep the "geo" and "na_item" column to be able to merge data easily
  lang = "en",
  # English is the default, line included for transparency
  fix_duplicated = TRUE
)

data_b <- data_b %>%
  filter(
    geo_code %in% c("BE1", "BE2", "BE3"),
    citizen == "Total", # geo_code and citizen are the only two differences in Belgium compared to Country level for the filter
    age == "From 15 to 64 years",
    # filter for appropriate values --> if more variables (for example sex or age) are needed, make changes here!
    isced11_code %in% c("ED0-2", "ED3_4", "ED5-8")
  ) %>%
  mutate(
    variab = "emp", # Add variable identifier
    isced11_code = str_replace(isced11_code, "-", "_"),
    # R doesn't like '-'(minus) in column names, change that to '_'
    key = paste(variab, isced11_code, sex_code, sep = "_")
  ) %>% # Merge sex and isced to get different data vars
  select(-c(unit, sex, isced11, sex_code, isced11_code, age, geo, citizen)) %>% # remove unnecessary rows (which would be in the way of spread) + Do not forget also to unfilter citizen (extra variable in the Belgium data)
  spread(key = key, value = values) %>% # spread out the column whose individual levels are needed as columns
  rename_all(tolower) # all column names in lowercase

## Rbind a and b ----
empl_educ <- bind_rows(data_a, data_b)


# A. Employment rate by sex and age ----

## Countries: **lfsa_ergaed** Employment rates by sex, age and educational attainment level ----

data_a <-
  get_eurostat(id = "lfsa_ergaed",
               time_format = "num")

data_a <- label_eurostat(data_a,
                         code = c("geo",
                                  "sex",
                                  "age"))

data_a <- data_a %>%
  filter(age %in% c("From 15 to 64 years", "From 15 to 24 years"),
         isced11 == "All ISCED 2011 levels") %>%
  mutate(
    variab = "emp",
    variab2 = ifelse(age_code == "Y15-64", "workagepop", "youth"),
    key = paste(variab, variab2, sex_code, sep = "_")
  ) %>%
  select(geo_code, key, time, values) %>%
  spread(key = key, value = values) %>%
  rename_all(tolower)

## Belgium: **lfst_r_lfe2emprtn** Employment rates by sex, age, educational attainment level, citizenship and NUTS 2 regions----
data_b <-
  get_eurostat(id = "lfst_r_lfe2emprtn",
               time_format = "num")

data_b <- label_eurostat(data_b,
                         code = c("geo",
                                  "sex",
                                  "age"),
                         fix_duplicated = TRUE) 

data_b <- data_b %>%
  filter(
    geo_code %in% c("BE1", "BE2", "BE3"),
    age %in% c("From 15 to 64 years", "From 15 to 24 years"), # only data on working age population available
    citizen == "Total",
    isced11 == "All ISCED 2011 levels"
  ) %>%
  mutate(
    variab = "emp",
    variab2 = ifelse(age_code == "Y15-64", "workagepop", "youth"),
    key = paste(variab, variab2, sex_code, sep = "_")
  ) %>%
  select(geo_code, key, time, values) %>%
  spread(key = key, value = values) %>%
  rename_all(tolower) %>%
  mutate(emp_youth_f = NA,
         emp_youth_m = NA,
         emp_youth_t = NA)


## Rbind a and b: empl_educ ----
empl_educ <- rbind(data_a, data_b)


# B. Unemployment rates ----

## Unemployment rates by sex, age and citizenship (%) [lfsa_urgan] ----
data_a <-
  get_eurostat(id = "lfsa_urgan",
               time_format = "num")

data_a <- label_eurostat(data_a,
                         code = c("geo",
                                  "sex",
                                  "age"))

data_a <- data_a %>%
  filter(age %in% c("From 15 to 64 years", "From 15 to 24 years"),
         citizen == "Total") %>%
  mutate(
    variab = "unemp",
    variab2 = ifelse(age_code == "Y15-64", "workagepop", "youth"),
    key = paste(variab, variab2, sex_code, sep = "_")
  ) %>%
  select(geo_code, key, time, values) %>%
  spread(key = key, value = values) %>%
  rename_all(tolower)


## Unemployment rates by sex, age, ((, country of birth)) and NUTS 2 regions (%) [lfst_r_lfu3rt AND lfst_r_lfur2gac]----

data_b1 <-
  get_eurostat(id = "lfst_r_lfur2gac",
               time_format = "num")

data_b1 <- label_eurostat(data_b1,
                          code = c("geo",
                                   "sex",
                                   "age"),
                          fix_duplicated = TRUE) 

data_b1 <- data_b1 %>%
  filter(
    geo_code %in% c("BE1", "BE2", "BE3"),
    age %in% c("From 15 to 64 years", "From 15 to 24 years"),
    c_birth == "Total"
  ) %>% # lfst_r_lfur2gac has 15-64
  mutate(
    variab = "unemp",
    variab2 = ifelse(age_code == "Y15-64", "workagepop", "youth"),
    key = paste(variab, variab2, sex_code, sep = "_")
  ) %>%
  select(geo_code, key, time, values) %>%
  spread(key = key, value = values) %>%
  rename_all(tolower)


# lfst_r_lfu3rt

data_b2 <-
  get_eurostat(id = "lfst_r_lfu3rt",
               time_format = "num")

data_b2 <- label_eurostat(data_b2,
                          code = c("geo",
                                   "sex",
                                   "age"),
                          fix_duplicated = TRUE) 

data_b2 <- data_b2 %>%
  filter(
    geo_code %in% c("BE1", "BE2", "BE3"),
    age %in% c("From 15 to 64 years", "From 15 to 24 years")) %>% # lfst_r_lfur2gac has 15-64
  mutate(
    variab = "unemp",
    variab2 = ifelse(age_code == "Y15-64", "workagepop", "youth"),
    key = paste(variab, variab2, sex_code, sep = "_")
  ) %>% 
  select(geo_code, key, time, values) %>% 
  spread(key = key, value = values) %>% 
  rename_all(tolower)

# Merge youth and working age pop
data_b <- merge(data_b1, data_b2, by=c("geo_code", "time"))

## Rbind a and b----
unempl <- rbind(data_a, data_b)

# C. Nominal GDP per capita ----

## Gross domestic product at market prices tec00001----
data_a <-
  get_eurostat(id = "tec00001",
               time_format = "num")

data_a <- label_eurostat(data_a,
                         code = c("geo"))

data_a <- data_a %>%
  filter(unit=="Current prices, euro per capita") %>%
  mutate(gdp_pc = values) %>%
  select(geo_code, time, gdp_pc) 


## Gross domestic product (GDP) at current market prices by NUTS 2 regions [nama_10r_2gdp]----
data_b <-
  get_eurostat(id = "nama_10r_2gdp",
               time_format = "num")

data_b <- label_eurostat(data_b,
                         code = c("geo"), 
                         fix_duplicated = TRUE)

data_b <- data_b %>%
  filter(unit == "Euro per inhabitant",
         geo_code %in% c("BE1", "BE2", "BE3")) %>%
  mutate(gdp_pc = values) %>%
  select(geo_code, time, gdp_pc)

## Rbind a and b----
gdp_pc <- rbind(data_a, data_b)

# D. Investment Rate ----

## Annual enterprise statistics for special aggregates of activities (NACE Rev. 2) [sbs_na_sca_r2] ----
data_a <-
  get_eurostat(id = "sbs_na_sca_r2",
               time_format = "num")

data_a <- label_eurostat(data_a,
                         code = c("geo"),
                         fix_duplicated = TRUE)

data_a <- data_a %>%
  filter(
    nace_r2 ==
      "Total business economy; repair of computers, personal and household goods; except financial and insurance activities",
    indic_sb %in% c(
      "Investment rate (investment/value added at factors cost) - percentage",
      "Investment per person employed - milliers d'euros"
    )
  ) %>%
  mutate(
    key = ifelse(
      indic_sb == "Investment per person employed - milliers d'euros",
      "inv_per_empl",
      "inv_total"
    )
  ) %>%
  select(-c(nace_r2, indic_sb, geo)) %>%
  spread(key = key, value = values) %>%
  select(geo_code, time, inv_per_empl, inv_total) 

## Rbind a and b----
inv <- data_a


# E. Difficult: Employment Growth ----

data_a <-
  get_eurostat(id = "lfsi_emp_a",
               time_format = "num")

data_a <- label_eurostat(data_a,
                         code = c("geo",
                                  "sex",
                                  "age"))

data_a <- data_a %>%
  filter(
    age %in% c("From 15 to 64 years", "From 15 to 24 years"),
    indic_em == "Total employment (resident population concept - LFS)",
    unit == "Thousand persons"
  ) %>%
  mutate(
    values = values * 1000,
    # Working Population values are in 1000
    variab = "workingpop",
    variab2 = ifelse(age_code == "Y15-64", "workagepop", "youth"),
    key = paste(variab, variab2, sex_code, sep = "_")
  ) %>%
  select(geo_code, key, time, values) %>%
  group_by(geo_code, key) %>%
  mutate(lag_value = lag(values),
         pct_change = (values-lag_value)/lag_value) %>%
  select(geo_code, time, key, pct_change) %>%
  spread(key=key, value=pct_change) %>%
  rename_all(tolower)



## Employment by sex, age and NUTS 2 regions (1 000) [lfst_r_lfe2emp]----

data_b <-
  get_eurostat(id = "lfst_r_lfe2emp",
               time_format = "num")

data_b <- label_eurostat(data_b,
                         code = c("geo",
                                  "sex",
                                  "age"),
                         fix_duplicated = TRUE)

data_b <- data_b %>%
  filter(
    geo_code %in% c("BE1", "BE2", "BE3"),
    age %in% c("From 15 to 64 years", "From 15 to 24 years")
  ) %>%
  mutate(
    values = values * 1000,
    # Working Population values are in 1000
    variab = "workingpop",
    variab2 = ifelse(age_code == "Y15-64", "workagepop", "youth"),
    key = paste(variab, variab2, sex_code, sep = "_")
  ) %>%
  select(geo_code, key, time, values) %>%
  group_by(geo_code, key) %>%
  mutate(lag_value = lag(values),
         pct_change = (values-lag_value)/lag_value) %>%
  select(geo_code, time, key, pct_change) %>%
  spread(key=key, value=pct_change) %>%
  rename_all(tolower)

## Rbind a and b----
empl_growth <- rbind(data_a, data_b)

# F. Merge all df together and save as csv ----
df_final <-
  Reduce(
    function(x, y)
      full_join(x, y, by = c("geo_code", "time")),
    list(empl_educ, unempl, gdp_gr, empl_growth)
  )

df_final <- df_final %>%
  group_by(geo_code, time) %>%
  rename_all(tolower)

write_excel_csv2(df_final, "data_uptodate.csv")