# Packages and script set-up ---- 

rm(list = ls())
# install.packages("tidyverse")
library(tidyverse)

# Loading the dataset ----

## .csv ----
 
#'Complete the empty spaces! (soep_europ.csv)
data <- read.csv("data/soep_europ.csv") # Looks weird....

#' To read the data correctly, we have to add the argument "sep" to the function.
data <- read.csv("data/soep_europ.csv", sep=";") #

 
#' 2 Addenda: 

#'1. Try out to to read a US style csv with soep_us.csv**

data <- read.csv("data/soep_us.csv")
  
#' 2. An easier way to read European style csv files is to use read_csv2() from the tidyverse. 
#'   (Accordingly, read_csv() is for US style csv files)

data <- read_csv2("data/soep_europ.csv")



## .dta / .sav / excel ----

data <- haven::read_dta("data/soep.dta")

data <- haven::read_sav("data/soep.sav")

data <- readxl::read_xlsx("data/soep.xlsx")

data <- read_csv2("data/soep_europ.csv")


# Basic Data Management ----

## The pipe %>% ----

mean(c(1:5))
# can be expressed with the pipe as
c(1:5) %>%
  mean()

## Select the relevant variables ----

#' Accordingly, we `select` the relevant ones:
#' hhnr, persnr, gebjahr, sex, tp72 (Work Overtime), tp7001 (Contracted Working Hours), 
#' tp7003 (Hours Per Week Actual), tp7602 (Net Income Last Month)
#' 

data <- data %>% # remember the pipe?
  select(hhnr, persnr, gebjahr, sex, tp72, tp7001, 
         tp7003 , tp0301, tp0302, tp7602) # fill in the missing names

 
## Explore the data structure ----

#' For a first overview over our data, we can use the base functions 
#' (e.g. those which are without any package installations part of R) 
#' str(), head(), summary(), table(), quantile(), and View().

str(data)
head(data)
summary(data)
table(data$tp72) # you could use any other variable here.
quantile(data$tp72, na.rm = TRUE)
View(data)

## Basic recoding and mutate() ----

#' The dataset is from 2003. Compute the age accordingly by creating the new variable age
data$age <- 2003 - data$gebjahr
 
#' There is also a dplyr way to do this: mutate(). 

data <- data %>%
   mutate(age = 2003 - gebjahr)

## Filtering ----

#' Let's say, we only want to keep those observations which are at least 18.
data <- data %>%
  filter(age >= 18)

## Arrange ----
#' Sometimes, we want to order data. 
#' We do so with arrange(). desc(x) sorts x in descending order. 

data <- data %>%
  arrange(desc(age))
# 

## All in one ----
#' By the way, we could have done all of this in one batch:

data <- read_csv2("data/soep_europ.csv") # read data again.

data <- data %>%
  select(hhnr, persnr, gebjahr, sex, tp72, tp7001, tp7003 , tp0301, tp0302, tp7602) %>%
  mutate(age = 2003 - gebjahr) %>%
  filter(age >= 18) %>%
  arrange(desc(age))




# Advanced stuff ----

## More mutate ----

#' The benefit of mutate() is that you can easily apply more than one 
#' operation (on the same variable) at once in a tidy way. For example, let's say we 
#' * want to recode the binary variable tp72 (Work Overtime) into three levels: 
#'   + 10 Yes
#'   + 20 No
#'   + 0 Everything else
#' * Apply those changes to tp72 by creating the new variable over
#' * Do something similar with tp7602 (recreate as netinc, change specific values to NA)
 

data <- data %>%
  mutate(
    over = recode(
      tp72,
      "3" = 0, #actually means Does not apply: Self-Employment
      "-2" = 0, # actually means does not apply
      "-1" = 0, # actually means no answer
      .default = tp72
    ),
    over = over * 10,
    netinc = recode(
      tp7602,
      "-3" = NA_real_,
      "-2" = NA_real_,
      "-1" = NA_real_,
      .default = tp7602
    )
  )


 
#' Let's check whether the data manipulation worked 

table(data$over)

table(data$over,
       useNA = "always") # we add the argument useNA in order to also get the number of NA values.


## Factorize ----

#' As we have practiced before, we can factorize variables. 
#' This is also possible within the dplyr world (mutate() again). Let's do that for sex.

data <- data  %>%
  mutate(sex = factor(
    sex ,
    levels = c(1, 2),
    labels = c("male", "female")
  ))

## Recode II ----
 

#' * tp7001(Contracted Working Hours): mutate to column "contract", 
#' change all values below 0 (errors) to NA, and divide the variable by 10
#' * Do the same for tp7003: Actual working hours


data <- data %>%
  mutate(
    contract =
      if_else(tp7001 < 0, NA_real_, tp7001),
    actual =
      if_else(tp7003 < 0, NA_real_, tp7003),
    contract = contract / 10,
    actual = actual / 10
  )


## Multiple if_else() statements ---- 

#' Let's say, we want to build a character variable inc.quant which indicates 
#' the quantile() (see above) of netinc. 
 
data <- data %>%
  mutate(
    inc.quant = case_when(
      netinc < quantile(netinc, na.rm = TRUE)[2] ~ "Q1",
      
      netinc >= quantile(netinc, na.rm = TRUE)[2] &
        netinc < quantile(netinc, na.rm = TRUE)[3] ~ "Q2",
      
      netinc >= quantile(netinc, na.rm = TRUE)[3] &
        netinc < quantile(netinc, na.rm = TRUE)[4] ~ "Q3",
      
      netinc >= quantile(netinc, na.rm = TRUE)[4] ~ "Q4"
    )
  )

## Check data.frame again ----
head(data)
summary(data)

## All in one II ----

#' Again, we could have done all of this in one batch - but it would have been untidy... 
data <- read_csv2("data/soep_europ.csv")

data <- data %>% # remember the pipe?
  
  select(hhnr, persnr, gebjahr, sex, tp72, tp7001, tp7003 , tp0301, tp0302, tp7602)%>%
  
  mutate(age = 2003-gebjahr) %>%
  
  filter(age>=18) %>%
  
  arrange(desc(age))%>%
  mutate(
         over = recode(
           tp72,
           "3" = 0, #actually means Does not apply: Self-Employment
           "-2" = 0, # actually means does not apply
           "-1" = 0, # actually means no answer
           .default = tp72
          ),
         over = over * 10,
         netinc = recode(tp7602,
                         "-3" = NA_real_,
                         "-2" = NA_real_,
                         "-1" = NA_real_
          ),
         
         sex = factor(
           sex ,
           levels = c(1, 2),
           labels = c("male", "female")
         ),    
         
         contract =
           if_else(tp7001 < 0, NA_real_, tp7001),
         actual =
           if_else(tp7003 < 0, NA_real_, tp7003),
         contract = contract / 10,
         actual = actual / 10,

         inc.quant = case_when(
           netinc < quantile(netinc, na.rm = TRUE)[2] ~ "Q1",
           netinc >= quantile(netinc, na.rm = TRUE)[2] &
             netinc < quantile(netinc, na.rm = TRUE)[3] ~ "Q2",
           netinc >= quantile(netinc, na.rm = TRUE)[3] &
             netinc < quantile(netinc, na.rm = TRUE)[4] ~ "Q3",
           netinc >= quantile(netinc, na.rm = TRUE)[4] ~ "Q4"
         )
  )


# (Easy) additional stuff ----

## Summarize ----

data %>%
  summarize(mean = mean(age, na.rm = TRUE),
            sd = sd(age, na.rm = TRUE))

## Grouping ----

#' The same is also possible for grouped structures. 
#' Say, for example, you would want to calculate separate values for different genders: 

data %>%
  group_by(sex) %>%
  summarise(mean = mean(age, na.rm = TRUE),
            sd = sd(age, na.rm = TRUE))


#' A bit more nuanced, let's say you want to create a variable cohort.deviance 
#' which gives you the deviation in netinc from the mean of all people of the same age


data <- data %>%
  group_by(age) %>%
  mutate(cohort.deviance = netinc - mean(netinc, na.rm = TRUE)) 

## Filtering II ----

#' Let's say, you only want to keep those observations which are older than 18 
#' and who are in the top three income quintiles. We store this in data2


data2 <- data %>%
  filter(age > 18,
         inc.quant %in% c("Q2", "Q3", "Q4"),
         sex == "male")

## Merge ----

#' We reread the SOEP data under a different name, merge the relevant 
#' variables from this data.frame with the one we have been working with all the time and 
#' create a new data.frame (data2). 

data.origin <- read_csv2("data/soep_europ.csv")

data2 <- data %>%
  left_join(data.origin[, c("hhnr", "tp0101")], by = c("hhnr"))



data <- data %>%
  left_join(data.origin[, c("hhnr", "tp0101", "persnr")], by = c("hhnr", "persnr"))

## Spread and Gather ----

#' At the moment, our data is in wide format (e.g. all the variables are in 
#' separate columns). Suppose, we would want to make the data long:


data.long <- data %>%
  gather(key = key, value = value,-c(hhnr, persnr))


#' Can you make data.long wide again?
data.wide <- data.long %>%
  spread(key = key, value = value)