#' Install needed packages
dl_repo <- "http://cran.rstudio.org"
install.packages("tidyverse", repos = dl_repo)
install.packages("googlesheets", repos = dl_repo)

#' Load packages
library(googlesheets)
library(tidyr)
library(dplyr)
library(readr)

#' Links from Gapminder data page
##' Population - https://spreadsheets.google.com/pub?key=phAwcNAVuyj0XOoBL_n5tAQ&output=xls
##' Life Expectancy - https://spreadsheets.google.com/pub?key=phAwcNAVuyj2tPLxKvvnNPA&output=xls
##' GDP per Capita - https://spreadsheets.google.com/pub?key=phAwcNAVuyj1jiMAkmq1iMg&output=xls
##' Democracy Score - https://spreadsheets.google.com/pub?key=0ArfEDsV3bBwCdGQ2YlhDSWVIdXdpMmhLY2ZZRHdNNnc&output=xls
##' Fertility Rate - https://spreadsheets.google.com/pub?key=phAwcNAVuyj0TAlJeCEzcGQ&output=xls

##' After directing to those pages you can extract the actual Google Sheets key as I did below

#' Read in raw data
pop <- gs_key("1IbDM8z5XicMIXgr93FPwjgwoTTKMuyLfzU6cQrGZzH8") %>% gs_read()
le <- gs_key("1H3nzTwbn8z4lJ5gJ_WfDgCeGEXK3PVGcNjQ_U5og8eo") %>% gs_read()
gdp <- gs_key("1PybxH399kK6OjJI4T2M33UsLqgutwj3SuYbk7Yt6sxE") %>% gs_read()
dem <- gs_key("1MgJAijU4I4WnB8JDu6uPmS9lGYaPUkCt1k-RYTZ4nSE") %>% gs_read()
fert <- gs_key("1oq3r8W7ajenKFgoAYoOf2MXeTWWNPpudR-Fo5m2-o30") %>% gs_read()

#' Different numbers of observations
nrow(dem) # 275
nrow(fert) # 260
nrow(gdp) #275
nrow(le) #260
nrow(pop) #275

#' Change names of first column to be `country`
names(dem)[1] <- "country"
names(fert)[1] <- "country"
names(gdp)[1] <- "country"
names(le)[1] <- "country"
names(pop)[1] <- "country"

#' Convert to factors to look for overlaps between datasets
dem <- dem %>% mutate(country = factor(country))
fert <- fert %>% mutate(country = factor(country))
gdp <- gdp %>% mutate(country = factor(country))
le <- le %>% mutate(country = factor(country))
pop <- pop %>% mutate(country = factor(country))

#' From Jenny Bryan (https://github.com/jennybc/gapminder/blob/master/data-raw/04_merge-pop-lifeExp-gdpPercap.R)
country_levels <- function(df) levels(df$country)
country_subs <- c("Bahamas, The" = "Bahamas",
                  "Central African Rep." = "Central African Republic",
                  "Cook Is" = "Cook Islands",
                  "Czech Rep." = "Czech Republic",
                  "Dominican Rep." = "Dominican Republic",
                  "Egypt, Arab Rep." = "Egypt",
                  "Gambia, The" = "Gambia",
                  "Iran, Islamic Rep." = "Iran",
                  "Russian Federation" = "Russia",
                  "Syrian Arab Republic" = "Syria",
                  "Venezuela, RB" = "Venezuela")
revalue_country <- function(x) plyr::revalue(x, country_subs)
dem <- dem %>%
  mutate(country = revalue_country(country))
fert <- fert %>%
  mutate(country = revalue_country(country))
pop <- pop %>%
  mutate(country = revalue_country(country))
le <- le %>%
  mutate(country = revalue_country(country))
gdp <- gdp %>%
  mutate(country = revalue_country(country))

union_country <- country_levels(dem) %>%
  union(country_levels(fert)) %>%
  union(country_levels(gdp)) %>%
  union(country_levels(le)) %>%
  union(country_levels(pop)) %>%
  sort()

c_dat <- data_frame(country = union_country,
                    dem = country %in% levels(dem$country),
                    fert = country %in% levels(fert$country),
                    pop = country %in% levels(pop$country),
                    le = country %in% levels(le$country),
                    gdp = country %in% levels(gdp$country),
                    total = dem + fert + pop + le + gdp)
c_dat %>% count(total)

#' Population only recorded every year after 1950
#' Tidy data
dem_df <- dem %>% gather(key = "year", value = "dem_score", -country) %>%
  filter(year > 1950)
fert_df <- fert %>% gather(key = "year", value = "fert_rate", -country) %>%
  filter(year > 1950)
pop_df <- pop %>% gather(key = "year", value = "population", -country) %>%
  filter(year > 1950)
le_df <- le %>% gather(key = "year", value = "life_exp", -country) %>%
  filter(year > 1950)
gdp_df <- gdp %>% gather(key = "year", value = "gdp_per_cap", -country) %>%
  filter(year > 1950)

#' Export to RDS for potential workshop work
write_rds(dem_df, "data/dem_df.RDS")
write_rds(fert_df, "data/fert_df.RDS")
write_rds(pop_df, "data/pop_df.RDS")
write_rds(le_df, "data/le_df.RDS")
write_rds(gdp_df, "data/gdp_df.RDS")

#' Join data sets
gap_df <- pop_df %>%
  inner_join(dem_df, by = c("country", "year")) %>%
  inner_join(fert_df, by = c("country", "year")) %>%
  inner_join(le_df, by = c("country", "year")) %>%
  inner_join(gdp_df, by = c("country", "year")) %>%
  droplevels() %>%
  arrange(country, year) %>%
  mutate(missing = rowSums(is.na(.))) %>%
  filter(missing < 3) %>%
  select(-missing) %>% 
  # Only one entry
  filter(country != "Kosovo")

#' Get sub-region and region
countries <- read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")
countries$name[countries$name == "Bolivia (Plurinational State of)"] <- "Bolivia"
countries$name[countries$name == "Congo"] <- "Congo, Rep."
countries$name[countries$name == "Congo (Democratic Republic of the)"] <- "Congo, Dem. Rep."
countries$name[countries$name == "Hong Kong"] <- "Hong Kong, China"
countries$name[countries$name == "Côte d'Ivoire"] <- "Cote d'Ivoire"
countries$name[countries$name == "Iran (Islamic Republic of)"] <- "Iran"
countries$name[countries$name == "Korea (Democratic People's Republic of)"] <- "Korea, Dem. Rep."
countries$name[countries$name == "Korea (Republic of)"] <- "Korea, Rep."
countries$name[countries$name == "Réunion"] <- "Reunion"
countries$name[countries$name == "Slovakia"] <- "Slovak Republic"
countries$name[countries$name == "Syrian Arab Republic"] <- "Syria"
countries$name[countries$name == "Taiwan, Province of China"] <- "Taiwan"
countries$name[countries$name == "Tanzania, United Republic of"] <- "Tanzania"
countries$name[countries$name == "United Kingdom of Great Britain and Northern Ireland"] <- "United Kingdom"
countries$name[countries$name == "United States of America"] <- "United States"
countries$name[countries$name == "Venezuela (Bolivarian Republic of)"] <- "Venezuela"
countries$name[countries$name == "Viet Nam"] <- "Vietnam"
countries$name[countries$name == "Yemen"] <- "Yemen, Rep."
countries$name[countries$name == "Russian Federation"] <- "Russia"
countries$name[countries$name == "Macedonia (the former Yugoslav Republic of)"] <- "Macedonia, FYR"
countries$name[countries$name == "Micronesia (Federated States of)"] <- "Micronesia, Fed. Sts."
countries$name[countries$name == "Moldova (Republic of)"] <- "Moldova"
countries <- tbl_df(countries) %>%
  select(name, region, `sub-region`)

gap <- gap_df %>%
  left_join(countries, by = c("country" = "name")) %>%
  rename(sub_region = `sub-region`) %>%
  mutate(sub_region = ifelse(country == "West Bank and Gaza", "Western Asia", sub_region)) %>%
  mutate(region = ifelse(country == "West Bank and Gaza", "Asia", region)) %>%
  mutate(sub_region = ifelse(country == "Brunei", "South-Eastern Asia", sub_region)) %>%
  mutate(region = ifelse(country == "Brunei", "Asia", region)) %>%
  mutate(sub_region = ifelse(country == "Cape Verde", "Western Africa", sub_region)) %>%
  mutate(region = ifelse(country == "Cape Verde", "Africa", region)) %>% 
  mutate(sub_region = ifelse(country == "Channel Islands", "Western Europe", sub_region)) %>%
  mutate(region = ifelse(country == "Channel Islands", "Europe", region)) %>% 
  mutate(sub_region = ifelse(country == "Macao, China", "Eastern Asia", sub_region)) %>%
  mutate(region = ifelse(country == "Macao, China", "Asia", region)) %>% 
  mutate(sub_region = ifelse(country == "Netherlands Antilles", "Caribbean", sub_region)) %>%
  mutate(region = ifelse(country == "Netherlands Antilles", "Americas", region)) 

#' Only keep countries with all 61 data points 
#' (Andorra, Bermuda, Dominica, and Marshall Islands only have 22)
#' From Jenny Bryan (https://github.com/jennybc/gapminder/blob/master/data-raw/08_filter-every-five-years.md)
country_freq <- gap %>% count(country)

gap <- country_freq %>% 
  filter(n == 61) %>% 
  left_join(gap, by = "country") %>% 
  select(-n) %>% 
  droplevels() %>%
  arrange(country, year)

#' Export to RDS for workshop work
write_rds(gap, "data/gap.RDS")