##### Load R Packages
library(tidyverse)
library(readxl)
library(reshape2)

##### Read in data files
# Overview of the chart this script works towards:
# https://www.gapminder.org/fw/world-health-chart/whc2017/

# Country metadata. Source: https://www.gapminder.org/data/geo/
countries <- read_xlsx(
  "datasets/gapminder/Data Geographies - v1 - by Gapminder.xlsx",
  sheet = 2
)

# Life expectancy. Source: https://www.gapminder.org/data/documentation/gd004/
life_expectancy <- read_xlsx(
  "datasets/gapminder/lex-by-gapminder.xlsx",
  sheet = 2
)

# Population. Source: https://www.gapminder.org/data/documentation/gd003/
population <- read_xlsx(
  "datasets/gapminder/Data Population - v5 - 1800 to 2100 World Regions and Countries by Gapminder.xlsx",
  sheet = "data-countries-etc-by-year-colu"
)

# Income. Source: https://www.gapminder.org/data/documentation/gd001/
income <- read_xlsx(
  "datasets/gapminder/gdppc_cppp-by-gapminder.xlsx",
  sheet = 2
)

##### Data exploration
names(countries)
names(life_expectancy)
names(income)
names(population)

##### Data preparation
# The indicator tables have overlapping column names, so each table's columns
# from a given position onwards get a table-specific prefix. That keeps them
# identifiable once everything is merged into a single data frame.

# Apply `fn` to the names of all columns of `df` from position `first` to the
# last column and return the renamed data frame.
rename_from <- function(df, first, fn) {
  idx <- first:ncol(df)
  names(df)[idx] <- fn(names(df)[idx])
  df
}

life_expectancy <- rename_from(
  life_expectancy, 5, function(nm) paste0("le_", nm)
)
income <- rename_from(
  income, 5, function(nm) paste0("income_", nm)
)
# Population headers are first cut down to their first four characters
# (presumably to leave just the year -- confirm against the sheet) and then
# prefixed. A previously tried alternative for the clean-up step was
# as.character(as.integer(<name>)).
population <- rename_from(
  population, 4, function(nm) paste0("pop_", substr(nm, 1, 4))
)

# Left-merge every indicator table onto the country list, keyed by `geo`, so
# that every row of `countries` is kept.
gapminder <- Reduce(
  function(left, right) merge(left, right, by = "geo", all.x = TRUE),
  list(countries, life_expectancy, income, population)
)
names(gapminder)

# The year to plot is held in a variable so that the figure can be rebuilt
# for a different year just by changing that one value.
year <- "2017"

# Reduce the merged data to the identifying columns plus the three indicator
# columns for `year`, giving those indicator columns year-independent names
# in the same step.
gapminder2 <- gapminder %>%
  rename(name = name.x) %>%
  select(
    geo,
    name,
    four_regions,
    income = paste0("income_", year),
    population = paste0("pop_", year),
    life_expectancy = paste0("le_", year)
  )

# Spot check: show the row for a single country.
filter(gapminder2, geo == "usa")

# Boundaries between the four income levels, see
# https://www.gapminder.org/topics/four-income-levels/
# Each boundary is a multiple of 365 (presumably a per-day amount scaled to a
# year -- confirm against the income data's units).
income_levels <- c(2, 8, 32) * 365

# One row per income level: the x position where the level starts and the
# label to draw there.
income_levels_df <- data.frame(
  position = c(0, income_levels),
  label = c("Level 1", "Level 2", "Level 3", "Level 4")
)

##### Build the graphic
# Layers are added in drawing order: level boundaries, then their labels,
# then the country points on top.
# NOTE(review): the first level's position is 0, which log10 maps to -Inf;
# confirm its line and label render where intended.
ggplot(
  gapminder2,
  aes(
    x = income,
    y = life_expectancy,
    color = four_regions,
    size = population
  )
) +
  geom_vline(
    data = income_levels_df,
    mapping = aes(xintercept = position)
  ) +
  geom_text(
    data = income_levels_df,
    mapping = aes(x = position, label = label),
    y = 50,
    color = "black",
    size = 4,
    hjust = -0.1
  ) +
  geom_point() +
  scale_x_log10() +
  theme_light()