---
title: "Machine Learning with R"
output: html_notebook
---
## Setup
```{r package imports, message=FALSE}
library(dplyr) # data manipulation
library(ggplot2) # data visualisation
library(readr) # data import
library(stringr) # text processing
library(tidyr) # data transformation
```
```{r theming}
theme_set(theme_minimal())
```
```{r parameters}
BASE_URL <- paste0('https://raw.githubusercontent.com/warwickdatascience/',
'helping-hack-heart-of-england/main/resources/')
```
## Exploratory Analysis
Data available from the [hackathon website](https://warwickdatascience.github.io/helping-hack-heart-of-england/).
```{r data import, message=FALSE}
imd <- read_csv(paste0(BASE_URL, 'imd.csv'))
ref <- read_csv(paste0(BASE_URL, 'ref.csv'))
```
```{r sample datasets}
sample_n(imd, 10)
sample_n(ref, 10)
```
```{r change in imd}
imd %>%
filter(!is.na(imd_2019)) %>%
mutate(imd_change = imd_2019 - imd_2015) %>%
ggplot(aes(x = imd_change)) +
geom_histogram(bins = 25, col = 'black', fill = 'lightblue') +
labs(x = "Change in IMD", y = "Count")
```
```{r data aggregation}
ref_agg <- ref %>%
group_by(lad_code, lad_name, class, category) %>%
# Note: 2017 Devon & Cornwall Police and Crime data is missing
summarise(expenditure = mean(expenditure, na.rm = TRUE), .groups = 'drop')
sample_n(ref_agg, 10)
```
A massive flaw is already obvious: by not including county/London boroughs/other authorities, we are not accurately reflecting spending. We will not address this issue here as this is one way you can approve your leaderboard score.
```{r expenditure by category, fig.asp=2}
ref_agg %>%
filter(expenditure > 0) %>%
ggplot(aes(x = log(expenditure))) +
geom_histogram(bins = 25, col = 'black', fill = 'lightblue') +
labs(x = "Expenditure", y = "Count") +
facet_wrap(~ category, ncol = 2)
```
```{r expenditure by category and class, fig.asp=0.5}
ref_agg %>%
mutate(expenditure_sign = case_when(
near(expenditure, 0) ~ 'Zero',
expenditure < 0 ~ 'Negative',
expenditure > 0 ~ 'Postive'
), class_type = case_when(
class %in% c('L', 'UA', 'SD') ~ 'Individual',
class %in% c('MD', 'SC') ~ 'Combined',
TRUE ~ 'Other'
)) %>%
ggplot(aes(x = category, fill = expenditure_sign)) +
geom_bar(aes()) +
labs(x = "Expenditure Category", y = "Proportion", fill = "Expenditure Sign") +
facet_wrap(~ class_type, scales = 'free_x') +
coord_flip() +
theme(axis.text.x = element_blank())
```
```{r correlations, fig.asp=2}
imd %>%
filter(!is.na(imd_2019)) %>%
mutate(imd_change = imd_2019 - imd_2015) %>%
select(lad_code, imd_change) %>%
inner_join(select(ref_agg, lad_code, category, expenditure) %>%
filter(expenditure > 0),
by = 'lad_code') %>%
ggplot(aes(x = log(expenditure), y = imd_change)) +
geom_point() +
geom_smooth(method = 'lm', formula = y ~ x) +
facet_wrap(~ category, scales = 'free', ncol = 2) +
labs(x = "Expenditure", y = "Change in IMD")
```
## Modelling
```{r transform data}
ref_wide <- ref_agg %>%
mutate(category = str_remove_all(
str_replace_all(category, ' ', '_'), '[^\\w ]+'
)) %>%
spread(key = category, value = expenditure)
sample_n(ref_wide, 10)
```
```{r combine datasets}
combi <- imd %>%
select(lad_code, imd_2015, imd_2019) %>%
left_join(select(ref_wide, -c(lad_name, class)), by = 'lad_code')
```
```{r build null model}
# Null model on leaderboard
null <- lm(imd_2019 ~ offset(imd_2015),
data = filter(combi, !is.na(imd_2019)))
```
```{r build model}
# Baseline model on leaderboard
mod <- lm(formula(paste0('imd_2019 ~ offset(imd_2015) + ',
paste(colnames(combi)[-(1:5)], collapse = ' + '))),
data = filter(combi, !is.na(imd_2019)))
```
```{r compare models}
anova(null, mod)
```
## Output
```{r make predictions}
pred <- predict(mod, newdata = filter(combi, is.na(imd_2019)))
```
```{r output predictions}
imd %>%
filter(is.na(imd_2019)) %>%
mutate(imd_2019 = pred) %>%
write_csv('../resources/output/tutorial.csv')
```
## Comments
Possible improvements:
- Use lookup tables to bring in all data sources
- Consider using multi-level modeling techniques
- Implement cross-validation to ensure generalisation
- Use regularisation to reduce generalisation gap
- Consider more complex models
- Model using a transformed expenditure
- Use time series modelling