---
title: "The Tate Collection"
subtitle: "MakeoverMonday challenge for week 24 2017"
date: "June 11, 2017"
author: "jake riley"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = F, warning = F, message = F, cache = T)
```

###This week I chose to look at the metadata for the Tate Collection. Read about the Tate Collection [here ](http://www.tate.org.uk/about/our-work/collection/about-the-collection).

####Find more information about this project:
* [MakeoverMonday](http://www.makeovermonday.co.uk/data/)
* [Original Data](https://github.com/tategallery/collection)
* [Code on GitHub/rjake](https://raw.githubusercontent.com/rjake/makeoverMonday/master/2017_24_tate_collection.Rmd)

```{r setup_workspace, echo = T}
library(data.table)
library(knitr)
library(stringi)
library(tidyverse)
library(wordcloud2)

options(scipen = 999)
```

```{r read_data}
art_raw <-
  "https://raw.githubusercontent.com/tategallery/collection/master/artwork_data.csv"  %>% 
  fread(., select= c(1,5,8,10,11), 
          na.strings = "", data.table = F, encoding = "UTF-8")

artist_raw <-
  "https://raw.githubusercontent.com/tategallery/collection/master/artist_data.csv" %>% 
  fread(., select = c(1,2,3,5:7), na.strings = "", data.table = F, encoding = "UTF-8") %>% 
  rename(artistId = id) %>% 
  mutate(name = stri_trans_general(name, "latin-ascii"),
         placeOfBirth = stri_trans_general(placeOfBirth, "latin-ascii"))

rename_places <-
  "https://raw.githubusercontent.com/rjake/makeoverMonday/master/2017_24_places_rename.csv" %>% 
  fread(., data.table = F, encoding = "UTF-8")

```

```{r}
#Get rid of missing data, remove medium values that don't start with letters
#create new columns (not all are used)
#isolate the 1st word from the medium column
#place names use older titles (ex: Misr = Egypt), I use a reference file to update the names
art_plot <-
  art_raw %>% 
  left_join(artist_raw) %>% 
  filter(!is.na(year),
         year != "no date",
         !is.na(yearOfBirth),
         !is.na(yearOfDeath),
         !is.na(acquisitionYear),
         !is.na(gender),
         grepl("^[A-Za-z]", medium)) %>% 
  mutate(year = as.integer(year)) %>% 
  mutate(artist_age_created = paste0(round(year - yearOfBirth, -1), "s"),
         art_age_acquire = (acquisitionYear - year)%>%
                           ifelse(. < 0, 0, .) %>% 
                           cut(., breaks = seq(0, 425, 25),
                               include.lowest = T),
         post_humus = ifelse(acquisitionYear > yearOfDeath, 
                             "post-humus", "while living"),
         decade_born = round(yearOfBirth, -1),
         decade_died = round(yearOfDeath, -1),
         decade_created = paste0(round(year, -1), "s"),
         acquisition_decade = paste0(round(acquisitionYear, -1), "s")) %>% 
  select(-c(yearOfBirth, yearOfDeath, id, artistId)) %>% 
  mutate(medium = stri_extract(medium, regex = '\\w*'),
         last_name = gsub(", .*", "", name),
         last_name = ifelse(nchar(last_name) > 20,
                       gsub(" \\(.*", "", last_name), last_name)) %>%
  mutate(placeOfBirth = gsub(".*, ", "", placeOfBirth)) %>% 
  left_join(rename_places) %>% 
  mutate(placeOfBirth = Rename) %>% 
  select(-Rename, -name)
```


```{r}
range_new <- 
  function(x, newMin, newMax){
    (x - min(x))/(max(x)-min(x)) * (newMax - newMin) + newMin 
  }

get_columns <-   
  art_plot %>% 
  select(medium, last_name, placeOfBirth, decade_created) %>% 
  colnames()


word_plot <-
  art_plot %>% 
  select(one_of(get_columns)) %>% 
  gather(key="Var", value = "Response", na.rm = T) %>%
  group_by(Var, Response) %>% 
  summarise(N = n()) %>% 
  group_by(Var) %>% 
  arrange(-N) %>% 
  mutate(color_V = row_number()*2) %>% 
  filter(color_V < 100) %>% 
  ungroup() %>% 
  arrange(Var) %>%
  rowwise() %>% 
  mutate(color_H = which(get_columns == Var) * .22) %>% 
  ungroup() %>% 
  #rowwise() %>% 
  mutate(color = hsv(color_H, .5, color_V/100)) %>% 
  filter(N > 1) %>% 
  arrange(desc(N)) %>% 
  mutate(logN = log(N),
         rangeN = range_new(N, 2, 30))
```


```{r eval = F}
print_wordcloud <-
  function(x){
    #x = 1
    var_to_use <- get_columns[x]
    
    word_plot_prep <-
      word_plot %>% 
      filter(Var == var_to_use) %>%
      slice(1:100) %>% 
      select(Response, N = rangeN, color) 
  
    row.names(word_plot_prep) <- word_plot_prep$Response
    
    colorVec = grey(seq(0.1, .7, length.out = nrow(word_plot_prep)))
    
    word_cloud_save <- 
      wordcloud2(word_plot_prep[, 1:2],
               maxRotation = 0, minRotation = 0, 
               color = colorVec,
               minSize = 12)

    word_cloud_html <- paste0(i, ".html")
    word_cloud_png <- paste0(i, ".png")  
    
    
    my_path  <- htmltools::html_print(word_cloud_save) # saves html in temp directory
    print(my_path)
}
```

##```r get_columns[1]```
```{r eval = F}
print_wordcloud(1)
```
![](1.png)
<br>

##```r get_columns[2]```
```{r eval = F}
print_wordcloud(2)
```
![](2.png)
<br>

##```r get_columns[3]```
```{r eval = F}
print_wordcloud(3)
```
![](3.png)
<br>

##```r get_columns[4]```
```{r eval = F}
print_wordcloud(4)
```
![](4.png)

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

#Final Product
After exporting each png through RStudio Viewer > Export, I combined the images together.

##The Tate Collection
![](1_4.png)

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>


I was also interested in the number of works by women in the Tate Collection. It looks like the collection of female artists started to grow around the early 1900s and most of the work is from that point forward. 


```{r fig.width = 6, fig.height = 8}
  ggplot(art_plot) +
    facet_grid(gender~.) +
    geom_count(aes(y = year, 
                   x = acquisitionYear,
                   color = year >= 1900
                   ),
               alpha = .25) +
    scale_color_manual(values = c("grey60", "orange")) +
    geom_hline(yintercept = 1900) +
    geom_vline(xintercept = 1900, linetype = "dotted") +
    guides(size = F) +
    labs(y = "Year Created", x = "Year Acquired") +
    theme(panel.background = element_rect(fill = "white", color = "black")) +
    coord_fixed(.2) +
  ggtitle("The effort to include more women has focused primarily \non artists from 1900 onwards ")
```


```{r}
a <-
  art_plot %>% 
  mutate(year1900 =
           ifelse(year < 1900, "pre1900", "1900+")) %>% 
  count(gender, year1900) %>% 
  group_by(year1900) %>% 
  mutate(pct = (n/sum(n)) %>% round(., 3) * 100) %>% 
  ungroup() %>% 
  arrange(desc(year1900))

ggplot(a) +
  geom_col(aes(x = year1900, y = n, fill = gender)) +
  coord_flip() +
  labs(x = "year created", y = "# of works") +
  theme(aspect.ratio = .25) +
  ggtitle("The 1127 works of art by women make up less than 3% of the Tate Collection")

art_plot %>% 
  count(gender)%>% 
  mutate(pct = (n/sum(n)) %>% round(., 3) * 100) %>% 
  kable(padding = 0)
```


```{r eval = F}
#not using
  a <- 
    art_plot %>% 
    #filter(acquisitionYear > 1860) %>% 
    mutate(side = year > 1900) %>% 
    count(gender, side, acquisitionYear) %>% 
    mutate(n = ifelse(side == T, n, -n))
  
  ggplot(a, aes(y = n, x = acquisitionYear, fill = side)) +
    facet_grid(.~gender)+#, scales = "free_y") +
    geom_col(data = filter(a, side == T)) +
    geom_col(data = filter(a, side == F)) +
    scale_color_manual(values = c("grey60", "navyblue")) +
    geom_hline(yintercept = 0) +
    geom_vline(xintercept = 1900, linetype = "dotted") +
    guides(size = F) +
    coord_cartesian(ylim = c(-1000,2000))+
    labs(y = "Year Created", x = "Year Acquired") +
    theme(panel.background = element_rect(fill = "white", color = "black"))
```