--- title: "Gender in Comic Books" author: "Jonathan Bouchet" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: df_print: kable vignette: | %\VignetteIndexEntry{Comic gender analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- This vignette is based on [538 study : Comic Books Are Still Made By Men, For Men And About Men](https://fivethirtyeight.com/features/women-in-comic-books/) study about Marvel and DC characters since ~1939 until 2014 (August 24th) and aims at investigating the features of Comic Books characters according their gender / Publisher. ```{r, message = FALSE, warning = FALSE} #load packages and csv file library(fivethirtyeight) library(ggplot2) library(dplyr) library(readr) library(tidyr) library(lubridate) library(janitor) library(knitr) library(grid) library(fmsb) library(wordcloud) library(gridExtra) ``` #### Overview plots **Note as of 2020-07-18:** The entire `comic_charachters` dataset is available in the `fivethirtyeightdata` package. Instructions to download the `fivethirtyeightdata` package are given when you load the `fivethirtyeight` package. Alternately, you can run the following: ```{r eval=FALSE} install.packages(fivethirtyeightdata, repos = "https://fivethirtyeightdata.github.io/drat/", type = "source" ) ``` Load full dataset using code in `?comic_characters` help file. Note we need to do this since `fivethirtyeight::comic_characters` only contains a preview of the first 10 rows of the full dataset. ```{r, warning = FALSE, message = FALSE} # Get DC characters: comic_characters_dc <- "https://github.com/fivethirtyeight/data/raw/master/comic-characters/dc-wikia-data.csv" %>% read_csv() %>% clean_names() %>% mutate(publisher = "DC") # Get Marvel characters: comic_characters_marvel <- "https://github.com/fivethirtyeight/data/raw/master/comic-characters/marvel-wikia-data.csv" %>% read_csv() %>% clean_names() %>% mutate(publisher = "Marvel") # Merge two dataset and perform further data wrangling: comic_characters <- comic_characters_dc %>% bind_rows(comic_characters_marvel) %>% separate(first_appearance, c("year2", "month"), ", ", remove = FALSE) %>% mutate( # If month was missing, set as January and day as 01: month = ifelse(is.na(month), "01", month), day = "01", # Note some years missing: date = ymd(paste(year, month, day, sep = "-")), align = factor( align, levels = c("Bad Characters", "Reformed Criminals", "Netural Characters", "Good Characters"), ordered = TRUE) ) %>% select(publisher, everything(), -c(year2, day)) ``` #### Overview plots * percentage of Gender per publisher. * raw number of characters per publisher. ```{r, warning = FALSE, message = FALSE, fig.width = 10} #calculate raw number and percentage of each gender for each publisher raw_number_per_publisher <- comic_characters %>% group_by(publisher) %>% summarise(number = n()) %>% arrange(-number) percent_gen_pub <- comic_characters %>% group_by(sex, publisher) %>% summarise(number = n()) %>% arrange(-number) %>% group_by(publisher) %>% mutate(countT = sum(number)) %>% group_by(sex) %>% mutate( percentage = (100*number/countT), label = paste0(round(percentage, 2))) #plot percentage of each gender for each publisher percentage_per_publisher <- ggplot(data = percent_gen_pub,aes(x = sex,y = percentage, fill = publisher)) + geom_bar(width = 0.9, stat = "identity", position = 'dodge') + theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position = 'none') + geom_text(aes(label = label), position=position_dodge(width = 0.9), vjust = -0.25,size = 2.5) + scale_fill_manual(values = c("#3B9AB2","#EBCC2A")) + xlab('') + ylab('Percentage') raw_number_per_publisher <- ggplot(data = raw_number_per_publisher, aes(x = publisher, y = number, fill = publisher)) + geom_bar(width = 0.1, stat = "identity") + coord_flip() + scale_fill_manual(values = c("#3B9AB2","#EBCC2A")) + xlab('') + ylab('') + theme(legend.position = 'None') grid.arrange(percentage_per_publisher, raw_number_per_publisher, ncol=2) ``` * Marvel has more than the double of number of characters compared to DC. * `Male` are more present in both publishers compared to `Female` on a ~2.5:1 ratio, while `LGBT` characters represent less than 1 percent. #### Number of Characters vs. Time ```{r, message = FALSE, warning = FALSE, fig.width = 10} #select data with no NA's for sex and date and groupby #define list of gender per publisher gender_list_marvel <- c("Female Characters", "Male Characters", "Genderfluid Characters","Agender Characters") gender_list_dc <- c("Female Characters", "Male Characters", "Genderless Characters","Transgender Characters") marvel_vs_time <- comic_characters %>% filter(publisher == 'Marvel' & !is.na(month) & !is.na(sex)) %>% group_by(year, month, sex) %>% summarise(number = n()) %>% mutate( sex_ordered = factor(sex, levels = gender_list_marvel), month_ordered = factor(month, levels = month.name)) dc_vs_time <- comic_characters %>% filter(publisher == 'DC' & month!= "Holiday" & !is.na(month) & !is.na(sex)) %>% mutate(month = ifelse(month=="01","January",month)) %>% group_by(year, month, sex) %>% summarise(number = n()) %>% mutate( sex_ordered = factor(sex, levels = gender_list_dc), month_ordered = factor(month, levels = month.name)) plot_marvel_time <- ggplot(data = marvel_vs_time, aes(year, month_ordered)) + geom_tile(aes(fill = number),colour = "white") + scale_fill_gradient(low = "#EBCC2A", high = "black") + facet_wrap(~ sex_ordered, ncol = 4) + theme(axis.title.x = element_blank(), axis.ticks.x = element_blank(), axis.title.y = element_blank(), axis.ticks.y = element_blank(), legend.position = 'right', legend.title = element_blank(), legend.key.size = unit(.2, "cm")) + xlim(1935,2015) plot_dc_time <- ggplot(data = dc_vs_time, aes(year, month_ordered)) + geom_tile(aes(fill = number),colour = "white") + scale_fill_gradient(low = "#3B9AB2", high = "black") + facet_wrap(~ sex_ordered, ncol = 4) + theme(axis.title.x = element_blank() ,axis.ticks.x = element_blank(), axis.title.y = element_blank(), axis.ticks.y = element_blank(), legend.position = 'right', legend.title = element_blank(), legend.key.size = unit(.2, "cm")) + xlim(1935,2015) grid.arrange(rbind(ggplotGrob(plot_marvel_time), ggplotGrob(plot_dc_time), size = "last")) ``` * Most of the characters are either `Female` or `Male` for both publishers. * There is a gap in character's creation (for both publishers) ~1955-1960. * The increase in `Female` characters appear early for Marvel (~1970) while it was in the late 70's for DC comics. * `LGBT` characters appear in the late 70's (DC) and in the late 60's (Marvel). #### Characteristics per Gender, publisher To represent the average characteristics, radarchart will be used. * Characteristics looked at are : `id`,`align`,`eye`,`hair` and `alive`. * the idea is to group the rows by these features and compute their mean * later by taking the higher percentage, we can deduce a `profile` representative of each publisher. ##### example: vs `Hair` feature ```{r, message = FALSE, warning = FALSE} #function to aggregate data by (gender, feature, publisher) aggregateFeature <- function(current_publisher, feature){ #empty list to keep dataframe by (gender, feature) currentFeature <- list() if(current_publisher == 'Marvel'){ gender_list <- gender_list_marvel } else { gender_list <- gender_list_dc } for(i in 1:length(gender_list)){ currentFeature[[i]] <- comic_characters %>% filter(publisher == current_publisher ) %>% select(hair, sex) %>% na.omit() %>% group_by(hair) %>% filter(sex == gender_list[i]) %>% summarise(number = n()) %>% arrange(-number) %>% mutate(countT = sum(number)) %>% mutate(percentage = round(100*number/countT,1)) %>% select(hair, percentage, number) if(current_publisher == 'Marvel'){ colnames(currentFeature[[i]])[2]<-'percentage_marvel' } else{ colnames(currentFeature[[i]])[2]<-'percentage_dc' } if(feature == 'hair'){ #strip 'hair' word for better display of the radarchart currentFeature[[i]]$hair <- sapply(currentFeature[[i]]$hair, function(x) gsub(' Hair','', x))} } names(currentFeature) <- gender_list return(currentFeature) } ``` ```{r, message = FALSE, warning = FALSE} #outer join the 2 dataframes for (feature=Hair, gender=Male) merged <- full_join(aggregateFeature('DC','hair')[[2]], aggregateFeature('Marvel','hair')[[2]], by='hair') #set min/max percentages for the radarchart limits min <- rep(0, length(merged$hair)) max <- rep(50, length(merged$hair)) maleHair <- data.frame(rbind(max,min,merged$percentage_dc,merged$percentage_marvel)) colnames(maleHair) <- merged$hair row.names(maleHair) <- c('max','min','percentage_dc','percentage_marvel') maleHair[is.na(maleHair)] <- 0 ``` ```{r, message = FALSE, warning = FALSE} #cosmetics radarchart(maleHair, #dataframe axistype=2, #axis type pcol=c("#3B9AB2", "#EBCC2A"), #color -Wes Anderson Sissou- plwd=1, #axis line width pty=19, #marker type plty=3, #line type cglcol="grey", #axis line color cglty=2, #axis line type axislabcol="grey", #Color of axis label cglwd=.6, #line type vlcex=.6, #font size label palcex=1.) #font size value legend(x=1, y=1.3, #position legend = c('DC male characters','Marvel male characters'),#labels bty="n", #no window pch=16, #marker type text.col = "black", #label colors col=c("#3B9AB2","#EBCC2A"), #marker color cex=.8, #marker size pt.cex=1) #font size ``` #### Popularity By looking at the number of appearances for each characters, we can make a visualization (`wordcloud`) representing the most popular characters. ```{r, message = FALSE, warning = FALSE, fig.width = 10} #remove NA from appearance column, keep only the name inside parentheses marvel_appearances <- comic_characters %>% filter(publisher == 'Marvel') %>% select(name, appearances) %>% na.omit() %>% mutate(name = gsub(" *\\(.*?\\) *", "", name)) dc_appearances <- comic_characters %>% filter(publisher == 'DC') %>% select(name, appearances) %>% na.omit() %>% mutate(name = gsub(" *\\(.*?\\) *", "", name)) color_marvel <- colorRampPalette(c("#EBCC2A", "black")) color_dc <- colorRampPalette(c("#3B9AB2", "black")) op <- par(mar = c(1, 2, 2, 1), mfrow = c(1, 2)) wordcloud(dc_appearances$name, dc_appearances$appearances, min.freq = 100, colors = color_dc(10), scale = c(1.75, 0.2)) wordcloud(marvel_appearances$name, marvel_appearances$appearances, min.freq = 250, colors = color_marvel(10), scale = c(1.5, 0.2)) ``` * `Batman`(DC) and `Spider-Man`(Marvel) ends up being the characters with the most appearances.