# Packages: the pipelines below use dplyr/readr/stringr/tidyr/ggplot2
# (tidyverse), plus tidytext for tokenizing and janeaustenr for the novels.
library(tidyverse)
library(tidytext)    # was a bare `tidytext` symbol — evaluating it errors
library(janeaustenr)
# Annotate every line of Austen's six novels with its line number within the
# book and the chapter it belongs to. A chapter heading is any line starting
# with "chapter" followed by an arabic digit or roman numeral (case-
# insensitive); cumsum() of those matches yields a running chapter counter.
chapter_heading <- regex("^chapter [\\divxlc]", ignore_case = TRUE)

books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, chapter_heading))) %>%
  ungroup()

books
## # A tibble: 73,422 x 4
## text book linenumber chapter
## <chr> <fctr> <int> <int>
## 1 SENSE AND SENSIBILITY Sense & Sensibility 1 0
## 2 Sense & Sensibility 2 0
## 3 by Jane Austen Sense & Sensibility 3 0
## 4 Sense & Sensibility 4 0
## 5 (1811) Sense & Sensibility 5 0
## 6 Sense & Sensibility 6 0
## 7 Sense & Sensibility 7 0
## 8 Sense & Sensibility 8 0
## 9 Sense & Sensibility 9 0
## 10 CHAPTER 1 Sense & Sensibility 10 1
## # ... with 73,412 more rows
# Tokenize: one row per word. unnest_tokens(output, input) lowercases and
# strips punctuation by default, keeping the book/linenumber/chapter columns.
tidy_books <- books %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 x 4
## book linenumber chapter word
## <fctr> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ... with 725,045 more rows
# retrieve state populations in 2014 from Census Bureau
pop_df <- read_csv("http://cfss.uchicago.edu/data/pop2014.csv")

# do these results make sense? Show the ten most populous states.
# top_n(10) selects by the last column (pop2014) and says so in a message.
top_n(arrange(pop_df, desc(pop2014)), 10)
## # A tibble: 10 x 2
## state_name pop2014
## <chr> <int>
## 1 california 38066920
## 2 texas 26092033
## 3 new york 19594330
## 4 florida 19361792
## 5 illinois 12868747
## 6 pennsylvania 12758729
## 7 ohio 11560380
## 8 georgia 9907756
## 9 michigan 9889024
## 10 north carolina 9750405
# Billboard year-end hot-100 lyrics, 1964-2015: one row per charting song
song_lyrics <- readr::read_csv("../data/billboard_lyrics_1964-2015.csv")

# Inspect the structure (5,100 songs x 6 columns; some Lyrics are NA)
str(song_lyrics)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5100 obs. of 6 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Song : chr "wooly bully" "i cant help myself sugar pie honey bunch" "i cant get no satisfaction" "you were on my mind" ...
## $ Artist: chr "sam the sham and the pharaohs" "four tops" "the rolling stones" "we five" ...
## $ Year : int 1965 1965 1965 1965 1965 1965 1965 1965 1965 1965 ...
## $ Lyrics: chr "sam the sham miscellaneous wooly bully wooly bully sam the sham the pharaohs domingo samudio uno dos one two "| __truncated__ "sugar pie honey bunch you know that i love you i cant help myself i love you and nobody elsein and out my life "| __truncated__ NA "when i woke up this morning you were on my mind and you were on my mind i got troubles whoaoh i got worries who"| __truncated__ ...
## $ Source: int 3 1 1 1 1 1 3 5 1 3 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 6
## .. ..$ Rank : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Song : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Artist: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Lyrics: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Source: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Use tidytext to create a data frame with one row for each token in each
# song. Tokenize twice — once into single words and once into bigrams
# ("ngrams", n = 2) — and stack the results with bind_rows(), so that both
# one-word state names ("texas") and two-word ones ("new york") can later be
# matched against pop_df$state_name. The token column is named state_name to
# make that join automatic.
# (The original text had the prose instructions fused onto the start of this
# assignment, which did not parse.)
tidy_lyrics <- bind_rows(song_lyrics %>%
                           unnest_tokens(output = state_name,
                                         input = Lyrics),
                         song_lyrics %>%
                           unnest_tokens(output = state_name,
                                         input = Lyrics,
                                         token = "ngrams", n = 2))
tidy_lyrics
## # A tibble: 3,201,196 x 6
## Rank Song Artist Year Source
## <int> <chr> <chr> <int> <int>
## 1 1 wooly bully sam the sham and the pharaohs 1965 3
## 2 1 wooly bully sam the sham and the pharaohs 1965 3
## 3 1 wooly bully sam the sham and the pharaohs 1965 3
## 4 1 wooly bully sam the sham and the pharaohs 1965 3
## 5 1 wooly bully sam the sham and the pharaohs 1965 3
## 6 1 wooly bully sam the sham and the pharaohs 1965 3
## 7 1 wooly bully sam the sham and the pharaohs 1965 3
## 8 1 wooly bully sam the sham and the pharaohs 1965 3
## 9 1 wooly bully sam the sham and the pharaohs 1965 3
## 10 1 wooly bully sam the sham and the pharaohs 1965 3
## # ... with 3,201,186 more rows, and 1 more variables: state_name <chr>
# Find all the state names occurring in the song lyrics: inner_join() matches
# on the shared state_name column, keeping only tokens that are state names.
# distinct() then counts each state at most once per song, so repeating
# "new york" in a chorus does not inflate its tally.
# (The instruction above was a bare prose line, which would error on source().)
tidy_lyrics <- inner_join(tidy_lyrics, pop_df) %>%
  distinct(Rank, Song, Artist, Year, state_name, .keep_all = TRUE)
tidy_lyrics
## # A tibble: 253 x 7
## Rank Song Artist
## <int> <chr> <chr>
## 1 12 king of the road roger miller
## 2 29 eve of destruction barry mcguire
## 3 49 california girls the beach boys
## 4 10 california dreamin the mamas the papas
## 5 77 message to michael dionne warwick
## 6 61 california nights lesley gore
## 7 4 sittin on the dock of the bay otis redding
## 8 10 tighten up archie bell the drells
## 9 25 get back the beatles with billy preston
## 10 25 get back the beatles with billy preston
## # ... with 243 more rows, and 4 more variables: Year <int>, Source <int>,
## # state_name <chr>, pop2014 <int>
# Calculate the frequency of each state's mentions in songs and create a new
# column for the frequency adjusted by the state's population.
# Tally how many songs mention each state. count(..., sort = TRUE) returns
# the table already ordered by descending n, replacing a separate
# arrange(desc(n)) step. Outer parentheses assign AND print.
(state_counts <- tidy_lyrics %>%
   count(state_name, sort = TRUE))
## # A tibble: 33 x 2
## state_name n
## <chr> <int>
## 1 new york 64
## 2 california 34
## 3 georgia 22
## 4 tennessee 14
## 5 texas 14
## 6 alabama 12
## 7 mississippi 10
## 8 kentucky 7
## 9 hawaii 6
## 10 illinois 6
## # ... with 23 more rows
# Attach the mention counts to the population table and express them as
# mentions per million residents. States never mentioned get n = NA, so
# rate is NA for them as well.
pop_df <- pop_df %>%
  left_join(state_counts) %>%
  mutate(rate = n / pop2014 * 1e6)

# Ten states with the highest per-capita mention rate; top_n(10) picks by
# the last column (rate) and silently drops the NA-rate states.
pop_df %>%
  top_n(10) %>%
  arrange(desc(rate))
## # A tibble: 10 x 4
## state_name pop2014 n rate
## <chr> <int> <int> <dbl>
## 1 hawaii 1392704 6 4.308166
## 2 mississippi 2984345 10 3.350819
## 3 new york 19594330 64 3.266251
## 4 alabama 4817678 12 2.490826
## 5 maine 1328535 3 2.258126
## 6 georgia 9907756 22 2.220483
## 7 tennessee 6451365 14 2.170083
## 8 montana 1006370 2 1.987341
## 9 nebraska 1855617 3 1.616713
## 10 kentucky 4383272 7 1.596981
library(statebins)

# statebins expects title-cased state names. str_to_title() capitalizes the
# "Of" in "District Of Columbia", so that one is fixed by hand. The original
# duplicated this mutate() verbatim in both plot pipelines; compute it once.
state_mentions <- pop_df %>%
  mutate(state_name = stringr::str_to_title(state_name),
         state_name = if_else(state_name == "District Of Columbia",
                              "District of Columbia", state_name))

# Raw number of mentions per state
state_mentions %>%
  statebins_continuous(state_col = "state_name", value_col = "n") +
  labs(title = "Frequency of states mentioned in song lyrics",
       subtitle = "Number of mentions") +
  theme(legend.position = "bottom")

# Mentions adjusted for population (per million residents)
state_mentions %>%
  statebins_continuous(state_col = "state_name", value_col = "rate") +
  labs(title = "Frequency of states mentioned in song lyrics",
       subtitle = "Number of mentions per capita") +
  theme(legend.position = "bottom")
# "I am happy": a simple example sentence for sentiment analysis
# Bing lexicon: each word labeled simply "positive" or "negative"
get_sentiments("bing")
## # A tibble: 6,788 x 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 2-faces negative
## 3 a+ positive
## 4 abnormal negative
## 5 abolish negative
## 6 abominable negative
## 7 abominably negative
## 8 abominate negative
## 9 abomination negative
## 10 abort negative
## # ... with 6,778 more rows
# AFINN lexicon: each word given an integer score from -5 (most negative)
# to +5 (most positive)
get_sentiments("afinn")
## # A tibble: 2,476 x 2
## word score
## <chr> <int>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,466 more rows
# NRC lexicon: words tagged with one or more emotion categories (anger,
# fear, joy, trust, ...) plus positive/negative — a word can repeat across
# rows with different sentiments
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
# How many lexicon words fall into each NRC sentiment category?
count(get_sentiments("nrc"), sentiment)
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 anger 1247
## 2 anticipation 839
## 3 disgust 1058
## 4 fear 1476
## 5 joy 689
## 6 negative 3324
## 7 positive 2312
## 8 sadness 1191
## 9 surprise 534
## 10 trust 1231
# Keep only the tokens that appear in the Bing lexicon (joins on "word"),
# adding a sentiment column to each surviving row
inner_join(tidy_books, get_sentiments("bing"))
## # A tibble: 52,287 x 5
## book linenumber chapter word sentiment
## <fctr> <int> <int> <chr> <chr>
## 1 Sense & Sensibility 16 1 respectable positive
## 2 Sense & Sensibility 16 1 good positive
## 3 Sense & Sensibility 18 1 advanced positive
## 4 Sense & Sensibility 20 1 death negative
## 5 Sense & Sensibility 20 1 great positive
## 6 Sense & Sensibility 21 1 loss negative
## 7 Sense & Sensibility 25 1 comfortably positive
## 8 Sense & Sensibility 28 1 goodness positive
## 9 Sense & Sensibility 28 1 solid positive
## 10 Sense & Sensibility 29 1 comfort positive
## # ... with 52,277 more rows
# Count positive and negative words within each 80-line chunk of each book;
# index = linenumber %/% 80 buckets consecutive lines into chunk numbers
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment)
## # A tibble: 1,840 x 4
## book index sentiment n
## <fctr> <dbl> <chr> <int>
## 1 Sense & Sensibility 0 negative 16
## 2 Sense & Sensibility 0 positive 32
## 3 Sense & Sensibility 1 negative 19
## 4 Sense & Sensibility 1 positive 53
## 5 Sense & Sensibility 2 negative 12
## 6 Sense & Sensibility 2 positive 31
## 7 Sense & Sensibility 3 negative 15
## 8 Sense & Sensibility 3 positive 31
## 9 Sense & Sensibility 4 negative 16
## 10 Sense & Sensibility 4 positive 34
## # ... with 1,830 more rows
# As above, then spread() pivots the sentiment values into separate
# negative/positive columns (one row per book-chunk), filling chunks that
# lack one sentiment with 0
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0)
## # A tibble: 920 x 4
## book index negative positive
## * <fctr> <dbl> <dbl> <dbl>
## 1 Sense & Sensibility 0 16 32
## 2 Sense & Sensibility 1 19 53
## 3 Sense & Sensibility 2 12 31
## 4 Sense & Sensibility 3 15 31
## 5 Sense & Sensibility 4 16 34
## 6 Sense & Sensibility 5 16 51
## 7 Sense & Sensibility 6 24 40
## 8 Sense & Sensibility 7 23 51
## 9 Sense & Sensibility 8 30 40
## 10 Sense & Sensibility 9 15 19
## # ... with 910 more rows
# Finally add a net-sentiment score per chunk: positive word count minus
# negative word count
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## # A tibble: 920 x 5
## book index negative positive sentiment
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 Sense & Sensibility 0 16 32 16
## 2 Sense & Sensibility 1 19 53 34
## 3 Sense & Sensibility 2 12 31 19
## 4 Sense & Sensibility 3 15 31 16
## 5 Sense & Sensibility 4 16 34 18
## 6 Sense & Sensibility 5 16 51 35
## 7 Sense & Sensibility 6 24 40 16
## 8 Sense & Sensibility 7 23 51 28
## 9 Sense & Sensibility 8 30 40 10
## 10 Sense & Sensibility 9 15 19 4
## # ... with 910 more rows
# Net sentiment (positive minus negative Bing words) per 80-line chunk
janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

# plot the sentiment over time in each book; geom_col() is shorthand for
# geom_bar(stat = "identity")
ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ book, ncol = 2, scales = "free_x")
# NOTE(review): the code that produced the output below (tokenizing the Harry
# Potter books, grouped by book) is missing from this excerpt.
## # A tibble: 1,089,386 x 3
## # Groups: book [7]
## book chapter word
## <fctr> <int> <chr>
## 1 philosophers_stone 1 the
## 2 philosophers_stone 1 boy
## 3 philosophers_stone 1 who
## 4 philosophers_stone 1 lived
## 5 philosophers_stone 1 mr
## 6 philosophers_stone 1 and
## 7 philosophers_stone 1 mrs
## 8 philosophers_stone 1 dursley
## 9 philosophers_stone 1 of
## 10 philosophers_stone 1 number
## # ... with 1,089,376 more rows