#' CSSS 508, Week 10
#' ===
#' author: Rebecca Ferrell
#' date: June 1, 2016
#' transition: rotate
#' width: 1100
#' height: 750
#'
#'
#'
#' Topics
#' ===
#'
#' * Scraping the web with `rvest`
#' * Mining text with `tm`
#' * What next?
#'
#'
#' Web scraping with rvest
#' ===
#' type: section
#'
#'
#' Wait, isn't that Argus Filch?
#' ===
#'
#' ![harry potter and game of thrones](https://pbs.twimg.com/media/Bt4B0NAIYAAbGMu.jpg)
#'
#'
#' Game of Thrones x Harry Potter
#' ===
#'
#' We'll use the package `rvest` ("harvest") to grab [IMDb](http://www.imdb.com) casts for Game of Thrones and Harry Potter to identify all overlapping actors.
#'
## ----warning=FALSE, message=FALSE----------------------------------------
# install.packages("rvest")
library(rvest)

#'
#' First, try out [SelectorGadget](https://cran.r-project.org/web/packages/rvest/vignettes/selectorgadget.html).

## ----pull_got, cache=TRUE------------------------------------------------
# pull full Game of Thrones cast page
got_page <- read_html("http://www.imdb.com/title/tt0944947/fullcredits")
got_cast_raw <- got_page %>%
    html_nodes(".itemprop .itemprop , .character div") %>%
    html_text()

#'
#'
#' Cleaning up the Game of Thrones cast
#' ===
#'
## ------------------------------------------------------------------------
head(got_cast_raw)

#'
#' Pattern appears to be: [actor name], [messy character info], repeat.
#'
#'
#' Cleaning up the Game of Thrones cast
#' ===
#'
#' Make a data frame:
#'
## ------------------------------------------------------------------------
got_cast_df <- data.frame(matrix(got_cast_raw, ncol = 2, byrow = TRUE),
                          stringsAsFactors = FALSE)
colnames(got_cast_df) <- c("Actor", "char_info")
head(got_cast_df, 3)

#'
#'
#' Clean up the character column
#' ===
#'
#' We want to trim initial whitespace, extract the character, and move the episode count to a new column.
#'
## ----warning=FALSE, message=FALSE----------------------------------------
library(stringr); library(dplyr)
got_cast <- got_cast_df %>%
    mutate(char_info = str_trim(char_info),
           GoT_character = str_trim(str_extract(char_info, "^.*\\n")),
           Episodes = as.numeric(
               str_extract(str_extract(char_info, "[0-9]* episode"), "[0-9]*"))) %>%
    select(-char_info)

#'
#'
#' Scraping Harry Potter actors
#' ===
#'
#' We'll want to loop over all eight films to do this!
#'
## ------------------------------------------------------------------------
HP_URLs <- c("http://www.imdb.com/title/tt0241527/fullcredits",
             "http://www.imdb.com/title/tt0295297/fullcredits",
             "http://www.imdb.com/title/tt0304141/fullcredits",
             "http://www.imdb.com/title/tt0330373/fullcredits",
             "http://www.imdb.com/title/tt0373889/fullcredits",
             "http://www.imdb.com/title/tt0417741/fullcredits",
             "http://www.imdb.com/title/tt0926084/fullcredits",
             "http://www.imdb.com/title/tt1201607/fullcredits")

#'
#' Harry Potter scraping
#' ===
#'
#' Looping game plan:
#'
#' 1. Create a list with a spot for each film
#' 2. Scrape the cast into the spot for each film
#' 3. Reshape the character vector into a matrix
#' 4. Combine the casts of all the films
#' 5. Remove whitespace, etc.
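#'
#' Before running the full loop, one quick sanity check is to scrape a single film and confirm the page still comes back as actor/character pairs -- a minimal sketch, not part of the original game plan; it reuses the GoT CSS selector and the `HP_URLs` vector above:
#'
## ----HP_sanity_check, eval=FALSE------------------------------------------
# hedged sketch: scrape only the first film with the same selector used for GoT
test_raw <- read_html(HP_URLs[1]) %>%
    html_nodes(".itemprop .itemprop , .character div") %>%
    html_text()
length(test_raw) %% 2  # should be 0 if actor and character entries alternate
head(test_raw, 4)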
#'
## ------------------------------------------------------------------------
HP_cast_list <- vector("list", length(HP_URLs))

#'
#'
#' Looping
#' ===
#'
#' Consolidate the work done for GoT into a loop for HP:

## ----HP_loop, warning=FALSE, cache=TRUE----------------------------------
for(i in seq_along(HP_URLs)) {
    HP_cast_list[[i]] <- read_html(HP_URLs[i]) %>%
        html_nodes(".itemprop .itemprop , .character div") %>%
        html_text() %>%
        matrix(ncol = 2, byrow = TRUE) %>%
        data.frame(stringsAsFactors = FALSE)
    colnames(HP_cast_list[[i]]) <- c("Actor", "HP_character")
}
HP_cast <- bind_rows(HP_cast_list, .id = "HP_film") %>%
    mutate_each(funs(str_trim))

#'
#' Who was in both?
#' ===
#' incremental: true
#'
## ------------------------------------------------------------------------
both_GoT_HP <- HP_cast %>%
    inner_join(got_cast, by = "Actor") %>%
    arrange(desc(Episodes), Actor)

#'
#' * ![aragog pycelle](http://assets.cdn.moviepilot.de/files/b9494cbab8d744871de233c28109d0406548a64732c5f2baf993d88bf4d0/limit/1000/1000/daGWgAqX.jpg)
#'
#'
#' Other ways of getting data off the web
#' ===
#'
#' Specialized packages for specific services:
#'
#' * `twitteR` (Twitter REST API), `streamR` (Twitter streaming API), `Rfacebook`
#'     + Require you to get a key to run queries -- store the key in a separate file and read it in; do not hardcode it or share it with others!
#'     + Rate limiting can be a challenge; use `Sys.sleep(seconds)` if needed to slow your code down
#'
#' General API access:
#'
#' * `httr` for HTTP requests and responses
#' * `jsonlite` for parsing JSON, `XML` for XML
#'
#' Many tutorials are just a Google search away!
#'
#'
#'
#' Text mining with tm
#' ===
#' type: section
#'
#'
#' Text mining terminology
#' ===
#'
#' * `tm`: R package for performing text mining
#' * Term: word
#' * Document: collection of terms
#' * Corpus: a collection of documents (plural: corpora)
#' * Dictionary: set of relevant terms
#'
#'
#' My first corpus
#' ===
#'
#' We can make a toy corpus manually by creating a character vector, running `VectorSource` on it to read it in, and then `VCorpus` to corpus-ify:
#'
## ------------------------------------------------------------------------
library(tm)
UW_tweets <- c("Remembering and honoring those who made the ultimate sacrifice while serving our country. #MemorialDay2016",
               "VIDEO: This spring @UW students taught literacy arts to #Colville Reservation students. Check out book they made!",
               "Enjoy the long weekend, Huskies! And to those studying for finals: Good luck and hang in there!",
               ".@UWBuerk & @UWFosterSchool–hosted biz plan competition awards $85,000 to students for new ventures. http://ow.ly/3PtI300F87Y #UWinnovates")
toy_corpus <- VCorpus(VectorSource(UW_tweets))

#'
#'
#' Accessing corpus entries
#' ===
#'
#' A corpus is just a fancy list of documents, and you can access a document as a list entry:
#'
## ------------------------------------------------------------------------
toy_corpus[[3]]
as.character(toy_corpus[[3]])

#'
#'
#' Text files as documents
#' ===
#'
#' You will more likely be making corpora from sources like Twitter or reading in data from text files.
#'
#' We'll import a sample of emails from the [Enron corpus](http://bailando.sims.berkeley.edu/enron_email.html) assembled by UC Berkeley students. First, let's download a ZIP file with the text files and unzip it.
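#'
#' The next chunk does the download and unzip directly. As a side note, one could guard that step so re-knitting does not fetch the ZIP again -- a minimal sketch; it assumes the same file and folder names used in the chunk below:
#'
## ----download_guard, eval=FALSE--------------------------------------------
# hedged sketch: only download and unzip if the email folder is not already present
if (!dir.exists("enron_emails/enron_sample_emails")) {
    download.file("https://www.dropbox.com/s/qrd1j44qnlzg68a/enron_sample_emails.zip?dl=1",
                  destfile = "enron_emails.zip", mode = "wb")
    unzip("enron_emails.zip", exdir = "enron_emails")
}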
#'
## ----eval=FALSE------------------------------------------------------------
download.file("https://www.dropbox.com/s/qrd1j44qnlzg68a/enron_sample_emails.zip?dl=1",
              destfile = "enron_emails.zip", mode = "wb")
unzip("enron_emails.zip", exdir = "enron_emails")

#'
## ------------------------------------------------------------------------
length(list.files("enron_emails/enron_sample_emails"))

#'
#'
#' Reading in text files
#' ===
#'
#' Make a corpus where each document is an email in the Enron subsample:
#'
## ----cache=TRUE------------------------------------------------------------
enron_corpus <- VCorpus(DirSource(directory = "enron_emails/enron_sample_emails",
                                  mode = "text"))
as.character(enron_corpus[[3]])

#'
#' Transformations (maps)
#' ===
#'
#' Let's change to lowercase, remove "stopwords" and header terms, remove punctuation, numbers, and whitespace, and "stem" the words:
#'
## ----cache=TRUE------------------------------------------------------------
# install.packages("SnowballC") # may solve errors
enron_stripped <- enron_corpus %>%
    tm_map(content_transformer(str_to_lower)) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(removeWords, c("javamail.evans@thyme", "message-id", "date",
                          "subject", "mime-version", "content-type",
                          "text/plain", "charset=us-ascii",
                          "content-transfer-encoding", "x-", "x-cc", "x-bcc",
                          "x-folder", "x-origin", "x-filename")) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)

#'
#' Word clouds
#' ===
#'
## ---- fig.width = 10, fig.height = 4, dpi=300, out.width="1100px", out.height="440px"----
library(wordcloud)
wordcloud(enron_stripped, min.freq = 2, max.words = 80)

#'
#'
#' Filtering to emails with California
#' ===
#'
#' We'll write a function that takes the content of the documents and looks for any instance of `"california"`, then use it with `tm_filter`:
#'
## ---- cache=TRUE-----------------------------------------------------------
doc_word_search <- function(x, pattern) {
    any(str_detect(content(x), pattern = pattern))
}
cali_emails <- enron_stripped %>%
    tm_filter(doc_word_search, pattern = "california")
length(cali_emails)

#'
#'
#' Term-Document Matrices
#' ===
#'
#' We can look for patterns across the documents by constructing a `TermDocumentMatrix`:
#'
## ---- cache=TRUE-----------------------------------------------------------
enron_tdm <- TermDocumentMatrix(enron_stripped)
str(enron_tdm)

#'
#' What does the matrix look like?
#' ===
#'
#' Too big to view at once, but we can look at snippets with `inspect`:
#'
## ------------------------------------------------------------------------
inspect(enron_tdm[1:5, 1:5])

#'
#' Removing sparse words
#' ===
#'
#' We could focus on words that appear in at least 40% of documents, i.e. drop terms whose sparsity exceeds 0.60:
#'
## ------------------------------------------------------------------------
enron_tdm_sparse <- removeSparseTerms(enron_tdm, 0.60)
inspect(enron_tdm_sparse)

#'
#'
#' Favorite dictionary words
#' ===
#'
#' Or we can make a term-document matrix focusing on words in a dictionary and look at just those columns:
#'
## ------------------------------------------------------------------------
inspect(TermDocumentMatrix(enron_stripped,
                           list(dictionary = c("california", "utah", "texas")))[, 1:5])

#'
#'
#' Most frequent words
#' ===
#'
#' Which terms appear at least 200 times?
#'
## ------------------------------------------------------------------------
findFreqTerms(enron_tdm, 200)
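#'
#' To see how often each of those terms actually occurs, one could total the rows of the term-document matrix -- a minimal sketch; it assumes the matrix is small enough to densify with `as.matrix`:
#'
## ----freq_term_counts, eval=FALSE--------------------------------------------
# hedged sketch: total count of each frequent term across all emails
frequent_terms <- findFreqTerms(enron_tdm, 200)
term_totals <- rowSums(as.matrix(enron_tdm))
sort(term_totals[frequent_terms], decreasing = TRUE)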
#'
#'
#' Word associations
#' ===
#'
#' Which words co-occur frequently with "california"?
#'
## ------------------------------------------------------------------------
findAssocs(enron_tdm, "california", 0.90)
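#'
#' As a closing sketch, one could also compare the full corpus with just the California subset from the `tm_filter` step, for example by re-drawing the word cloud on `cali_emails` (a hedged example, not in the original slides):
#'
## ----cali_wordcloud, eval=FALSE, fig.width = 10, fig.height = 4--------------
# hedged sketch: word cloud restricted to the emails mentioning "california"
wordcloud(cali_emails, min.freq = 2, max.words = 50)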