---
output:
  md_document:
    variant: markdown_github
---

# Advanced Analysis of Texts

### Kenneth Benoit

### 28 March 2018

```{r, echo = FALSE}
knitr::opts_chunk$set(collapse = TRUE)
```

In this section we will explore some analysis of texts, and of metadata, from a corpus of tweets retrieved from the Twitter API. The tweets are a small sample from a collection relating to the European Parliament elections of 2014.

Load the data frame containing the sample tweets:

```{r}
require(quanteda)
load("tweetSample.RData")
str(tweetSample)
```

Next, convert the tweet timestamps into days of the year and calendar dates, and subset the tweets mentioning each of the three candidates:

```{r}
require(lubridate)
require(dplyr)
tweetSample <- mutate(tweetSample, day = yday(created_at))
tweetSample <- mutate(tweetSample, dayDate = as.Date(day - 1, origin = "2014-01-01"))
juncker <- filter(tweetSample, grepl("juncker", text, ignore.case = TRUE)) %>%
    mutate(kand = "Juncker")
schulz <- filter(tweetSample, grepl("schulz", text, ignore.case = TRUE)) %>%
    mutate(kand = "Schulz")
verhof <- filter(tweetSample, grepl("verhofstadt", text, ignore.case = TRUE)) %>%
    mutate(kand = "Verhofstadt")
spitzAll <- bind_rows(juncker, schulz, verhof)
```

Once the data is in the correct format, we can use ggplot2 to display the candidate mentions on a single plot:

```{r}
require(ggplot2)
require(scales)
# mentions of candidate names over time
plotDf <- count(spitzAll, kand, day = day) %>%
    mutate(day = as.Date(day - 1, origin = "2014-01-01"))
ggplot(data = plotDf, aes(x = day, y = n, colour = kand)) +
    geom_line(size = 1) +
    scale_y_continuous(labels = comma) +
    geom_vline(xintercept = as.numeric(as.Date("2014-05-15")), linetype = 4) +
    geom_vline(xintercept = as.numeric(as.Date("2014-05-25")), linetype = 4) +
    theme(axis.text = element_text(size = 12),
          axis.title = element_text(size = 14, face = "bold"))
```

We can use `dfm_select()` to keep only the hashtags from the tweets mentioning each candidate, and then tabulate the most frequent ones.

```{r}
# top hashtags for tweets that mention Juncker
dv <- data.frame(user = juncker$user_screen_name)
jCorp <- corpus(juncker$text, docvars = dv)
jd <- dfm(jCorp)
jd <- dfm_select(jd, "^#.+", selection = "keep", valuetype = "regex")
# equivalent: jd <- dfm_select(jd, "#*", selection = "keep", valuetype = "glob")
topfeatures(jd, nfeat(jd))
```

## Further analysis examples

Wordscores:

```{r}
data(data_corpus_amicus, package = "quanteda.corpora")
refs <- docvars(data_corpus_amicus, "trainclass")
refs <- (as.numeric(refs) - 1.5) * 2
amicusDfm <- dfm(data_corpus_amicus)
wm <- textmodel_wordscores(amicusDfm, y = refs)
summary(wm)
preds <- predict(wm, newdata = amicusDfm)
summary(preds)
plot(preds ~ docvars(amicusDfm, "testclass"),
     horizontal = TRUE,
     xlab = "Predicted document score",
     ylab = "Test class",
     las = 1)
```

Correspondence analysis:

```{r, fig.width = 8, fig.height = 5}
dfm(data_corpus_irishbudget2010) %>%
    textmodel_ca() %>%
    textplot_scale1d()
```

Poisson scaling (the "wordfish" model):

```{r, fig.width = 8, fig.height = 5}
ieWF <- dfm(data_corpus_irishbudget2010, remove_punct = TRUE) %>%
    textmodel_wordfish(dir = c(6, 5))
summary(ieWF)
textplot_scale1d(ieWF)
```

Topic models:

```{r}
require(topicmodels)
mycorpus <- corpus_subset(data_corpus_inaugural, Year > 1950)
quantdfm <- dfm(mycorpus, verbose = FALSE, remove_punct = TRUE,
                remove = c(stopwords("english"), "will", "us", "nation", "can",
                           "peopl*", "americ*"))
ldadfm <- convert(quantdfm, to = "topicmodels")
lda <- LDA(ldadfm, k = 20, control = list(alpha = 0.1))
terms(lda, 10)
```
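
As a follow-up, we can look beyond the top terms per topic and ask which topic is most prominent in each address. A minimal sketch, using the `topics()` and `posterior()` functions from the topicmodels package and assuming the `lda` object fitted above:

```{r}
# most likely topic for each post-1950 inaugural address
topics(lda)

# posterior topic proportions for the first five documents and first five topics
round(posterior(lda)$topics[1:5, 1:5], 2)
```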
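
Because `alpha = 0.1` is small, the fitted document-topic distributions tend to be sparse, so each address should concentrate most of its probability mass on a handful of the 20 topics.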