---
output:
  md_document:
    variant: markdown_github
---

# Advanced Analysis of Texts

### Kenneth Benoit
### 28 March 2018

```{r, echo = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE
)
```
In this section we will explore both the texts and the metadata from a corpus of tweets retrieved from the Twitter API. The tweets are a small sample from a collection relating to the 2014 European Parliament elections.

Load the data frame containing the sample tweets:

```{r}
require(quanteda)
load("tweetSample.RData")
str(tweetSample)
```
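
As a quick check of the metadata (a minimal sketch, assuming `created_at` is already a date-time column, as the next chunk also assumes), we can inspect the sample's size and time span:

```{r}
# number of tweets and the date range covered by the sample
nrow(tweetSample)
range(tweetSample$created_at)
```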


Next, we use lubridate and dplyr to add date variables derived from each tweet's timestamp, and then subset the tweets mentioning each of the three lead candidates ("Spitzenkandidaten"):

```{r}
require(lubridate)
require(dplyr)
# day of the year and calendar date of each tweet
tweetSample <- mutate(tweetSample, day = yday(created_at))
tweetSample <- mutate(tweetSample, dayDate = as.Date(day - 1, origin = "2014-01-01"))
# tag tweets mentioning each candidate with a "kand" label
juncker <- filter(tweetSample, grepl('juncker', text, ignore.case = TRUE)) %>% 
    mutate(kand = 'Juncker')
schulz <-  filter(tweetSample, grepl('schulz', text, ignore.case = TRUE)) %>% 
    mutate(kand = 'Schulz')
verhof <-  filter(tweetSample, grepl('verhofstadt', text, ignore.case = TRUE)) %>% 
    mutate(kand = 'Verhofstadt')
spitzAll <- bind_rows(juncker, schulz, verhof)
```
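
A quick tabulation shows how many of the sampled tweets mention each candidate:

```{r}
# number of sampled tweets mentioning each candidate
count(spitzAll, kand)
```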

Once the data is in the correct format, we can use ggplot2 to display the candidate mentions in a single plot:


```{r}
require(ggplot2)
require(scales)
# candidate name mentions over time
plotDf <- count(spitzAll, kand, day = day) %>% 
    mutate(day = as.Date(day - 1, origin = "2014-01-01"))

# dashed vertical lines mark 15 May 2014 (the televised presidential debate)
# and 25 May 2014 (the main election day)
ggplot(data = plotDf, aes(x = day, y = n, colour = kand)) + 
    geom_line(size = 1) +
    scale_y_continuous(labels = comma) + 
    geom_vline(xintercept = as.numeric(as.Date("2014-05-15")), linetype = 4) +
    geom_vline(xintercept = as.numeric(as.Date("2014-05-25")), linetype = 4) +
    theme(axis.text = element_text(size = 12),
          axis.title = element_text(size = 14, face = "bold"))
```
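
The same data can also be displayed in one panel per candidate; a sketch of an alternative view using ggplot2's `facet_wrap()`:

```{r}
# one panel per candidate instead of overlaid lines
ggplot(data = plotDf, aes(x = day, y = n)) +
    geom_line(size = 1) +
    facet_wrap(~ kand, ncol = 1) +
    scale_y_continuous(labels = comma)
```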


We can use `dfm_select()` with a regular expression pattern to keep only the hashtags in each candidate's tweets.
```{r}
# Top hashtags for tweets that mention Juncker
dv <- data.frame(user = juncker$user_screen_name)
jCorp <- corpus(juncker$text, docvars = dv)
jd <- dfm(jCorp)
jd <- dfm_select(jd, "^#.+", "keep", valuetype = "regex") 
# equivalent: jd <- dfm_select(jd, "#*", "keep", valuetype = "glob") 
topfeatures(jd, nfeat(jd))  # all remaining features are hashtags
```
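
The same selection extends to all three candidates at once, using the `kand` label created earlier; a sketch, assuming a quanteda version in which `topfeatures()` accepts a `groups` argument:

```{r}
# top hashtags per candidate, from the combined spitzAll data
spitzCorp <- corpus(spitzAll$text, docvars = data.frame(kand = spitzAll$kand))
spitzDfm <- dfm_select(dfm(spitzCorp), "^#.+", "keep", valuetype = "regex")
topfeatures(spitzDfm, n = 5, groups = "kand")
```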


## Further analysis examples

Wordscores, fitted to the corpus of amicus curiae briefs:
```{r}
data(data_corpus_amicus, package = "quanteda.corpora")
# convert the two training classes to reference scores of -1 and +1
refs <- docvars(data_corpus_amicus, "trainclass")
refs <- (as.numeric(refs) - 1.5) * 2
amicusDfm <- dfm(data_corpus_amicus)
wm <- textmodel_wordscores(amicusDfm, y = refs)
summary(wm)
# predict positions for all briefs, including the unlabelled test documents
preds <- predict(wm, newdata = amicusDfm)
summary(preds)
plot(preds ~ docvars(amicusDfm, "testclass"),
     horizontal = TRUE, xlab = "Predicted document score",
     ylab = "Test class", las = 1)
```
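
The raw predictions can be rescaled as well; a sketch using the Laver-Benoit-Garry rescaling, assuming a quanteda version whose `predict()` method supports the `rescaling` argument:

```{r}
# LBG-rescaled predicted positions
predsLBG <- predict(wm, newdata = amicusDfm, rescaling = "lbg")
summary(predsLBG)
```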

Correspondence analysis, applied to the speeches from the 2010 Irish budget debate:
```{r, fig.width = 8, fig.height = 5}
dfm(data_corpus_irishbudget2010) %>%
    textmodel_ca() %>% 
    textplot_scale1d()
```
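
The document positions can also be extracted from the fitted CA object directly; a sketch, assuming the fitted object exposes the standard `ca` components:

```{r}
ieCA <- textmodel_ca(dfm(data_corpus_irishbudget2010))
# first-dimension positions of the first few speakers
head(ieCA$rowcoord[, 1])
```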

Poisson scaling ("wordfish"), applied to the same corpus:
```{r, fig.width = 8, fig.height = 5}
# dir = c(6, 5) fixes the polarity: document 6 is constrained to score
# below document 5
ieWF <- dfm(data_corpus_irishbudget2010, remove_punct = TRUE) %>%
    textmodel_wordfish(dir = c(6, 5))
summary(ieWF)
textplot_scale1d(ieWF)
```
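
`textplot_scale1d()` can also plot the estimated word parameters instead of the document positions; a sketch highlighting a few (hypothetically chosen) words:

```{r, fig.width = 8, fig.height = 5}
# feature-level estimates, with selected words highlighted
textplot_scale1d(ieWF, margin = "features",
                 highlighted = c("government", "taxation", "budget"))
```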


Topic models, fitted using the topicmodels package:
```{r}
require(topicmodels)
mycorpus <- corpus_subset(data_corpus_inaugural, Year > 1950)
# remove stopwords plus some very frequent, topic-neutral terms
quantdfm <- dfm(mycorpus, verbose = FALSE, remove_punct = TRUE,
                remove = c(stopwords('english'), 'will', 'us', 'nation', 'can', 'peopl*', 'americ*'))
# convert to the format expected by topicmodels, then fit a 20-topic LDA
ldadfm <- convert(quantdfm, to = "topicmodels")
lda <- LDA(ldadfm, control = list(alpha = 0.1), k = 20)
terms(lda, 10)
```
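
`topicmodels` also provides accessor functions for the fitted model; for example, `topics()` returns each document's single most likely topic:

```{r}
# most likely topic for each post-1950 inaugural address
topics(lda)
```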