---
output:
  md_document:
    variant: markdown_github
---

# Descriptive Analysis of Texts

### Kenneth Benoit
### 24 April 2017

```{r, echo = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE
)
```

quanteda has a number of descriptive statistics available for reporting on texts. The **simplest of these** is through the `summary()` method:

```{r}
require(quanteda)
txt <- c(sent1 = "This is an example of the summary method for character objects.",
         sent2 = "The cat in the hat swung the bat.")
summary(txt)
```

This also works for corpus objects:

```{r}
summary(corpus(data_char_ukimmig2010, notes = "Created as a demo."))
```

To count the **syllables** in a text, we use `nsyllable()`:

```{r}
nsyllable(c("Superman.", "supercalifragilisticexpialidocious", "The cat in the hat."))
```

We can even compute the **Scrabble value** of English words, using `nscrabble()`:

```{r}
nscrabble(c("cat", "quixotry", "zoo"))
```

We can analyze the **lexical diversity** of texts, using `textstat_lexdiv()` on a dfm:

```{r}
myDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980))
lexstat <- textstat_lexdiv(myDfm, "R")
lexstat <- lexstat[order(lexstat[["R"]]), ]
with(lexstat, dotchart(R, labels = document))
```

We can analyze the **readability** of texts, using `textstat_readability()` on a vector of texts or a corpus:

```{r}
readab <- textstat_readability(corpus_subset(data_corpus_inaugural, Year > 1980),
                               measure = "Flesch.Kincaid")
readab <- readab[order(readab[["Flesch.Kincaid"]]), ]
with(readab, dotchart(Flesch.Kincaid, labels = document))
```

We can **identify documents and terms that are similar to one another**, using `textstat_simil()` and `textstat_dist()`:

```{r}
## Presidential Inaugural Address Corpus
presDfm <- corpus_subset(data_corpus_inaugural, Year > 1980) %>%
    dfm(remove = stopwords("english"), remove_punct = TRUE)

# compute some document similarities
textstat_simil(presDfm, "1985-Reagan")
textstat_simil(presDfm, c("2009-Obama", "2013-Obama"), method = "cosine")
textstat_dist(presDfm, c("2009-Obama", "2013-Obama"), method = "canberra")
textstat_dist(presDfm, c("2009-Obama", "2013-Obama"), method = "ejaccard")

# compute some term similarities
lapply(as.list(textstat_simil(presDfm, c("fair", "health", "terror"),
                              margin = "features", method = "cosine")),
       head, n = 10)
```

And this can be used for **clustering documents**:

```{r, fig.height=6, fig.width=10}
data(data_corpus_sotu, package = "quanteda.corpora")
presDfm <- dfm(corpus_subset(data_corpus_sotu, Date > "1990-01-01"),
               stem = TRUE, remove = c(stopwords("english"), "applause"),
               remove_punct = TRUE)
presDfm <- dfm_trim(presDfm, min_termfreq = 5, min_docfreq = 3)
# hierarchical clustering - get distances on normalized dfm
presDistMat <- dist(as.matrix(dfm_weight(presDfm, "relFreq")))
# hierarchical clustering of the distance object
presCluster <- hclust(presDistMat)
# label with document names
presCluster$labels <- docnames(presDfm)
# plot as a dendrogram
plot(presCluster)
```

Or we could look at **term clustering** instead:

```{r, fig.height=8, fig.width=12}
# word dendrogram with tf-idf weighting
wordDfm <- dfm_sort(dfm_tfidf(presDfm))
wordDfm <- t(wordDfm)[1:100, ]  # keep the 100 most frequent features (rows, because transposed)
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)
plot(wordCluster, xlab = "", main = "tf-idf Frequency weighting")
```
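If we want discrete groups of terms rather than a dendrogram, we can cut the tree at a chosen number of clusters. Here is a minimal sketch using base R's `cutree()`; the choice of 5 clusters is arbitrary and purely illustrative:

```{r}
# cut the word dendrogram into 5 groups (an arbitrary choice)
wordGroups <- cutree(wordCluster, k = 5)
# how many terms fall into each group
table(wordGroups)
# group membership for the first few terms
head(wordGroups, 10)
```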
Finally, there are a number of helper functions to extract information from quanteda objects:

```{r, collapse = FALSE}
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)

# return the number of documents
ndoc(myCorpus)
ndoc(dfm(myCorpus))

# how many tokens (total words)
ntoken(myCorpus)
ntoken("How many words in this sentence?")
# additional tokenization arguments can be passed
ntoken("How many words in this sentence?", remove_punct = TRUE)

# how many types (unique words)
ntype(myCorpus)
ntype("Yada yada yada. (TADA.)")
ntype("Yada yada yada. (TADA.)", remove_punct = TRUE)
ntype(char_tolower("Yada yada yada. (TADA.)"), remove_punct = TRUE)

# can count documents and features
ndoc(data_corpus_inaugural)
myDfm1 <- dfm(data_corpus_inaugural)
ndoc(myDfm1)
nfeature(myDfm1)
myDfm2 <- dfm(data_corpus_inaugural, remove = stopwords("english"), stem = TRUE)
nfeature(myDfm2)
myDfm3 <- dfm(data_corpus_inaugural, remove = stopwords("english"),
              remove_punct = TRUE, stem = TRUE)
nfeature(myDfm3)

# can extract feature labels and document names
head(featnames(myDfm1), 20)
head(docnames(myDfm1))

# and topfeatures
topfeatures(myDfm1)
topfeatures(myDfm2)  # without stopwords
topfeatures(myDfm3)  # without stopwords or punctuation
```
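The named numeric vector returned by `topfeatures()` can also be passed directly to base R plotting functions. As a quick, optional illustration (not part of the original walkthrough), a bar plot of the 20 most frequent features from the dfm without stopwords or punctuation might look like this:

```{r}
# plot the 20 most frequent features from the cleaned dfm
barplot(topfeatures(myDfm3, 20), las = 2, cex.names = 0.8,
        main = "Top 20 features (no stopwords or punctuation)")
```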