---
output:
  md_document:
    variant: markdown_github
---

# Descriptive Analysis of Texts

### Kenneth Benoit
### 24 April 2017

```{r, echo = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE
)
```

quanteda has a number of descriptive statistics available for reporting on texts. The **simplest of these** is through the `summary()` method:

```{r}
require(quanteda)
txt <- c(sent1 = "This is an example of the summary method for character objects.",
         sent2 = "The cat in the hat swung the bat.")
summary(txt)
```

This also works for corpus objects:

```{r}
summary(corpus(data_char_ukimmig2010, notes = "Created as a demo."))
```

To count the **syllables** in a text, we use `nsyllable()`:

```{r}
nsyllable(c("Superman.", "supercalifragilisticexpialidocious", "The cat in the hat."))
```

We can even compute the **Scrabble value** of English words, using `nscrabble()`:

```{r}
nscrabble(c("cat", "quixotry", "zoo"))
```

We can analyze the **lexical diversity** of texts, using `textstat_lexdiv()` on a dfm:

```{r}
myDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980))
lexstat <- textstat_lexdiv(myDfm, "R")
lexstat <- lexstat[order(lexstat[["R"]]), ]
with(lexstat, dotchart(R, labels = document))
```

We can analyze the **readability** of texts, using `textstat_readability()` on a vector of texts or a corpus:

```{r}
readab <- textstat_readability(corpus_subset(data_corpus_inaugural, Year > 1980),
                               measure = "Flesch.Kincaid")
readab <- readab[order(readab[["Flesch.Kincaid"]]), ]
with(readab, dotchart(Flesch.Kincaid, labels = document))
```

We can **identify documents and terms that are similar to one another**, using `textstat_simil()` and `textstat_dist()`:

```{r}
## Presidential Inaugural Address Corpus
presDfm <- corpus_subset(data_corpus_inaugural, Year > 1980) %>%
    dfm(remove = stopwords("english"), remove_punct = TRUE)

# compute some document similarities
textstat_simil(presDfm, "1985-Reagan")
textstat_simil(presDfm, c("2009-Obama", "2013-Obama"), method = "cosine")
textstat_dist(presDfm, c("2009-Obama", "2013-Obama"), method = "canberra")
textstat_dist(presDfm, c("2009-Obama", "2013-Obama"), method = "ejaccard")

# compute some term similarities
lapply(as.list(textstat_simil(presDfm, c("fair", "health", "terror"),
                              margin = "features", method = "cosine")),
       head, n = 10)
```

And this can be used for **clustering documents**:

```{r, fig.height=6, fig.width=10}
data(data_corpus_sotu, package = "quanteda.corpora")
presDfm <- dfm(corpus_subset(data_corpus_sotu, Date > "1990-01-01"),
               stem = TRUE, remove = c(stopwords("english"), "applause"),
               remove_punct = TRUE)
presDfm <- dfm_trim(presDfm, min_termfreq = 5, min_docfreq = 3)
# hierarchical clustering - get distances on normalized dfm
presDistMat <- dist(as.matrix(dfm_weight(presDfm, "relFreq")))
# hierarchical clustering of the distance object
presCluster <- hclust(presDistMat)
# label with document names
presCluster$labels <- docnames(presDfm)
# plot as a dendrogram
plot(presCluster)
```

Or we could look at **term clustering** instead:

```{r, fig.height=8, fig.width=12}
# word dendrogram with tf-idf weighting
wordDfm <- dfm_sort(dfm_tfidf(presDfm))
wordDfm <- t(wordDfm)[1:100, ]  # keep the 100 most frequent features (rows, because transposed)
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)
plot(wordCluster, xlab = "", main = "tf-idf Frequency weighting")
```
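If we want discrete groups of terms rather than a dendrogram, we can cut the tree at a chosen number of clusters. Here is a minimal sketch using base R's `cutree()`; the choice of 5 clusters is arbitrary and purely illustrative:

```{r}
# cut the word dendrogram into 5 groups (an arbitrary choice)
wordGroups <- cutree(wordCluster, k = 5)
# how many terms fall into each group
table(wordGroups)
# group membership for the first few terms
head(wordGroups, 10)
```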
Finally, there are a number of helper functions to extract information from quanteda objects:

```{r, collapse = FALSE}
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)

# return the number of documents
ndoc(myCorpus)
ndoc(dfm(myCorpus))

# how many tokens (total words)
ntoken(myCorpus)
ntoken("How many words in this sentence?")
# additional tokenization arguments can be passed
ntoken("How many words in this sentence?", remove_punct = TRUE)

# how many types (unique words)
ntype(myCorpus)
ntype("Yada yada yada. (TADA.)")
ntype("Yada yada yada. (TADA.)", remove_punct = TRUE)
ntype(char_tolower("Yada yada yada. (TADA.)"), remove_punct = TRUE)

# can count documents and features
ndoc(data_corpus_inaugural)
myDfm1 <- dfm(data_corpus_inaugural)
ndoc(myDfm1)
nfeature(myDfm1)
myDfm2 <- dfm(data_corpus_inaugural, remove = stopwords("english"), stem = TRUE)
nfeature(myDfm2)
myDfm3 <- dfm(data_corpus_inaugural, remove = stopwords("english"),
              remove_punct = TRUE, stem = TRUE)
nfeature(myDfm3)

# can extract feature labels and document names
head(featnames(myDfm1), 20)
head(docnames(myDfm1))

# and topfeatures
topfeatures(myDfm1)
topfeatures(myDfm2)  # without stopwords
topfeatures(myDfm3)  # without stopwords or punctuation
```
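The named numeric vector returned by `topfeatures()` can also be passed directly to base R plotting functions. As a quick, optional illustration (not part of the original walkthrough), a bar plot of the 20 most frequent features from the dfm without stopwords or punctuation might look like this:

```{r}
# plot the 20 most frequent features from the cleaned dfm
barplot(topfeatures(myDfm3, 20), las = 2, cex.names = 0.8,
        main = "Top 20 features (no stopwords or punctuation)")
```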