---
output:
  md_document:
    variant: gfm
---

# Preparing texts for analysis

### Kenneth Benoit

### 24 April 2017

```{r, echo = FALSE}
knitr::opts_chunk$set(collapse = TRUE)
```

Here we will step through the basic elements of preparing a text for analysis. These are tokenization, conversion to lower case, stemming, removing or selecting features, and defining equivalency classes for features, including the use of dictionaries.

### 1. Tokenization

Tokenization in **quanteda** is very *conservative*: by default, it only removes separator characters.

```{r}
require(quanteda, quietly = TRUE, warn.conflicts = FALSE)
txt <- c(text1 = "This is $10 in 999 different ways,\n up and down; left and right!",
         text2 = "@kenbenoit working: on #quanteda 2day\t4ever, http://textasdata.com?page=123.")
tokens(txt)
tokens(txt, verbose = TRUE)
tokens(txt, remove_numbers = TRUE, remove_punct = TRUE)
tokens(txt, remove_numbers = FALSE, remove_punct = TRUE)
tokens(txt, remove_numbers = TRUE, remove_punct = FALSE)
tokens(txt, remove_numbers = FALSE, remove_punct = FALSE)
tokens(txt, remove_numbers = FALSE, remove_punct = FALSE, remove_separators = FALSE)
```

There are several options to the `what` argument:

```{r}
# sentence level
tokens(c("Kurt Vonnegut said; only assholes use semi-colons.",
         "Today is Thursday in Canberra: It is yesterday in London.",
         "Today is Thursday in Canberra: \nIt is yesterday in London.",
         "To be? Or\nnot to be?"),
       what = "sentence")
tokens(data_corpus_inaugural[2], what = "sentence")

# character level
tokens("My big fat text package.", what = "character")
tokens("My big fat text package.", what = "character", remove_separators = FALSE)
```

If performance is a key issue, two other options for really fast and simple tokenization are `"fastestword"` and `"fasterword"`. These are less intelligent than the boundary detection used in the default `"word"` method, which is based on stringi/ICU boundary detection.

### 2. Conversion to lower case

This is a tricky one in our workflow, since it is a form of equivalency declaration rather than a tokenization step. It turns out that it is more efficient to perform at the pre-tokenization stage. The **quanteda** `*_tolower` functions include options designed to preserve acronyms.

```{r}
test1 <- c(text1 = "England and France are members of NATO and UNESCO",
           text2 = "NASA sent a rocket into space.")
char_tolower(test1)
char_tolower(test1, keep_acronyms = TRUE)

test2 <- tokens(test1, remove_punct = TRUE)
tokens_tolower(test2)
tokens_tolower(test2, keep_acronyms = TRUE)
```

The **quanteda** `*_tolower` functions are based on `stringi::stri_trans_tolower()`, and are therefore nicely Unicode-compliant.

```{r}
data(data_char_encodedtexts, package = "readtext")

# Russian
cat(iconv(data_char_encodedtexts[8], "windows-1251", "UTF-8"))
cat(tolower(iconv(data_char_encodedtexts[8], "windows-1251", "UTF-8")))
head(char_tolower(stopwords("russian")), 20)
head(char_toupper(stopwords("russian")), 20)

# Arabic
cat(iconv(data_char_encodedtexts[6], "ISO-8859-6", "UTF-8"))
cat(tolower(iconv(data_char_encodedtexts[6], "ISO-8859-6", "UTF-8")))
head(char_toupper(stopwords("arabic")), 20)
```

**Note**: `dfm()`, the Swiss army knife, converts to lower case by default, but this can be turned off using the `tolower = FALSE` argument.
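For example, a minimal illustration using the `test1` object created above:

```{r}
# dfm() lowercases features by default; tolower = FALSE preserves case
dfm(test1)
dfm(test1, tolower = FALSE)
```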
### 3. Removing and selecting features

This can be done when creating a dfm:

```{r}
# with English stopwords and stemming
dfmsInaug2 <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980),
                  remove = stopwords("english"), stem = TRUE)
dfmsInaug2
head(dfmsInaug2)
```

Or it can be done **after** creating a dfm:

```{r}
myDfm <- dfm(c("My Christmas was ruined by your opposition tax plan.",
               "Does the United_States or Sweden have more progressive taxation?"),
             tolower = FALSE, verbose = TRUE)
dfm_select(myDfm, pattern = c("s$", ".y"), selection = "keep", valuetype = "regex")
dfm_select(myDfm, pattern = c("s$", ".y"), selection = "remove", valuetype = "regex")
dfm_select(myDfm, pattern = stopwords("english"), selection = "keep", valuetype = "fixed")
dfm_select(myDfm, pattern = stopwords("english"), selection = "remove", valuetype = "fixed")
```

More examples:

```{r}
# removing stopwords
testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with the newspaper from a boy named Seamus, in his mouth."
testCorpus <- corpus(testText)
# note: "also" is not in the default stopwords("english")
featnames(dfm(testCorpus, remove = stopwords("english")))

# for ngrams
featnames(dfm(testCorpus, ngrams = 2, remove = stopwords("english")))
featnames(dfm(testCorpus, ngrams = 1:2, remove = stopwords("english")))

## removing stopwords before constructing ngrams
tokensAll <- tokens(tolower(testText), remove_punct = TRUE)
tokensNoStopwords <- tokens_remove(tokensAll, stopwords("english"))
tokensNgramsNoStopwords <- tokens_ngrams(tokensNoStopwords, 2)
featnames(dfm(tokensNgramsNoStopwords, verbose = FALSE))

# keep only certain words
dfm(testCorpus, select = "*s", verbose = FALSE)  # keep only words ending in "s"
dfm(testCorpus, select = "s$", valuetype = "regex", verbose = FALSE)

# testing Twitter functions
testTweets <- c("My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
                "2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
                "Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber")
dfm(testTweets, select = "#*", remove_twitter = FALSE)  # keep only hashtags
dfm(testTweets, select = "^#.*$", valuetype = "regex", remove_twitter = FALSE)
```

One very nice feature is the ability to create a new dfm with the same feature set as an existing one. This is very useful if, for instance, we train a model on one dfm and need to predict on counts from another, but need the feature sets to be equivalent.

```{r}
# selecting on a dfm
textVec1 <- c("This is text one.", "This, the second text.", "Here: the third text.")
textVec2 <- c("Here are new words.", "New words in this text.")
featnames(dfm1 <- dfm(textVec1))
featnames(dfm2a <- dfm(textVec2))
(dfm2b <- dfm_select(dfm2a, dfm1))
identical(featnames(dfm1), featnames(dfm2b))
```

### 4. Applying equivalency classes: dictionaries, thesauruses

Dictionary creation is done through the `dictionary()` function, which classes a named list of character vectors as a dictionary.
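For instance, a small dictionary can be built directly from a named list; the category names and terms in this sketch are purely illustrative:

```{r}
# a toy dictionary defined from a named list (illustrative categories only)
mySmallDict <- dictionary(list(geography = c("england", "france", "london"),
                               economy = c("tax*", "budget*", "spend*")))
mySmallDict
```

Dictionaries can also be imported from files in external formats, as in the examples below.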
```{r}
# import the Laver-Garry dictionary from http://bit.ly/1FH2nvf
lgdict <- dictionary(file = "LaverGarry.cat")
budgdfm <- dfm(data_corpus_irishbudget2010, dictionary = lgdict, verbose = TRUE)
head(budgdfm)

# import a LIWC formatted dictionary
liwcdict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2015_English_Flat.dic")
dictdfm <- dfm(data_corpus_inaugural, dictionary = liwcdict, verbose = TRUE)
dictdfm[50:58, c("money", "power")]
```

We apply dictionaries to a dfm using the `dfm_lookup()` function. Through the `valuetype` argument, we can match patterns of one of three types: `"glob"`, `"regex"`, or `"fixed"`.

```{r}
myDict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
                          opposition = c("Opposition", "reject", "notincorpus"),
                          taxglob = "tax*",
                          taxregex = "tax.+$",
                          country = c("United_States", "Sweden")))
myDfm <- dfm(c("My Christmas was ruined by your opposition tax plan.",
               "Does the United_States or Sweden have more progressive taxation?"),
             remove = stopwords("english"), verbose = FALSE)
myDfm

# glob format
dfm_lookup(myDfm, myDict, valuetype = "glob")
dfm_lookup(myDfm, myDict, valuetype = "glob", case_insensitive = FALSE)

# regex v. glob format: note that "united_states" is a regex match for "tax*"
dfm_lookup(myDfm, myDict, valuetype = "glob")
dfm_lookup(myDfm, myDict, valuetype = "regex", case_insensitive = TRUE)

# fixed format: no pattern matching
dfm_lookup(myDfm, myDict, valuetype = "fixed")
dfm_lookup(myDfm, myDict, valuetype = "fixed", case_insensitive = FALSE)
```

It is also possible to apply a dictionary at the time of `dfm()` creation.

```{r}
# dfm with dictionaries
mycorpus <- corpus_subset(data_corpus_inaugural, Year > 1900)
mydict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
                          opposition = c("Opposition", "reject", "notincorpus"),
                          taxing = "taxing",
                          taxation = "taxation",
                          taxregex = "tax*",
                          country = "united states"))
dictDfm <- dfm(mycorpus, dictionary = mydict)
head(dictDfm)
```

Finally, there is a related "thesaurus" feature, which collapses terms into dictionary keys but is not exclusive: features not matched by the dictionary are retained.

```{r}
mytexts <- c("British English tokenises differently, with more colour.",
             "American English tokenizes color as one word.")
mydict <- dictionary(list(color = "colo*r", tokenize = "tokeni?e*"))
dfm(mytexts, thesaurus = mydict)
```

### 5. Stemming

Stemming relies on the `SnowballC` package's implementation of the Porter/Snowball stemmers, and is available for the following languages:

```{r}
SnowballC::getStemLanguages()
```

It's not perfect:

```{r}
char_wordstem(c("win", "winning", "wins", "won", "winner"))
```

but it's fast.

Objects to be stemmed must be tokenized (note the error below when `char_wordstem()` is applied to a multi-word string), but stemming functions exist for several different quanteda classes:

```{r, error = TRUE}
txt <- "From 10k packages, quanteda is a text analysis package, for analysing texts."
char_wordstem(txt)
(toks <- tokens_wordstem(tokens(txt, remove_punct = TRUE)))
dfm_wordstem(dfm(toks))
```

Some text operations can be conducted directly on the dfm:

```{r, collapse = FALSE}
dfm1 <- dfm(data_corpus_inaugural[1:2], verbose = FALSE)
head(featnames(dfm1))
head(featnames(dfm_wordstem(dfm1)))
# same as
head(dfm(data_corpus_inaugural[1:2], stem = TRUE, verbose = FALSE))
```

### 6. `dfm()` and its many options

`dfm()` operates on `character` vectors, `corpus` objects, or `tokens` objects:

```{r, eval = FALSE}
dfm(x, tolower = TRUE, stem = FALSE, select = NULL, remove = NULL,
    thesaurus = NULL, dictionary = NULL, valuetype = c("glob", "regex", "fixed"),
    groups = NULL, verbose = quanteda_options("verbose"), ...)
```
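Several of these options can be combined in a single call. A minimal sketch, assuming we want to group the post-1980 inaugural speeches by the `President` document variable:

```{r}
# subset, lowercase, stem, remove stopwords, and group documents by president
dfm(corpus_subset(data_corpus_inaugural, Year > 1980),
    tolower = TRUE, stem = TRUE,
    remove = stopwords("english"),
    groups = "President",
    verbose = TRUE)
```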