# Data Import and Preprocessing

In this notebook, we demonstrate how to read text data in R, tokenize texts and create a document-term matrix.

We start by loading the required dependencies.

In [None]:
# Attach the packages used throughout the notebook. library() stops with an
# error when a package is missing, whereas require() only returns FALSE and
# lets the failure surface later as a "could not find function" error.
library(quanteda)
library(magrittr)

The corpus we will work with is the Carnegie Mellon University 2008 Political Blog Corpus, a collection of blog posts about American politics written in 2008 ([Eisenstein & Xing 2010](http://www.sailing.cs.cmu.edu/main/socialmedia/blog2008.pdf)).

In [None]:
# Read the blog posts from CSV. Strings are kept as character vectors rather
# than factors; TRUE/FALSE are spelled out because T and F can be reassigned.
poliblogs2008 <- read.csv(
  "data/poliblogs2008.csv",
  header = TRUE,
  sep = ",",
  quote = "\"",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)
head(poliblogs2008, 2) # inspect the first 2 documents

In [None]:
# Count the posts per distinct value of the `rating` column.
table(poliblogs2008[["rating"]])

In [None]:
# Count the posts per distinct value of the `blog` column.
table(poliblogs2008[["blog"]])

In [None]:
# Build a quanteda corpus; the text of each document is taken from the
# `documents` column of the data frame.
data_corpus <- poliblogs2008 %>%
  corpus(text_field = "documents")

In [None]:
# Tokenize the corpus, drop punctuation, numbers and symbols, lowercase the
# remaining tokens and build the document-feature matrix from them.
DTM.1 <- data_corpus %>%
  tokens() %>%
  tokens(
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE
  ) %>%
  tokens_tolower() %>%
  dfm()
DTM.1

## Simple Frequency Analysis

In [None]:
# Frequency table over all features of the document-term matrix.
wordlist <- DTM.1 %>% textstat_frequency()
head(wordlist, n = 20)

In [None]:
# Feature frequencies in table order, drawn as a line on linear axes.
plot(
  wordlist$frequency,
  type = "l",
  lwd = 2,
  main = "Rank frequency Plot",
  xlab = "Rank",
  ylab = "Frequency"
)

In [None]:
# Same curve with both axes on a logarithmic scale.
plot(
  wordlist$frequency,
  type = "l",
  log = "xy",
  lwd = 2,
  main = "Rank-Frequency Plot",
  xlab = "log-Rank",
  ylab = "log-Frequency"
)

In [None]:
# Row positions in `wordlist` of the features to discard: English stopwords
# plus features that occur fewer than 10 times in total.
stopw_idx <- which(wordlist$feature %in% stopwords("en"))
low_frequent_idx <- which(wordlist$frequency < 10)
trash_idx <- union(stopw_idx, low_frequent_idx)

# Everything that is not trash is kept. seq_len() is used instead of
# 1:nrow(), which would produce c(1, 0) for an empty frequency table.
vocab_idx <- setdiff(seq_len(nrow(wordlist)), trash_idx)

In [None]:
# Log-log frequency curve of all features ...
plot(
  wordlist$frequency,
  type = "l",
  log = "xy",
  lwd = 2,
  main = "Rank-Frequency plot",
  xlab = "Rank",
  ylab = "Frequency"
)
# ... with the retained vocabulary marked as green points on top of it.
lines(
  vocab_idx,
  wordlist$frequency[vocab_idx],
  col = "green",
  lwd = 2,
  type = "p",
  pch = 20
)

In [None]:
# First 20 retained features. The explicit row index `[i, ]` is required:
# with a single index a data.frame selects columns, not rows.
head(wordlist[vocab_idx, ], 20)

In [None]:
# First 20 discarded features. The explicit row index `[i, ]` is required:
# with a single index a data.frame selects columns, not rows.
head(wordlist[trash_idx, ], 20)

In [None]:
# Drop the stopword / low-frequency features from the document-term matrix.
# The feature column is indexed directly (single-index `[` on a data.frame
# would select columns), and the names are matched literally so that a
# feature containing `*` or `?` is not interpreted as a glob pattern.
DTM.2 <- dfm_remove(
  DTM.1,
  pattern = wordlist$feature[trash_idx],
  valuetype = "fixed"
)
DTM.2

In [None]:
# Word cloud of the reduced document-term matrix, capped at 100 words.
DTM.2 %>% textplot_wordcloud(max_words = 100)

## Finding Important Words in a Document
**T**erm **F**requency–**I**nverse **D**ocument **F**requency (**TF-IDF**) is intended to reflect the importance of a word in a document.

In [None]:
# Pick one document by its `docname` and print its text.
some_docname <- "at0800300_2.text"
is_selected_doc <- poliblogs2008$docname == some_docname
print(poliblogs2008[is_selected_doc, ]$documents)

In [None]:
# Inverse document frequency per term:
#   log2(number of documents / number of documents containing the term)
number_of_docs <- dim(DTM.2)[1]
term_in_docs <- colSums(DTM.2 > 0) # documents in which each term occurs
idf <- log2(number_of_docs / term_in_docs)

In [None]:
# Term frequencies of the selected document. The row is located through the
# `docname` document variable carried by the DTM itself, so the lookup does
# not depend on the CSV's unnamed index column (`X`) being a 1-based row
# position aligned with the DTM.
tf <- as.vector(DTM.2[which(docvars(DTM.2)$docname == some_docname), ])
# Exactly one document must match; otherwise tf * idf would silently recycle
# (several matches) or be empty (no match).
stopifnot(length(tf) == ncol(DTM.2))

In [None]:
# TF-IDF weight of every term in the selected document, labelled with the
# term and listed from most to least important.
tf_idf <- tf * idf
names(tf_idf) <- colnames(DTM.2)
head(sort(tf_idf, decreasing = TRUE), 10)

## Working with Dictionaries

In [None]:
# Entries of the `positive` and `negative` keys of the LSD2015 dictionary.
positive_terms <- data_dictionary_LSD2015[["positive"]]
negative_terms <- data_dictionary_LSD2015[["negative"]]

In [None]:
# Features of the DTM that match a dictionary entry. The LSD2015 entries are
# glob patterns (e.g. "abandon*"), so they are matched with
# valuetype = "glob"; a plain intersect() with the feature names would
# silently ignore every wildcard entry.
positive_terms_in_suto <- colnames(
  dfm_select(DTM.2, pattern = positive_terms, valuetype = "glob")
)
# Number of positive term occurrences per document.
counts_positive <- rowSums(DTM.2[, positive_terms_in_suto])

negative_terms_in_suto <- colnames(
  dfm_select(DTM.2, pattern = negative_terms, valuetype = "glob")
)
# Number of negative term occurrences per document.
counts_negative <- rowSums(DTM.2[, negative_terms_in_suto])

In [None]:
# Total number of term occurrences per document (row sums of the DTM).
counts_all_terms <- rowSums(DTM.2)

# Share of positive and negative term occurrences per document.
relative_sentiment_frequencies <- data.frame(
  docname = docvars(DTM.2)[["docname"]],
  positive = counts_positive / counts_all_terms,
  negative = counts_negative / counts_all_terms
)

In [None]:
# Show the first five rows of the sentiment table.
head(relative_sentiment_frequencies, n = 5)

In [None]:
# save(DTM.2, file = "data/DTM.2.RData")