---
title: "Untitled"
author: "Ben Marwick"
date: "March 30, 2015"
output: html_document
---


Additional topics for the AAAS text mining workshop

```{r}
# wordclouds
library("wordcloud")

#  words and their frequencies from DTM
m <- t(as.matrix(my_dtm_2_sparse_stopwords))
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)

windows()
# basic 
wordcloud(d$word,d$freq)
# with colours
wordcloud(d$word,d$freq, colors = brewer.pal(6,"Dark2"), random.order = FALSE)
```

```{r}
# cluster analysis

# find groups of respondants using similar words
windows()
plot(hclust(dist(my_dtm_2_sparse_stopwords)))

library(dplyr)
my_dtm_2_sparse_stopwords %>%
  dist %>%
  hclust %>%
  plot

# find groups of words that are used together
plot(hclust(dist(t(my_dtm_2_sparse_stopwords))))
```


```{r}

# topic models
library(topicmodels)

# adapted from http://stackoverflow.com/questions/16004847/visualise-distances-between-texts/16010600

# Find the sum of words in each Document
rowTotals <- apply(my_dtm_2_sparse_stopwords , 1, sum) 

# remove all docs without words
my_dtm_sparse_stopwords_   <- my_dtm_2_sparse_stopwords[rowTotals > 0, ]           

# Generate topic model
k <- 10
my_topics <- LDA(my_dtm_sparse_stopwords_, k)

# top five words per topic
Terms <- terms(my_topics, 5)

# top five topics per document
topics(my_topics, 5)

# visualise network graph
# make data frame where rows are documents, columns are topics and cells 
# are posterior probabilities of topics
my_topic_df <- setNames(as.data.frame(my_topics@gamma),  paste0("topic_",1:k))

#### Euclidean distance matrix
library(cluster)
my_topic_df_dist <-  as.matrix(daisy(my_topic_df, metric =  "euclidean", stand = TRUE))
# Change row values to zero if less than row minimum plus row standard deviation
# This is how Jockers subsets the distance matrix to keep only 
# closely related documents and avoid a dense spagetti diagram 
# that's difficult to interpret (hat-tip: http://stackoverflow.com/a/16047196/1036500)
my_topic_df_dist[ sweep(my_topic_df_dist, 1, (apply(my_topic_df_dist,1,min) + apply(my_topic_df_dist,1,sd) )) > 0 ] <- 0

#### network diagram using Fruchterman & Reingold algorithm (Jockers uses the ForceAtlas2 algorithm which is unique to Gephi)
library(igraph)
g <- as.undirected(graph.adjacency(my_topic_df_dist))
layout1 <- layout.fruchterman.reingold(g, niter=500)

windows()
plot(g, layout=layout1, edge.curved = TRUE, vertex.size = 1,  vertex.color= "grey", edge.arrow.size = 0.1, vertex.label.dist=0.5, vertex.label = NA)

# this line will export from R and make the file 'my.graphml' in your working directory ready to open with Gephi
write.graph(g, file="my.graphml", format="graphml")

```


```{r}
# reading PDFs

# Tell R what folder contains your 1000s of PDFs
dest <- "C:/Users/marwick/Desktop/bens_dc_clone/PDFs"
 
# make a vector of PDF file names
myfiles <- list.files(path = dest, pattern = ".pdf|.PDF", full.names = TRUE)

# https://gist.github.com/benmarwick/11333467
############### PDF (text format) to TXT ###################
 
                  ##### Wait! #####
# Before proceeding, make sure you have a copy of pdf2text
# on your computer! Details: https://en.wikipedia.org/wiki/Pdftotext
# Download: http://www.foolabs.com/xpdf/download.html
 
# If you have a PDF with text, ie. you can open the PDF in a 
# PDF viewer and select text with your curser, then use these 
# lines to convert each PDF file that is named in the vector 
# into text file is created in the same directory as the PDFs
# note that my pdftotext.exe is in a different location to yours
lapply(myfiles, function(i) system(paste('"C:/Program Files/xpdf/bin64/pdftotext.exe"', paste0('"', i, '"')), wait = FALSE) )
 
# where are the txt files you just made?
dest # in this folder
 
# And now you're ready to do some text mining on the text files
```

```{r}
# reading MS Word docs
library(tm)
readDOC(...) #  needs the tool antiword installed and accessible on your system

```


```{r}
# scraping text from the web
library(rvest)
library(dplyr)

webpage <- html(...)

# identify elements on the page... using selectorgadget
text_node <- ".yt-lockup-meta-info li:nth-child(1)"


# extract text on page
my_web_text <- webpage %>% 
  html_nodes(viewtext_nodes_node) %>%
  html_text() 


```