---
title: 'Lab 11: Text-as-Data'
author: "PPOL 670-01"
date: '2023-04-05'
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, eval = TRUE, warning = FALSE)
library(dplyr)
library(tidytext)
library(ggplot2)
library(rvest)
library(ggwordcloud)
```

## Proj. 1: Working with Climate Change Tweets

### Load Data

```{r loaddata}
# A collection of tweets mentioning "climate change"
## (Apologies in advance if there is any... profanity)
tweets <- read.csv('https://github.com/apodkul/ppol670_01/raw/main/Data/Climate_tweets.csv')

tweets %>%
  glimpse()
```

### Work with Stop Words and Tokenization

```{r stopwords}
# `stop_words` ships with tidytext and combines several stop-word lexicons
tidytext::stop_words %>%
  head()

tidytext::stop_words %>%
  dplyr::group_by(lexicon) %>%
  dplyr::summarize(n = n())

# tokenize: one row per word per tweet
tweets %>%
  mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text')

# drop stop words with an anti-join
tweets %>%
  mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words)

# overwrite `tweets` with the tokenized, stop-word-free version
tweets <- tweets %>%
  mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words)
```

### Make a Word Cloud (if you have to)

```{r wordcloud}
# get word counts
tweets %>%
  dplyr::count(word) %>%
  arrange(desc(n))

tweets %>%
  dplyr::count(word) %>%
  arrange(desc(n)) %>%
  filter(n > 50) %>%
  ggplot(aes(label = word, size = n)) +
  geom_text_wordcloud()
```

### Dictionary-based Sentiment Analysis

```{r sent}
# reload the raw tweets, since `tweets` was overwritten with the tokenized version above
tweets <- read.csv('https://github.com/apodkul/ppol670_01/raw/main/Data/Climate_tweets.csv')

bing_words <- tidytext::get_sentiments('bing')

bing_words %>%
  glimpse()

# keep only tokens that appear in the Bing sentiment dictionary
tweets %>%
  mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words) %>%
  inner_join(bing_words)

# count positive/negative words per tweet
tweets %>%
  dplyr::mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words) %>%
  dplyr::inner_join(bing_words) %>%
  dplyr::group_by(tweet_id, sentiment) %>%
  dplyr::summarize(n = n())

# change dictionary; may require downloading the `textdata` package
nrc_words <- tidytext::get_sentiments(lexicon = 'nrc')

tweets %>%
  dplyr::mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words) %>%
  dplyr::inner_join(nrc_words) %>%
  dplyr::group_by(tweet_id, sentiment) %>%
  dplyr::summarize(n = n())
```
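A natural next step is to collapse the word-level Bing matches into a single score per tweet. The chunk below is a minimal sketch, assuming the objects created above (`tweets` in its raw form, `bing_words`, `stop_words`) are still in memory; the object name `tweet_sentiment` and the net-score definition (positive minus negative word counts) are illustrative choices, not part of the lab.

```{r sent_sketch}
# Sketch: one net sentiment score per tweet (positive minus negative counts).
# Tweets with no dictionary words are dropped by the inner join.
tweet_sentiment <- tweets %>%
  dplyr::mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words) %>%
  dplyr::inner_join(bing_words) %>%
  dplyr::count(tweet_id, sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n,
                     values_fill = 0) %>%
  dplyr::mutate(net = positive - negative)

# distribution of net sentiment across tweets
ggplot(tweet_sentiment) +
  geom_histogram(aes(x = net), binwidth = 1)
```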
## Proj. 2: Congressional Press Releases

### Get data

```{r}
# Goal: scrape the 10 most recent press releases from Marjorie Taylor Greene's
# congressional website
library(rvest)

# collect every link on the press-release listing page
list_of_links <- read_html('https://greene.house.gov/news/documentquery.aspx?DocumentTypeID=27') %>%
  html_nodes('a') %>%
  html_attr('href')

# keep only (unique) links to individual press releases
list_of_links <- list_of_links[stringr::str_detect(list_of_links, '/news/documentsingle')] %>%
  unique()

# visit each link and pull the body text
output_data <- list()
for(i in 1:length(list_of_links)){
  tmp_var <- read_html(stringr::str_c('https://greene.house.gov', list_of_links[i])) %>%
    html_nodes('.bodycopy') %>%
    html_text2()
  output_data[[i]] <- tmp_var
}

press_releases <- data.frame(id = 1:length(output_data),
                             text = unlist(output_data))

#View(press_releases)
```

### Prep data for analysis

```{r}
press_releases %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words, by = c('word' = 'word')) %>%
  dplyr::count(id, word)

press_releases <- press_releases %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words, by = c('word' = 'word')) %>%
  dplyr::count(id, word, name = 'count')

# convert the tidy word counts to a document-term matrix for topicmodels
pr_input <- press_releases %>%
  cast_dtm(id, word, count)

pr_input
```

### Estimate LDA Model (2 topics)

```{r}
library(topicmodels)

pr_lda <- LDA(pr_input, k = 2, control = list(seed = 1789))
```

### Summarize and Visualize Model Outputs

```{r}
# beta: per-topic word probabilities
pr_topics <- tidy(pr_lda, matrix = 'beta')
pr_topics

# top 10 terms per topic
pr_top_terms <- pr_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta))

ggplot(pr_top_terms) +
  geom_bar(aes(x = beta, y = term, fill = as.factor(topic)),
           stat = 'identity') +
  facet_wrap(~topic, scales = 'free') +
  theme(legend.position = 'none')

# terms that most distinguish topic 2 from topic 1
gg <- pr_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  tidyr::pivot_wider(names_from = topic, values_from = beta) %>%
  filter(topic1 > .005 | topic2 > .005) %>%
  mutate(log_ratio = log2(topic2 / topic1)) %>%
  arrange(log_ratio) %>%
  mutate(term = reorder(term, log_ratio))

ggplot(gg) +
  geom_bar(aes(x = log_ratio, y = term), stat = 'identity')

# gamma: per-document topic proportions
gammas <- tidy(pr_lda, matrix = 'gamma')
gammas$document <- factor(gammas$document, levels = 1:10)

ggplot(gammas) +
  geom_bar(aes(x = topic, y = gamma), stat = 'identity') +
  facet_wrap(~document)
```
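Because each document's gamma values sum to one across topics, a quick way to summarize the plot above is to assign every press release to its highest-gamma topic. Below is a minimal sketch, assuming `gammas` from the previous chunk; the object name `pr_assignments` is illustrative, not part of the lab.

```{r gamma_sketch}
# Sketch: label each press release with its most likely topic.
# `with_ties = FALSE` keeps exactly one row per document if gammas tie.
pr_assignments <- gammas %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(document)

pr_assignments
```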