---
title: 'Lab 11: Text-as-Data'
author: "PPOL 670-01"
date: '2023-04-05'
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, eval = TRUE, warning = FALSE)
library(dplyr)
library(tidytext)
library(ggplot2)
library(rvest)
library(ggwordcloud)
```

## Proj. 1: Working with Climate Change Tweets

### Load Data

```{r loaddata}
# A collection of tweets mentioning "climate change"
## (Apologies in advance if there is any... profanity)
tweets <- read.csv('https://github.com/apodkul/ppol670_01/raw/main/Data/Climate_tweets.csv')

tweets %>%
  glimpse()
```

### Work with Stop Words and Tokenization

```{r stopwords}
# `stop_words` ships with tidytext and combines several stop-word lexicons
tidytext::stop_words %>%
  head()

tidytext::stop_words %>%
  dplyr::group_by(lexicon) %>%
  dplyr::summarize(n = n())

# tokenize: one row per word per tweet
tweets %>%
  mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text')

# drop stop words with an anti-join
tweets %>%
  mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words)

# overwrite `tweets` with the tokenized, stop-word-free version
tweets <- tweets %>%
  mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words)
```

### Make a Word Cloud (if you have to)

```{r wordcloud}
# get word counts
tweets %>%
  dplyr::count(word) %>%
  arrange(desc(n))

tweets %>%
  dplyr::count(word) %>%
  arrange(desc(n)) %>%
  filter(n > 50) %>%
  ggplot(aes(label = word, size = n)) +
  geom_text_wordcloud()
```

### Dictionary-based Sentiment Analysis

```{r sent}
# reload the raw tweets, since `tweets` was overwritten with the tokenized version above
tweets <- read.csv('https://github.com/apodkul/ppol670_01/raw/main/Data/Climate_tweets.csv')

bing_words <- tidytext::get_sentiments('bing')

bing_words %>%
  glimpse()

# keep only tokens that appear in the Bing sentiment dictionary
tweets %>%
  mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words) %>%
  inner_join(bing_words)

# count positive/negative words per tweet
tweets %>%
  dplyr::mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words) %>%
  dplyr::inner_join(bing_words) %>%
  dplyr::group_by(tweet_id, sentiment) %>%
  dplyr::summarize(n = n())

# change dictionary; may require downloading the `textdata` package
nrc_words <- tidytext::get_sentiments(lexicon = 'nrc')

tweets %>%
  dplyr::mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words) %>%
  dplyr::inner_join(nrc_words) %>%
  dplyr::group_by(tweet_id, sentiment) %>%
  dplyr::summarize(n = n())
```
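A natural next step is to collapse the word-level Bing matches into a single score per tweet. The chunk below is a minimal sketch, assuming the objects created above (`tweets` in its raw form, `bing_words`, `stop_words`) are still in memory; the object name `tweet_sentiment` and the net-score definition (positive minus negative word counts) are illustrative choices, not part of the lab.

```{r sent_sketch}
# Sketch: one net sentiment score per tweet (positive minus negative counts).
# Tweets with no dictionary words are dropped by the inner join.
tweet_sentiment <- tweets %>%
  dplyr::mutate(tweet_id = 1:nrow(tweets)) %>%
  dplyr::select(tweet_id, text) %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words) %>%
  dplyr::inner_join(bing_words) %>%
  dplyr::count(tweet_id, sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n,
                     values_fill = 0) %>%
  dplyr::mutate(net = positive - negative)

# distribution of net sentiment across tweets
ggplot(tweet_sentiment) +
  geom_histogram(aes(x = net), binwidth = 1)
```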
## Proj. 2: Congressional Press Releases

### Get data

```{r}
# Goal: scrape the 10 most recent press releases from Marjorie Taylor Greene's
# congressional website
library(rvest)

# collect every link on the press-release listing page
list_of_links <- read_html('https://greene.house.gov/news/documentquery.aspx?DocumentTypeID=27') %>%
  html_nodes('a') %>%
  html_attr('href')

# keep only (unique) links to individual press releases
list_of_links <- list_of_links[stringr::str_detect(list_of_links, '/news/documentsingle')] %>%
  unique()

# visit each link and pull the body text
output_data <- list()
for(i in 1:length(list_of_links)){
  tmp_var <- read_html(stringr::str_c('https://greene.house.gov', list_of_links[i])) %>%
    html_nodes('.bodycopy') %>%
    html_text2()
  output_data[[i]] <- tmp_var
}

press_releases <- data.frame(id = 1:length(output_data),
                             text = unlist(output_data))

#View(press_releases)
```

### Prep data for analysis

```{r}
press_releases %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words, by = c('word' = 'word')) %>%
  dplyr::count(id, word)

press_releases <- press_releases %>%
  unnest_tokens(output = 'word', input = 'text') %>%
  anti_join(stop_words, by = c('word' = 'word')) %>%
  dplyr::count(id, word, name = 'count')

# convert the tidy word counts to a document-term matrix for topicmodels
pr_input <- press_releases %>%
  cast_dtm(id, word, count)

pr_input
```

### Estimate LDA Model (2 topics)

```{r}
library(topicmodels)

pr_lda <- LDA(pr_input, k = 2, control = list(seed = 1789))
```

### Summarize and Visualize Model Outputs

```{r}
# beta: per-topic word probabilities
pr_topics <- tidy(pr_lda, matrix = 'beta')
pr_topics

# top 10 terms per topic
pr_top_terms <- pr_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta))

ggplot(pr_top_terms) +
  geom_bar(aes(x = beta, y = term, fill = as.factor(topic)),
           stat = 'identity') +
  facet_wrap(~topic, scales = 'free') +
  theme(legend.position = 'none')

# terms that most distinguish topic 2 from topic 1
gg <- pr_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  tidyr::pivot_wider(names_from = topic, values_from = beta) %>%
  filter(topic1 > .005 | topic2 > .005) %>%
  mutate(log_ratio = log2(topic2 / topic1)) %>%
  arrange(log_ratio) %>%
  mutate(term = reorder(term, log_ratio))

ggplot(gg) +
  geom_bar(aes(x = log_ratio, y = term), stat = 'identity')

# gamma: per-document topic proportions
gammas <- tidy(pr_lda, matrix = 'gamma')
gammas$document <- factor(gammas$document, levels = 1:10)

ggplot(gammas) +
  geom_bar(aes(x = topic, y = gamma), stat = 'identity') +
  facet_wrap(~document)
```
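Because each document's gamma values sum to one across topics, a quick way to summarize the plot above is to assign every press release to its highest-gamma topic. Below is a minimal sketch, assuming `gammas` from the previous chunk; the object name `pr_assignments` is illustrative, not part of the lab.

```{r gamma_sketch}
# Sketch: label each press release with its most likely topic.
# `with_ties = FALSE` keeps exactly one row per document if gammas tie.
pr_assignments <- gammas %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(document)

pr_assignments
```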