---
title: "Notes on Text-Mining course by RDM"
author: "best student ever"
output: html_document
date: "2023-05-14"
---

```{r setup, include=FALSE}
# Show the source of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
```

# My Notes

## This is a smaller header

### And so on

- This is a list item with a **bold** word;
- This is a list item with an *italic* word

# Session 0: Setup

```{r}
#| eval: false
# Not evaluated when knitting (eval: false); run these once by hand.
install.packages("tidyverse")
install.packages("tidytext")
install.packages("wordcloud")
```

```{r}
library(tidyverse)
library(tidytext)
library(wordcloud)
```

# Session 1: Counting words

## Reading data

```{r}
data_file_name <- ???

# `date-pub` is parsed as a date written as month name, day, four-digit year
# (format "%B %d, %Y"); the remaining column types are guessed by readr.
data_df <- read_delim(
  ???,
  delim = ???,
  col_types = cols(`date-pub` = col_date(format = "%B %d, %Y"))
)

print(???(data_df))
print(???(data_df))
```

## Tokenization

```{r}
# unnest_tokens() splits a text column into one token per row; the new
# column is called `word`.
tidy_content <- ??? %>%
  unnest_tokens(word, ???, token = ???)
```

## Cleaning up data

```{r}
are_there_na <- ??? %>%
  ??? %>%
  ???
???
```

```{r}
# Subset `tidy_content` by rows (first index) and columns (second index).
??? <- tidy_content[???, ???]
```

```{r}
???
```

## Counting words 1

```{r}
# reorder() turns `word` into a factor ordered by its count `n`, so that a
# plot built from `word_count` shows the words sorted by frequency.
word_count <- ??? %>%
  count(???) %>%
  filter(???) %>%
  mutate(word = reorder(word, n))
```

```{r}
# labs(y = NULL) removes the y-axis title.
word_count_plot <- ??? %>%
  ggplot(aes(???, ???)) +
  geom_col() +
  labs(y = NULL)
???
```

## Removing stop words

```{r, warning=FALSE, message=FALSE}
data(stop_words)

# anti_join() keeps only the rows that have no match in the joined table.
tidy_clean_content <- ??? %>%
  anti_join(???)
tidy_clean_content
```

## Counting words 2

```{r}
???
```

## Word cloud visualization

```{r}
# NOTE(review): wordcloud() draws the plot as a side effect; its return value
# is presumably NULL, so printing `word_cloud_plot` may only show NULL --
# confirm.
word_cloud_plot <- wordcloud(???, ???)
word_cloud_plot
```

# Session 2: Sentiment Analysis

## Lexicon and joy words

```{r}
lexicon_data_file <- ???

# The lexicon file is read as a three-column table; the column names are
# supplied here.
nrc_lexicon_df <- read.table(
  ???,
  sep = ???,
  col.names = c(???, ???, ???)
)
```

```{r}
joy_words <- ??? %>%
  filter(???)
```

## Computing joy words fraction

```{r}
# The two bounds select 2000-01-01 (inclusive) up to 2010-01-01 (exclusive).
# Because `date-pub` is carried along, reframe() returns one row per input
# row; unique() collapses those duplicates.
issue_df <- ??? %>%
  filter(??? >= "2000-01-01" & ??? < "2010-01-01") %>%
  group_by(???) %>%
  reframe(words_per_issue = n(), date = `date-pub`) %>%
  unique()
```

```{r}
# Number of rows left per group after the inner join.
issue_joy_df <- ??? %>%
  filter(???) %>%
  inner_join(???) %>%
  group_by(???) %>%
  reframe(joy_words_per_issue = n())
```

```{r}
issue_tot_df <- merge(???, ???, by = "issue")
```

```{r}
percent_of_joy_plot <- ??? %>%
  mutate(per_cent_joy = ??? / ??? * 100) %>%
  ggplot(aes(x = ???, y = ???)) +
  geom_col() +
  labs(
    x = "Date",
    y = "Joy words [%]",
    title = "Joyfulness about EU in 2000-2010"
  )
percent_of_joy_plot
```

## Computing total joy fraction

```{r}
distinct_words <- ??? %>%
  distinct(???)
```

```{r}
total_dis_words <- ???(???)
```

```{r}
total_dis_joy_words <- ???(inner_join(???, ???, by = "word"))
```

```{r}
total_joy <- (??? / ???) * 100
print(???)
```

# Session 3: Analyzing word and document frequency: tf-idf

## Computing and displaying tf-idf

```{r, warning=FALSE}
# Count how often each word occurs in each issue, most frequent first.
issue_words <- data_df %>%
  unnest_tokens(word, content) %>%
  count(issue, word, sort = TRUE)

# Drop every row that contains an NA.
issue_words <- na.omit(issue_words)
```

```{r}
# bind_tf_idf(term, document, n): here the term is `word`, the document is
# `issue` and the count is `n`.
issue_tf_idf <- ??? %>%
  bind_tf_idf(word, issue, n)

issue_tf_idf %>%
  arrange(desc(???))
```

```{r}
???
```

```{r}
# Keep the 10 highest tf-idf rows per group and draw one facet per issue.
issue_tf_idf %>%
  semi_join(???, by = "issue") %>%
  group_by(???) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = issue)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~issue, scales = "free", ncol = 2) +
  labs(x = "tf-idf", y = NULL)
```

## BONUS CODE & EXPLANATION: Computing term frequency

```{r}
# `total` is the sum of the counts `n` within each group.
total_words <- issue_words %>%
  group_by(???) %>%
  summarize(total = sum(n))

issue_total_words <- left_join(???, ???) %>%
  arrange(desc(???))
```

```{r, echo=FALSE, warning=FALSE}
unique_issues <- issue_total_words %>%
  filter(???) %>%
  distinct(???)

# Keep the first six rows.
first_6_unique_issues <- ??? %>%
  slice(1:6)
```

```{r, echo=FALSE, warning=FALSE}
issue_total_words6 <- ??? %>%
  semi_join(???, by = "issue") %>%
  mutate(??? = as.character(???))

# Histogram of n / total, one facet per issue.
# NOTE(review): xlim() drops observations above 0.0005 instead of zooming, and
# the resulting warning is hidden by warning=FALSE -- confirm this is intended.
freq_per_issue_plot <- issue_total_words6 %>%
  ggplot(aes(n / total, fill = issue)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0005) +
  facet_wrap(~issue, ncol = 2, scales = "free_y")
freq_per_issue_plot
```

# Session 4: Relationships Between Words

## Tokenization with bigrams

```{r}
# Same tokenization step as in Session 1, but the output column is `bigram`.
tidy_content_rel <- ??? %>%
  unnest_tokens(bigram, ???, token = ???, ???)
```

```{r}
??? %>%
  count(bigram, sort = TRUE) %>%
  filter(???) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(???, ???)) +
  geom_col() +
  labs(y = NULL)
```

## Cleaning up bigrams

```{r}
# Split each bigram at the space into two columns.
bigrams_separated <- ??? %>%
  separate(bigram, c(???, ???), sep = " ")
```

```{r}
bigrams_filtered <- ??? %>%
  filter(???) %>%
  filter(???)
```

```{r}
# Paste the two columns back together, separated by a space.
tidy_content_rel_clean <- ??? %>%
  unite(???, ???, ???, sep = " ")
```

## Plotting bigrams

```{r}
tidy_content_rel_clean %>%
  count(bigram, sort = TRUE) %>%
  filter(???) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(n, bigram)) +
  geom_col() +
  labs(y = NULL)
```

## Relation between words

```{r}
??? %>%
  filter(???) %>%
  count(issue, ???, sort = TRUE)
```

## BONUS: tf-idf for bigrams

```{r}
bigram_tf_idf <- tidy_content_rel_clean %>%
  count(issue, bigram) %>%
  bind_tf_idf(???, ???, ???) %>%
  arrange(???)
bigram_tf_idf
```