--- title: "" output: html_document: toc: true --- ```{r message=FALSE, warning=FALSE, echo=FALSE} # This is a code block library(readr) # CSV file I/O, e.g. the read_csv function library(tidyr) library(ggplot2) # Data visualization library(viridis) library(RColorBrewer) library(lubridate) library(tweenr) library(gganimate) library(ggthemes) library(dplyr) #text library(stringr) library(spacyr) library(tidytext) library(topicmodels) spacy_initialize(python_executable = "/Users/hannah/anaconda/bin/python") ``` ```{r input, message=FALSE} #https://www.kaggle.com/benrudolph/unhcr-speeches df<- read_csv("~/git_repo/opendata_viz/refugee/refugee.csv") df = df%>% mutate(content_cleaned = gsub("\\\\n","",df$content)) %>% separate(by, c("speaker", "dt"), sep=",") %>% mutate(dt = dmy(dt), yr = year(dt)) %>% filter(!is.na(content_cleaned)) glimpse(df) ``` ```{r preprocessing} parsedtxt <- spacy_parse(df$content_cleaned) entity <- entity_extract(parsedtxt, type = "all") ``` ```{r chart} #table(entity$entity_type) # CARDINAL DATE EVENT GPE LAW LOC NORP ORDINAL ORG PERCENT PERSON # 16872 17575 2109 28120 703 7733 25308 2109 16872 1406 5624 # TIME # 703 lookup = data.frame(entity_type = c('GPE','EVENT','LOC','NORP','ORG'), entity_name = c('STATES','EVENT','REGIONS','NATIONS/RELIGIONS','ORGANIZATIONS')) top_entity <- entity %>% filter(!entity %in% c('Mesdames et Messieurs','9-year','States')) %>% filter(entity_type %in% c('GPE','LOC','NORP','ORG')) %>% left_join(lookup, by = 'entity_type') %>% group_by(entity, entity_name) %>% summarize(n=n()) %>% ungroup() %>% arrange(desc(n)) %>% group_by(entity_name) %>% slice(1:20) top_entity%>% ggplot(aes(reorder(entity, n), n)) + geom_bar(stat='identity', aes(fill=entity_name), alpha=0.9) + facet_wrap(~entity_name,scales = "free") + coord_flip() + theme_minimal() + theme(axis.title.y = element_blank(), axis.title.x = element_blank(), plot.title = element_text(face="bold", size=16, hjust=0.5), text = element_text(family = "Helvetica"), legend.position = 'None', plot.margin=unit(c(0,1,1,1),"cm")) + labs(title = "Top mentions in UN Refugee Agency's Speeches", subtitle = "Institutes and places mentioned during 1951-2014") ``` ```{r ts} df$row_num=seq.int(nrow(df)) speech_dt <- df %>% mutate(doc_id = paste0('text', row_num)) %>% select(doc_id, dt, yr) entity_ts = entity %>% left_join(speech_dt, by='doc_id') %>% group_by(doc_id) %>% mutate(total_entity = n()) %>% ungroup() %>% filter(entity %in% c('Bosnia', 'Yugoslavia','Rwanda', 'Afghnistan', 'Somalia', 'Kosovo', 'Sudan', 'Burundi', 'Iraq', 'Mozambique', 'Cambodia','Angola','Ethiopia', 'Syria')) %>% group_by(yr, entity) %>% summarize(n=n()) entity_ts%>% ggplot(aes(yr, n, col=entity)) + geom_area(aes(fill=entity, alpha=0.5)) + facet_grid(entity~.) 
```{r topicmodel}
library(tm)

# Standard corpus cleaning: strip punctuation and numbers, lowercase,
# and remove English stopwords
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("en"))
  return(corpus)
}

speech_corpus <- Corpus(VectorSource(df$content_cleaned))
myCorpus <- clean_corpus(speech_corpus)
dtm <- DocumentTermMatrix(myCorpus, control = list(wordLengths = c(2, Inf)))
```

```{r}
# Fit a 5-topic LDA model and tidy the per-topic word probabilities (beta)
lda <- LDA(dtm, k = 5, control = list(seed = 1234))
topics <- tidy(lda, matrix = "beta")
topics
```

# Using cleanNLP

```{r}
# Write the cleaned speeches to disk for cleanNLP to annotate
content <- df$content_cleaned
write.csv(content, "content.csv")
```

```{r}
library(cleanNLP)
init_spaCy(model_name = "en")
anno <- run_annotators("content.csv")
nlp <- get_combine(anno)
```

```{r}
library(magrittr)
# Keep verb-object pairs (dependency relation "dobj"), dropping objects that
# are very common words according to cleanNLP's built-in word_frequency table
word_action <- nlp %>%
  filter(relation == "dobj") %>%
  left_join(word_frequency, by = "word") %>%
  filter(frequency < 0.001) %>%
  select(id, word_source, word)
```

```{r}
word_action_smry <- word_action %>%
  group_by(word_source, word) %>%
  summarize(n = n())
```

```{r}
library(networkD3)
# Interactive verb-object network, keeping pairs that occur more than 20 times
simpleNetwork(word_action_smry[word_action_smry$n > 20, ])
```
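The tidied `beta` matrix above is printed but never visualized. A minimal sketch of the standard tidytext follow-up, plotting each topic's highest-probability terms; this chunk is illustrative and not part of the original analysis:

```{r lda_top_terms, eval=FALSE}
# Illustrative: top 10 terms per topic from the tidied beta matrix above
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  coord_flip() +
  theme_minimal()
```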