---
title: ""
output:
html_document:
toc: true
---
```{r message=FALSE, warning=FALSE, echo=FALSE}
# Setup: load libraries and initialize the spaCy backend
library(readr) # CSV file I/O, e.g. the read_csv function
library(tidyr)
library(ggplot2) # Data visualization
library(viridis)
library(RColorBrewer)
library(lubridate)
library(tweenr)
library(gganimate)
library(ggthemes)
library(dplyr)
# Text processing
library(stringr)
library(spacyr)
library(tidytext)
library(topicmodels)
# Point spacyr at the Python environment where spaCy is installed
spacy_initialize(python_executable = "/Users/hannah/anaconda/bin/python")
```
```{r input, message=FALSE}
# Source: https://www.kaggle.com/benrudolph/unhcr-speeches
df <- read_csv("~/git_repo/opendata_viz/refugee/refugee.csv")

df <- df %>%
  mutate(content_cleaned = gsub("\\\\n", "", content)) %>%      # strip literal "\n" escapes
  separate(by, into = c("speaker", "dt"), sep = ",") %>%        # "by" holds "speaker, date"
  mutate(dt = dmy(dt), yr = year(dt)) %>%
  filter(!is.na(content_cleaned))
glimpse(df)
```
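A quick sanity check that the `by` column split and the date parsing worked; the charts below assume the speeches span 1951-2014.

```{r}
range(df$yr, na.rm = TRUE)
```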
```{r preprocessing}
# Parse every speech with spaCy; entity_extract() pulls out the named entities
parsedtxt <- spacy_parse(df$content_cleaned)
entity <- entity_extract(parsedtxt, type = "all")
```
```{r chart}
# table(entity$entity_type) gives:
#  CARDINAL    DATE   EVENT     GPE     LAW     LOC    NORP ORDINAL     ORG PERCENT  PERSON    TIME
#     16872   17575    2109   28120     703    7733   25308    2109   16872    1406    5624     703
# Map spaCy entity codes to readable facet labels
lookup <- data.frame(entity_type = c('GPE', 'EVENT', 'LOC', 'NORP', 'ORG'),
                     entity_name = c('STATES', 'EVENT', 'REGIONS', 'NATIONS/RELIGIONS', 'ORGANIZATIONS'))

top_entity <- entity %>%
  # drop salutations and generic terms misread as entities
  filter(!entity %in% c('Mesdames et Messieurs', '9-year', 'States')) %>%
  filter(entity_type %in% c('GPE', 'LOC', 'NORP', 'ORG')) %>%
  left_join(lookup, by = 'entity_type') %>%
  group_by(entity, entity_name) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  group_by(entity_name) %>%
  slice(1:20)   # top 20 mentions per entity category
top_entity %>%
  ggplot(aes(reorder(entity, n), n)) +
  geom_col(aes(fill = entity_name), alpha = 0.9) +
  facet_wrap(~entity_name, scales = "free") +
  coord_flip() +
  theme_minimal() +
  theme(axis.title.y = element_blank(), axis.title.x = element_blank(),
        plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
        text = element_text(family = "Helvetica"),
        legend.position = 'none',
        plot.margin = unit(c(0, 1, 1, 1), "cm")) +
  labs(title = "Top mentions in UN Refugee Agency's speeches",
       subtitle = "Institutions and places mentioned during 1951-2014")
```
```{r ts}
# Tag each speech with its doc_id (spacy_parse names documents text1, text2, ...)
df$row_num <- seq.int(nrow(df))
speech_dt <- df %>%
  mutate(doc_id = paste0('text', row_num)) %>%
  select(doc_id, dt, yr)

# Yearly mention counts for countries tied to major refugee crises
entity_ts <- entity %>%
  left_join(speech_dt, by = 'doc_id') %>%
  group_by(doc_id) %>%
  mutate(total_entity = n()) %>%
  ungroup() %>%
  filter(entity %in% c('Bosnia', 'Yugoslavia', 'Rwanda',
                       'Afghanistan', 'Somalia', 'Kosovo',
                       'Sudan', 'Burundi', 'Iraq', 'Mozambique',
                       'Cambodia', 'Angola', 'Ethiopia',
                       'Syria')) %>%
  group_by(yr, entity) %>%
  summarize(n = n())
entity_ts %>%
  ggplot(aes(yr, n, col = entity)) +
  geom_area(aes(fill = entity), alpha = 0.5) +   # alpha is constant, so it belongs outside aes()
  facet_grid(entity ~ .) +
  theme_minimal() +
  theme(axis.title.y = element_blank(), axis.title.x = element_blank(),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
        legend.position = 'none',
        strip.text.y = element_text(angle = 0),
        text = element_text(family = "Helvetica")) +
  ggtitle('Mentions of refugee countries 1951-2014') +
  scale_x_continuous(breaks = seq(1950, 2010, 10))
```
```{r topicmodel}
library(tm)
# Standard tm cleaning: strip punctuation and numbers, lowercase,
# remove English stopwords ("amp" catches leftover &amp; HTML entities)
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "amp"))
  return(corpus)
}
speech_corpus <- Corpus(VectorSource(df$content_cleaned))
myCorpus <- clean_corpus(speech_corpus)
dtm <- DocumentTermMatrix(myCorpus,
                          control = list(wordLengths = c(2, Inf)))
```
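One pitfall worth guarding against before fitting a topic model: after stopword and number removal, a speech can end up with no terms at all, and `LDA()` errors on rows that sum to zero. A minimal guard, using `slam` (already a dependency of `tm`):

```{r}
library(slam)
# Drop any documents that lost all their terms during cleaning
dtm <- dtm[row_sums(dtm) > 0, ]
```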
```{r}
# Fit a 5-topic LDA model and tidy the per-topic word probabilities (beta)
lda <- LDA(dtm, k = 5, control = list(seed = 1234))
topics <- tidy(lda, matrix = "beta")
topics
```
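The raw `beta` table has one row per topic-term pair, which is hard to scan directly. A common way to read it, sketched here along the lines of the standard tidytext workflow, is to plot the highest-probability terms in each topic (the shared `reorder()` across facets is approximate but serviceable):

```{r}
# Top 10 terms per topic by per-topic word probability (beta)
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  coord_flip() +
  theme_minimal()
```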
# Using cleanNLP
```{r}
# Write the cleaned speeches out so cleanNLP can annotate them from disk
content <- df$content_cleaned
write.csv(content, 'content.csv')
```
```{r}
library(cleanNLP)
# Initialize cleanNLP's spaCy backend, annotate the exported file,
# then combine the annotation tables into a single data frame
init_spaCy(model_name = "en")
anno <- run_annotators('content.csv')
nlp <- get_combine(anno)
```
```{r}
library(magrittr)
# Keep direct objects (dependency relation "dobj") and filter to words
# that are rare in general English, using the word_frequency table
# that ships with cleanNLP
word_action <- nlp %>%
  filter(relation == "dobj") %>%
  left_join(word_frequency, by = "word") %>%
  filter(frequency < 0.001) %>%
  select(id, word_source, word)
```
```{r}
# Count each verb-object pair (word_source is the governing verb)
word_action_smry <- word_action %>%
  group_by(word_source, word) %>%
  summarize(n = n())
```
```{r}
library(networkD3)
# simpleNetwork() reads the first two columns as source/target nodes;
# keep only verb-object pairs that occur more than 20 times
simpleNetwork(word_action_smry[word_action_smry$n > 20, ])
```
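The widget renders inline when the document is knit. To share it on its own, it can also be written to a standalone HTML page via `htmlwidgets` (a dependency of `networkD3`); a minimal sketch, with a hypothetical output filename:

```{r eval=FALSE}
library(htmlwidgets)
net <- simpleNetwork(word_action_smry[word_action_smry$n > 20, ])
# Save the interactive network as a self-contained HTML file
saveWidget(net, "word_action_network.html")
```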