---
title: ""
output: 
    html_document: 
      toc: true
---

```{r message=FALSE, warning=FALSE, echo=FALSE}
# This is a code block
library(readr) # CSV file I/O, e.g. the read_csv function
library(tidyr)
library(ggplot2) # Data visualization
library(viridis)
library(RColorBrewer)
library(lubridate)
library(tweenr)
library(gganimate)
library(ggthemes)
library(dplyr)
#text
library(stringr)
library(spacyr)
library(tidytext)
library(topicmodels)
spacy_initialize(python_executable = "/Users/hannah/anaconda/bin/python")
```

```{r input, message=FALSE}
#https://www.kaggle.com/benrudolph/unhcr-speeches
df<- read_csv("~/git_repo/opendata_viz/refugee/refugee.csv") 
df = df%>%
  mutate(content_cleaned = gsub("\\\\n","",df$content)) %>%
  separate(by, c("speaker", "dt"), sep=",") %>%
  mutate(dt = dmy(dt), yr = year(dt)) %>%
  filter(!is.na(content_cleaned))
glimpse(df)
```

```{r preprocessing}
parsedtxt <- spacy_parse(df$content_cleaned)
entity <- entity_extract(parsedtxt, type = "all")
```

```{r chart}
#table(entity$entity_type)
# CARDINAL     DATE    EVENT      GPE      LAW      LOC     NORP  ORDINAL      ORG  PERCENT   PERSON 
#    16872    17575     2109    28120      703     7733    25308     2109    16872     1406     5624 
#     TIME 
#      703 
lookup = data.frame(entity_type = c('GPE','EVENT','LOC','NORP','ORG'),
                    entity_name = c('STATES','EVENT','REGIONS','NATIONS/RELIGIONS','ORGANIZATIONS'))  
top_entity <- entity %>%
  filter(!entity %in% c('Mesdames et Messieurs','9-year','States')) %>%
  filter(entity_type %in% c('GPE','LOC','NORP','ORG')) %>%
  left_join(lookup, by = 'entity_type') %>%
  group_by(entity, entity_name) %>%
  summarize(n=n()) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  group_by(entity_name) %>%
  slice(1:20)
top_entity%>%
  ggplot(aes(reorder(entity, n), n)) + 
  geom_bar(stat='identity', aes(fill=entity_name), alpha=0.9) + 
  facet_wrap(~entity_name,scales = "free") +
  coord_flip() + 
  theme_minimal() +
  theme(axis.title.y = element_blank(), axis.title.x = element_blank(),
  plot.title = element_text(face="bold", size=16, hjust=0.5),
  text = element_text(family = "Helvetica"),
  legend.position = 'None',
  plot.margin=unit(c(0,1,1,1),"cm")) +
  labs(title = "Top mentions in UN Refugee Agency's Speeches",
  subtitle = "Institutes and places mentioned during 1951-2014") 

```
```{r ts}
df$row_num=seq.int(nrow(df))
speech_dt <- df %>% 
  mutate(doc_id = paste0('text', row_num)) %>%
  select(doc_id, dt, yr)

entity_ts = entity %>%
  left_join(speech_dt, by='doc_id') %>% 
  group_by(doc_id) %>%
  mutate(total_entity = n()) %>%
  ungroup() %>%
  filter(entity %in% c('Bosnia', 'Yugoslavia','Rwanda',
                       'Afghnistan', 'Somalia', 'Kosovo', 
                       'Sudan', 'Burundi', 'Iraq', 'Mozambique',
                       'Cambodia','Angola','Ethiopia',
                       'Syria')) %>%
  group_by(yr, entity) %>%
  summarize(n=n())

entity_ts%>%
  ggplot(aes(yr, n, col=entity)) + 
  geom_area(aes(fill=entity, alpha=0.5)) +
  facet_grid(entity~.) + 
  theme_minimal() +
  theme(axis.title.y = element_blank(),axis.title.x = element_blank(),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
  plot.title = element_text(face="bold", size=16, hjust=0.5),
  legend.position='None',
  strip.text.y = element_text(angle=360),
  text = element_text(family = "Helvetica")) +
  ggtitle('Mentions of refugee countries 1951-2014') +
  guides(alpha=FALSE)  + scale_x_continuous(breaks = seq(1950,2010,10)) 

```


```{r topicmodel}
library(tm)

clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "amp","elonmusk"))
return(corpus)
}
Corpus <- Corpus(VectorSource(df$content_cleaned))
myCorpus <- clean_corpus(Corpus)

dtm <- DocumentTermMatrix(myCorpus,
control = list(wordLengths = c(2, Inf)))

```

```{r}
lda <- LDA(dtm, k = 5, control = list(seed = 1234))
topics <- tidy(lda, matrix = "beta")
topics
```

#Using cleanNLP
```{r}
content = df$content_cleaned
write.csv(content, 'content.csv')
```

```{r}
library(cleanNLP)
init_spaCy(model_name = "en")
anno <- run_annotators('content.csv')
nlp <- get_combine(anno)
```

```{r}
library(magrittr)
word_action <- nlp %>%
  filter(relation == "dobj") %>%
  left_join(word_frequency, by = "word") %>%
  filter(frequency < 0.001) %>%
  select(id, word_source, word) 
```
```{r}
word_action_smry <- word_action %>%
  group_by(word_source, word) %>%
  summarize(n=n())
```

```{r}
library(networkD3)
simpleNetwork(word_action_smry[word_action_smry$n>20,])
```