#########################################
### Package installation
#########################################
# The "tm" package enables the text mining infrastructure that we will use for LDA.
require("tm")
require("topicmodels")
require("stm")


#########################################
### Get the data, turn into a corpus, and clean it up
#########################################
# Load data from a URL
data = read.csv(url("https://github.com/IDeaS-repo/IDeaS-repo.github.io/blob/master/files/OMT_2015-2020.csv?raw=true"))

# Create a corpus. 
corpus = VCorpus((VectorSource(data[, "title_ab"])))

# Basic cleaning (step-wise)
# We write everything to a new corpus called "corpusclean" so that we do not lose the original data.
# 1) Remove numbers
corpusclean = tm_map(corpus, removeNumbers)
# 2) Remove punctuation
corpusclean = tm_map(corpusclean, removePunctuation)
# 3) Transform all upper-case letters to lower-case.
corpusclean = tm_map(corpusclean,  content_transformer(tolower))
# 4) Remove stopwords which do not convey any meaning.
corpusclean = tm_map(corpusclean, removeWords, stopwords("english")) 
# this stopword file is at C:\Users\[username]\Documents\R\win-library\[rversion]\tm\stopwords 

# i	me	my	myself	we	our	ours	ourselves	you	your	yours	yourself	yourselves	he	him	his	himself	
# she	her	hers	herself	it	its	itself	they	them	their	theirs	themselves	what	which	who	whom	this
# that	these	those	am	is	are	was	were	be	been	being	have	has	had	having	do	does	did	doing	would	should
# could	ought	i'm	you're	he's	she's	it's	we're	they're	i've	you've	we've	they've	i'd	you'd	he'd	she'd	we'd
# they'd	i'll	you'll	he'll	she'll	we'll	they'll	isn't	aren't	wasn't	weren't	hasn't	haven't	hadn't	doesn't	
# don't	didn't	won't	wouldn't	shan't	shouldn't	can't	cannot	couldn't	mustn't	let's	that's	who's	what's	here's
# there's	when's	where's	why's	how's	a	an	the	and	but	if	or	because	as	until	while	of	at	by	for	with	about	
# against	between	into	through	during	before	after	above	below	to	from	up	down	in	out	on	off	over	under	again
# further	then	once	here	there	when	where	why	how	all	any	both	each	few	more	most	other	some	such	no	nor	
# not	only	own	same	so	than	too	very

# 5) And strip whitespace. 
corpusclean = tm_map(corpusclean , stripWhitespace)

# We then convert the corpus to a "Document-term-matrix" (dtm)
dtm =DocumentTermMatrix(corpusclean)  

#########################################
### LDA: Running the model
#########################################
# We first fix the random seed for future replication.
SEED = 123456789

stmdata <- readCorpus(dtm, type = c("slam"))
stmdata$meta <- data

set.seed(123456789)
# Find number of topics:
# 
kResult <- searchK(stmdata$documents, stmdata$vocab, K=c(5,10,15,20,25,30,35,40,45,50,55,60,65,70,75), prevalence =~ s(year),
                   data=stmdata$meta, init.type = "Spectral", heldout.seed = 123456789,
                   max.em.its = 150, verbose = TRUE, control = list())
plot(kResult)
kResult
# Let's work with 45. 

topicnr = 45

stmmodel <- stm(stmdata$documents, stmdata$vocab, topicnr,prevalence =~ s(year),
                  data = stmdata$meta,
                  init.type = "Spectral", seed = 123456789,
                  max.em.its = 75, verbose = TRUE, reportevery = 5,
                  control = list())

labels <- labelTopics(stmmodel, n = 5)
labels


prep <- estimateEffect(1:45 ~ s(year),
                       stmmodel, meta=stmdata$meta, 
                       uncertainty="Global")

summary(prep)
plot(prep, "year", method = "continuous", topics = 13,
     model = stmmodel, printlegend = FALSE, xlab = "Year",ci.level = 0, ylim = c(0.00, 0.05), main="Topic 13: failure, umbrella, constructs, resilience, hubris")
plot(prep, "year", method = "continuous", topics = 14,
     model = stmmodel, printlegend = FALSE, xlab = "Year",ci.level = 0, ylim = c(0.00, 0.05), main="Topic 14: occupational, community, occupations, communities, occupation")
plot(prep, "year", method = "continuous", topics = 16,
     model = stmmodel, printlegend = FALSE, xlab = "Year",ci.level = 0, ylim = c(0.00, 0.05), main="Topic 16: stigma, stigmatized, stigmatization, scandal, transgression")
plot(prep, "year", method = "continuous", topics = 21,
     model = stmmodel, printlegend = FALSE, xlab = "Year",ci.level = 0, ylim = c(0.00, 0.05), main="Topic 21: collective, process, sensemaking, narratives, framing")
plot(prep, "year", method = "continuous", topics = 26,
     model = stmmodel, printlegend = FALSE, xlab = "Year",ci.level = 0, ylim = c(0.00, 0.05), main="Topic 26: impression, frameworks, symbolic, stakeholder, managerial")
plot(prep, "year", method = "continuous", topics = 32,
     model = stmmodel, printlegend = FALSE, xlab = "Year",ci.level = 0, ylim = c(0.00, 0.05), main="Topic 32: identity, organizational, identities, organizations, work")
plot(prep, "year", method = "continuous", topics = 36,
     model = stmmodel, printlegend = FALSE, xlab = "Year",ci.level = 0, ylim = c(0.00, 0.05), main="Topic 36: leadership, study, research, responsible, leader")
# These were all identified via manual inspection.