######################################################################################################################################## ############################## Analysing Authorship of PM Nawaz Sharif's Speeches using R ############################################## ######################################################################################################################################## ################ Purpose: This script cleans and analyses the speech texts extracted previously ################ Author: Ali Arsalan Kazmi ### Stage 1: Load packages and directory paths pkgs <- c("purrr", "data.table", "dplyr", "tidyr", "readr", "lubridate", "stylo", "tsne", "ggplot2", "igraph", "intergraph", "ggnetwork", "stringr", "ggrepel", "ggiraph", "ggthemes", "viridis", "scales", "grid", "gtable", "FactoMineR", "tidytext", "wordcloud", "extrafont", "textcat", "Hmisc", "openNLP", "quanteda", "ggfortify", "gplots", "fpc", "dendsort", "seriation") allLoaded <- sapply(pkgs, require, character.only = TRUE) basePath <- "C:\\Users\\kazami\\Desktop\\Aimia\\Campaign Analytics + BI + Self-learning\\Personal Projects\\PM Nawaz Speech Authorship" dataPath <- paste(basePath, "Data", sep = "/") setwd(dataPath) ### Stage 2: Load data allFilesPath <- list.files(pattern = "txt$", full.names = TRUE) speechList <- map(.x = allFilesPath, .f = read_file) allFilesName <- list.files(pattern = "txt$") pronounList <- stylo.pronouns("English") %>% gsub("^", "\\\\b", .) %>% gsub("$", "\\\\b", .) %>% paste0(., collapse = "|") ### Stage 3: Tidy data and identify language of speech (currently we only want to focus on English speeches) speechListClean <- speechList %>% map(~ gsub("^\\s+|\\s+$", "", .x)) %>% # Removing leading & trailing whitespace map(~ gsub("\\t|\\r|\\n", " ", .x)) %>% # Removing tab/return/newline spaces map(~ gsub("Mr[.]*\\s*President", "", .x, ignore.case = TRUE)) %>% # Removing unrequired addresses map(~ gsub("Ladies and Gentlemen", "", .x, ignore.case = TRUE)) %>% map(~ gsub("Distinguished Guests*", "", .x, ignore.case = TRUE)) %>% map(~ gsub("Excellency|Excellencies", "", .x, ignore.case = TRUE)) %>% map(~ gsub("Quaid\\s*[eE]\\s*Azam", "quaideazam", .x, ignore.case = TRUE)) %>% map(~ gsub("Mr\\.", "Mr", .x, ignore.case = TRUE)) %>% map(~ gsub("I* Thank you", "", .x, ignore.case = TRUE)) %>% map(~ gsub("\\.", " fullstop ", .x)) %>% # Textualising a ., to later convert it back map(~ gsub(",", " comma ", .x)) %>% # Textualising a , to later convert it back map(~ gsub(";", " semicolon ", .x)) %>% # Textualising a ; map(~ gsub(":", " colon ", .x)) %>% # Textualising a : map(~ gsub("[^[:alnum:]+]", " ", .x)) %>% # Remove characters apart from alpha-numeric map(~ gsub("[[:digit:]]+", "", .x)) %>% # Remove digits map(~ gsub(pronounList, "", .x, ignore.case = TRUE)) %>% # Should probably remove some/all pronouns map(~ gsub(" fullstop ", " \\. ", .x)) %>% # Adding the . back in map(~ gsub(" comma ", " , ", .x)) %>% # Adding the , back in map(~ gsub(" semicolon ", " ; ", .x)) %>% # Adding the ; back in map(~ gsub(" colon ", " : ", .x)) %>% # Adding the : back in map(~ gsub("\\s+", " ", .x)) %>% # Remove extra whitespace map(~ gsub("^\\s+|\\s+$", "", .x)) %>% # Removing leading & trailing whitespace again map(~ tolower(.x)) # Making text lowercase names(speechListClean) <- allFilesName speechLang <- speechListClean %>% map(~ substr(.x, start = 1, stop = 300)) %>% # Only using first 300 characters map(~ textcat(.x)) %>% # Identifying the language of each speech map_chr(1) # Extracting the first object from each item of list speechListCleanEn <- speechListClean[speechLang == "english"] # The following speeches are actually reports from the PM's office notSpeeches <- c("20150106-21st Constitutional Amendment amendment in army act to prove vital in establishing peace harmony.txt", "20150108-PMAddress to Pakistani community in Bahrain.txt") speechListCleanEn <- speechListCleanEn[!names(speechListCleanEn) %in% notSpeeches] ### Stage 4: Basic reporting - total word counts, sentence counts, character counts g1 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% group_by(dateOfSpeech) %>% summarise(`Unique Words` = length(unique(word)), `Total Words` = length(word)) %>% mutate(prcntWords = `Unique Words`/`Total Words`) %>% arrange(dateOfSpeech) %>% ggplot(aes(x = factor(dateOfSpeech, ordered = TRUE), y = prcntWords)) + geom_bar(stat = "identity", fill = "#A8CD1B") + scale_y_continuous(label = percent, breaks = pretty_breaks(7)) + xlab("Speech Date") + ylab("") + ggtitle("What Percentage of Words are Unique in the Prime Minister's Speeches?") + geom_hline(yintercept = .4, linetype = "dashed", colour = "white") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(angle = 90, hjust = 0, size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g1.png", sep = "/"), plot = g1, height = 10, width = 15, units='in', dpi=600) g2 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% group_by(dateOfSpeech) %>% summarise(uniqueWords = length(unique(word)), totalWords = length(word)) %>% ungroup() %>% arrange(dateOfSpeech) %>% ggplot(aes(x = uniqueWords, y = totalWords)) + geom_point(colour = "#A8CD1B", size = 3) + geom_smooth(colour = "#005A31") + scale_y_continuous(label = comma) + scale_x_continuous(label = comma) + xlab("Number of Unique Words") + ylab("Number of Total Words") + ggtitle("Does the Number of Unique Words in Speeches increase with an increase in Speech length?") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g2.png", sep = "/"), plot = g2, height = 10, width = 15, units='in', dpi=600) g3 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% group_by(dateOfSpeech) %>% summarise(`Unique Words` = length(unique(word)), `Total Words` = length(word)) %>% ungroup() %>% gather(variable, value, -dateOfSpeech) %>% arrange(dateOfSpeech) %>% ggplot(aes(x = factor(dateOfSpeech, ordered = TRUE), y = value, fill = variable)) + geom_bar(stat = "identity", position = "dodge") + scale_fill_manual(values = c("#005A31", "#A8CD1B"), name = "") + scale_y_continuous(label = comma, breaks = pretty_breaks(5)) + xlab("Speech Date") + ylab("") + ggtitle("How many Words does the Prime Minister use in his Speeches?\n") + theme_bw() + theme(legend.position = c(0.09,1.027), legend.direction = "horizontal", legend.text = element_text(size = rel(1.1)), text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(angle = 90, hjust = 0, size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g3.png", sep = "/"), plot = g3, height = 10, width = 15, units='in', dpi=600) g4 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = sentence, input = speech, token = "regex", pattern = "\\.") %>% group_by(dateOfSpeech) %>% summarise(totalSentences = length(sentence)) %>% ggplot(aes(x = factor(dateOfSpeech), y = totalSentences)) + geom_bar(stat = "identity", fill = "#A8CD1B") + scale_y_continuous(label = comma, breaks = pretty_breaks(5)) + xlab("Speech Date") + ylab("") + ggtitle("How many Sentences does the Prime Minister has in his Speeches?") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(angle = 90, hjust = 0, size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g4.png", sep = "/"), plot = g4, height = 10, width = 15, units='in', dpi=600) g5 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = sentence, input = speech, token = "regex", pattern = "\\.") %>% mutate(totalCharSentence = str_count(sentence, "\\S+")) %>% group_by(dateOfSpeech) %>% summarise(`Median Words in a Sentence` = median(totalCharSentence), `Mean Words in a Sentence` = mean(totalCharSentence)) %>% gather(variable, value, -dateOfSpeech) %>% arrange(dateOfSpeech) %>% ggplot(aes(x = factor(dateOfSpeech), y = value, group = variable, colour = variable)) + geom_line() + scale_colour_manual(values = c("#005A31", "#A8CD1B"), name = "") + scale_y_continuous(label = comma, breaks = pretty_breaks(5)) + xlab("Speech Date") + ylab("") + ggtitle("How many Words comprise a Sentence in the Prime Minister's Speeches?\n") + theme_bw() + theme(legend.position = c(0.165,1.027), legend.direction = "horizontal", legend.text = element_text(size = rel(1.1)), text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(angle = 90, hjust = 0, size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g5.png", sep = "/"), plot = g5, height = 10, width = 15, units='in', dpi=600) g6 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% group_by(word) %>% summarise(wordFreq = length(word)) %>% ungroup() %>% arrange(-wordFreq) %>% mutate(wordRank = 1:length(word)) %>% ggplot(aes(x = wordRank, y = wordFreq)) + geom_point(colour = "#A8CD1B") + scale_y_continuous(label = comma, breaks = pretty_breaks(5)) + scale_x_continuous(label = comma, breaks = pretty_breaks(10)) + xlab("Rank of Each Word") + ylab("Frequency of Each Word") + ggtitle("Rank Frequency Plot for the Collection of the Prime Minister's Speeches") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g6.png", sep = "/"), plot = g6, height = 10, width = 15, units='in', dpi=600) g7 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = sentence, input = speech, token = "regex", pattern = "\\.") %>% mutate(commasInSentence = nchar(gsub("[^,]", "", sentence))) %>% group_by(dateOfSpeech) %>% summarise(commasPerSentence = mean(commasInSentence)) %>% ungroup() %>% arrange(dateOfSpeech) %>% ggplot(aes(x = factor(dateOfSpeech), y = commasPerSentence)) + geom_bar(stat = "identity", fill = "#A8CD1B") + geom_hline(yintercept = 1, linetype = "dashed", colour = "white") + scale_y_continuous(label = comma, breaks = pretty_breaks(5)) + xlab("Speech Date") + ylab("") + ggtitle("How many commas does the Prime Miniser use in his Sentences on Average?") + theme_bw() + theme(legend.position = c(0.2,1.027), legend.direction = "horizontal", legend.text = element_text(size = rel(1.1)), text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(angle = 90, hjust = 0, size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g7.png", sep = "/"), plot = g7, height = 10, width = 15, units='in', dpi=600) g8 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = sentence, input = speech, token = "regex", pattern = "\\.") %>% mutate(colonsInSentence = nchar(gsub("[^:;]", "", sentence))) %>% group_by(dateOfSpeech) %>% summarise(colonsPerSentence = mean(colonsInSentence)) %>% ungroup() %>% arrange(dateOfSpeech) %>% ggplot(aes(x = factor(dateOfSpeech), y = colonsPerSentence)) + geom_bar(stat = "identity", fill = "#A8CD1B") + #geom_hline(yintercept = 1, linetype = "dashed", colour = "white") + scale_y_continuous(label = comma, breaks = pretty_breaks(5)) + xlab("Speech Date") + ylab("") + ggtitle("How many colons/semi-colons does the Prime Miniser use in his Sentences on Average?") + theme_bw() + theme(legend.position = c(0.2,1.027), legend.direction = "horizontal", legend.text = element_text(size = rel(1.1)), text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(angle = 90, hjust = 0, size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g8.png", sep = "/"), plot = g8, height = 10, width = 15, units='in', dpi=600) g9 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = sentence, input = speech, token = "regex", pattern = "\\.") %>% mutate(colonsInSentence = nchar(gsub("[^:;]", "", sentence))) %>% group_by(dateOfSpeech) %>% summarise(colonsInSpeech = sum(colonsInSentence)) %>% ungroup() %>% arrange(dateOfSpeech) %>% ggplot(aes(x = factor(dateOfSpeech), y = colonsInSpeech)) + geom_bar(stat = "identity", fill = "#A8CD1B") + #geom_hline(yintercept = 1, linetype = "dashed", colour = "white") + scale_y_continuous(label = comma, breaks = pretty_breaks(5)) + xlab("Speech Date") + ylab("") + ggtitle("How many colons/semi-colons does the Prime Miniser use in his Speeches?") + theme_bw() + theme(legend.position = c(0.2,1.027), legend.direction = "horizontal", legend.text = element_text(size = rel(1.1)), text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(angle = 90, hjust = 0, size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g9.png", sep = "/"), plot = g9, height = 10, width = 15, units='in', dpi=600) g10 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% mutate(wordLength = nchar(word)) %>% group_by(dateOfSpeech) %>% summarise(`Median Word Length` = median(wordLength), `Mean Word Length` = mean(wordLength)) %>% gather(variable, value, -dateOfSpeech) %>% arrange(dateOfSpeech) %>% ggplot(aes(x = factor(dateOfSpeech), y = value, group = variable, colour = variable)) + geom_line() + scale_colour_manual(values = c("#005A31", "#A8CD1B"), name = "") + scale_y_continuous(label = comma, breaks = pretty_breaks(5)) + xlab("Speech Date") + ylab("") + ggtitle("What is the Word Length on Average in the Prime Minister's Speeches?\n") + theme_bw() + theme(legend.position = c(0.165,1.027), legend.direction = "horizontal", legend.text = element_text(size = rel(1.1)), text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(angle = 90, hjust = 0, size = rel(1))) ggsave(paste(basePath, "Graphs", "nss_ba_g10.png", sep = "/"), plot = g10, height = 10, width = 15, units='in', dpi=600) rm(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10) gc(T,T) ### Stage 5: Preparing data sets for analyses # Sentence Level Measures: # 1. Sentence length # a. Avg no. of words in a sentence # b. Avg no. of characters in a sentence # 2. Punctuation # a. Avg no. of commas # b. Avg no. of semi colons and colons # 3. Type-Token ratio (different ones mentioned here: http://coltekin.net/cagri/courses/lingdiff/slides/grieve-referat.pdf) # Speech Level Measures # 1. Avg length of word # 2. No. of Punctuation marks by Characters in Speech slm_df1 <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = sentence, input = speech, token = "regex", pattern = "\\.") %>% filter(nchar(sentence) > 1) %>% # Some sentences consist only of a space mutate(totalWordSentence = str_count(sentence, "\\S+"), totalCharSentence = nchar(gsub("\\s+", "", sentence)), totalSColonSentence = nchar(gsub("[^:;]", "", sentence)), totalCommaSentence = nchar(gsub("[^,]", "", sentence)), totalPunctSentence = nchar(gsub("[^,:;]", "", sentence))) %>% # fullstops not included group_by(sentence) %>% mutate(ttRatio = length(unique(unlist(str_extract_all(sentence, "\\S+"))))/ length(unlist(str_extract_all(sentence, "\\S+")))) %>% group_by(name, dateOfSpeech) %>% summarise(avgWordsSentence = mean(totalWordSentence, na.rm = TRUE), avgCharSentence = mean(totalCharSentence, na.rm = TRUE), avgCommaSentence = mean(totalCommaSentence, na.rm = TRUE), avgSColonSentence = mean(totalSColonSentence, na.rm = TRUE), punctPerCharSpeech = sum(totalPunctSentence)/sum(totalCharSentence), avgLengthWordSpeech = sum(totalCharSentence)/sum(totalWordSentence), avgTTRatio = mean(ttRatio, na.rm = TRUE)) %>% ungroup() ggplot(data = slm_df1, aes(x = punctPerCharSpeech, y= avgLengthWordSpeech)) + geom_point(colour = "#A8CD1B", size = 2) + geom_smooth(colour = "#005A31") + scale_y_continuous(label = comma) + scale_x_continuous(label = comma) + ylab("Averange Length of a Word in a Speech") + xlab("Punctuation Per Character") + ggtitle("Comparing how the Length of Words Changes with the Rate of Punctuation") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(size = rel(1))) ggplot(data = slm_df1, aes(x = punctPerCharSpeech, y= avgCharSentence)) + geom_point(colour = "#A8CD1B", size = 2) + geom_smooth(colour = "#005A31") + scale_y_continuous(label = comma) + scale_x_continuous(label = comma) + ylab("Averange Number of Characters in a Sentence") + xlab("Punctuation Per Character") + ggtitle("Comparing how the Average Number of Characters in a Sentence Changes with the Rate of Punctuation") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(size = rel(1))) ggplot(data = slm_df1, aes(x = punctPerCharSpeech, y= avgTTRatio)) + geom_point(colour = "#A8CD1B", size = 2) + geom_smooth(colour = "#005A31") + scale_y_continuous(label = percent) + scale_x_continuous(label = comma) + ylab("Percentage of Unique Words in a Sentence") + xlab("Punctuation Per Character") + ggtitle("Percentage of Unique Words per Sentence decreases with an increase in Punctuation Rate") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1)), axis.text.x = element_text(size = rel(1))) g11 <- ggplot(data = slm_df1, aes(x = avgWordsSentence, y= avgTTRatio)) + geom_point(colour = "#A8CD1B", size = 2) + geom_smooth(colour = "#005A31") + scale_y_continuous(label = percent) + scale_x_continuous(label = comma) + ylab("Percentage of Unique Words in a Sentence") + xlab("Average Number of Words in a Sentence") + ggtitle("Percentage of Unique Words per Sentence decreases with an increase in Average No. of Words in a Sentence") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1.4)), axis.text.x = element_text(size = rel(1.4))) ggsave(paste(basePath, "Graphs", "nss_ba_g11.png", sep = "/"), plot = g11, height = 8, width = 15, units='in', dpi=400) g12 <- ggplot(data = slm_df1, aes(x = avgLengthWordSpeech, y= avgTTRatio)) + geom_point(colour = "#A8CD1B", size = 2) + geom_smooth(colour = "#005A31") + scale_y_continuous(label = percent) + scale_x_continuous(label = comma) + ylab("Percentage of Unique Words in a Sentence") + xlab("Average Length of Words in a Speech") + ggtitle("Percentage of Unique Words per Sentence against Average Length of Words in a Speech") + theme_bw() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1.4)), axis.text.x = element_text(size = rel(1.4))) ggsave(paste(basePath, "Graphs", "nss_ba_g12.png", sep = "/"), plot = g12, height = 8, width = 15, units='in', dpi=400) ####### Visualising the first data set cmd_df1 <- slm_df1 %>% select(avgWordsSentence, avgCharSentence, avgCommaSentence, avgSColonSentence, avgTTRatio) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% dist(method = "euclidean") %>% cmdscale(eig = TRUE) %>% autoplot pca_1 <- slm_df1 %>% select(-name) %>% mutate(speechYear = year(dateOfSpeech)) %>% data.frame %>% select(-dateOfSpeech) %>% PCA(, scale.unit = TRUE, ncp = 10, quali.sup = 8) plot(pca1, choix = "ind", col.quali = "#005A31", col.ind = "#A8CD1B", col.var = "black") sm_df2 <- slm_df1 %>% select(-name, -dateOfSpeech) %>% data.frame pca_2 <- sm_df2 %>% PCA(, scale.unit = TRUE, ncp = 20) hcpc1 <- pca_2 %>% HCPC ### Stage 6: POS Tagging # Function to extract POS tags extractPOS <- function(speech){ speechString <- as.String(speech) sent_token_annotator <- Maxent_Sent_Token_Annotator() word_token_annotator <- Maxent_Word_Token_Annotator() pos_tag_annotator <- Maxent_POS_Tag_Annotator() annotatedSpeech <- annotate(speechString, list(sent_token_annotator, word_token_annotator)) posAnnotatedSpeech <- annotate(speechString, pos_tag_annotator, annotatedSpeech) posTags <- posAnnotatedSpeech$features %>% map("POS") %>% unlist %>% paste(collapse = " ") return(posTags) } # Applying our function to PM's speeches - this takes approx. 15 minutes on my laptop speechPosTags <- speechListCleanEn %>% map(~ extractPOS(speech = .x)) speechPosTagsClean <- speechPosTags %>% map(~ gsub("[[:punct:]]+", "", .x)) %>% map(~ gsub("^\\s+|\\s+$", "", .x)) %>% map(~ gsub("\\s+", " ", .x)) #saveRDS(speechPosTagsClean, "speechPosTags.RDS") speechPosTagsClean <- readRDS("speechPosTags.RDS") ########## POS unigrams topPos <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags) %>% group_by(pos) %>% summarise(posFreq = n()) %>% arrange(-posFreq) %>% head(15) %>% select(pos) %>% unlist %>% unname posToUse <- data.frame(pos = topPos, stringsAsFactors = FALSE) cmd1 <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags) %>% inner_join(posToUse) %>% group_by(name, pos) %>% summarise(posFreq = n()) %>% arrange(-posFreq) %>% ungroup %>% spread(pos, posFreq, fill = 0) %>% select(-name) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% dist.cosine() %>% cmdscale(eig = TRUE) %>% autoplot(colour = "#005A31", size = 3) + ggtitle("Multi-Dimensional Scaling with Cosine Delta on Top 15 POS Tags") + theme_minimal() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1.4)), axis.text.x = element_text(size = rel(1.4))) ggsave(paste(basePath, "Graphs", "nss_posMds_g1.png", sep = "/"), plot = cmd1, height = 8, width = 15, units='in', dpi=400) t1 <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags) %>% inner_join(posToUse) %>% group_by(name, pos) %>% summarise(posFreq = n()) %>% arrange(-posFreq) %>% ungroup %>% spread(pos, posFreq, fill = 0) %>% select(-name) %>% PCA ######### POS Bigrams topPosFreq <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags, token = "ngrams", n = 2) %>% group_by(pos) %>% summarise(posFreq = n()) %>% arrange(-posFreq) %>% head(50) %>% select(pos) %>% unlist %>% unname topPosDoc <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags, token = "ngrams", n = 2) %>% group_by(pos) %>% summarise(appearInDocs = length(unique(name))) %>% arrange(-appearInDocs) %>% filter(appearInDocs/max(appearInDocs) >= .5) %>% select(pos) %>% unlist %>% unname %>% head(90) posToUse <- data.frame(pos = base::union(topPosFreq, topPosDoc), stringsAsFactors = FALSE) posBgDf <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags, token = "ngrams", n = 2) %>% inner_join(posToUse) %>% group_by(name, pos) %>% summarise(posFreq = n()) %>% arrange(-posFreq) %>% ungroup %>% spread(pos, posFreq, fill = 0) cmd2 <- posBgDf %>% select(-name) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% dist.cosine() %>% cmdscale(eig = TRUE) %>% autoplot(colour = "#005A31", size = 3) + ggtitle("Multi-Dimensional Scaling with Cosine Delta on Top 90 POS Bigram Tags") + theme_minimal() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1.4)), axis.text.x = element_text(size = rel(1.4))) ggsave(paste(basePath, "Graphs", "nss_posMds_g2.png", sep = "/"), plot = cmd2, height = 8, width = 15, units='in', dpi=400) ######### POS Bigrams Consensus Network topPosFreq <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags, token = "ngrams", n = 2) %>% group_by(pos) %>% summarise(posFreq = n()) %>% arrange(-posFreq) %>% head(50) %>% select(pos) %>% unlist %>% unname topPosDoc <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags, token = "ngrams", n = 2) %>% group_by(pos) %>% summarise(appearInDocs = length(unique(name))) %>% arrange(-appearInDocs) %>% filter(appearInDocs/max(appearInDocs) >= .5) %>% select(pos) %>% unlist %>% unname posToUse <- union(topPosFreq, topPosDoc) # I need 30 features for use in dissimilarity calculation for 1st batch, cumulatively summing up to 150 total features for the final batch featToUse <- 30 noOfBatch <- length(posToUse)/featToUse splitAt <- as.list(cumsum(rep(featToUse, times = noOfBatch))) # Function to split our data set into batches splitDfBatch <- function(toIncludeLen, posTags, df){ colsInd <- seq(from = 1, by = 1, to = toIncludeLen) colsToUse <- gsub("\\s+", "\\.", posTags[colsInd]) dfBatch <- df[, names(df) %in% colsToUse] return(dfBatch) } posBgDf <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean))), speechPosTags = unlist(speechPosTagsClean), stringsAsFactors = FALSE) %>% unnest_tokens(output = pos, input = speechPosTags, token = "ngrams", n = 2) %>% group_by(name, pos) %>% summarise(posFreq = n()) %>% ungroup %>% spread(pos, posFreq, fill = 0) %>% data.frame row.names(posBgDf) <- posBgDf$name posBgDfBatches <- map(.x = splitAt, .f = splitDfBatch, posTags = posToUse, df = posBgDf) %>% map(.x = ., .f = as.matrix) %>% map(.x = ., .f = scale, center = TRUE, scale = TRUE) %>% map(.x = ., .f = dist.cosine) %>% map(.x = ., .f = as.matrix) %>% map(.x = ., .f = melt) %>% map2(.x = ., .y = setNames(as.list(seq(from = 1, to = noOfBatch)), nm = rep("batch", times = noOfBatch)), .f = mutate) %>% map(.f = function(x) filter(x, Var1 != Var2)) %>% map(.f = function(x) group_by(.data = x, Var1)) %>% map(.f = function(x) top_n(x, n = -5, wt = value)) %>% rbindlist %>% rename(doc1 = Var1, doc2 = Var2, distance = value, batch = `.y[[i]]`) %>% arrange(doc1, batch, distance) %>% ungroup %>% setDT simDf <- posBgDfBatches[ ][, presentInBatch := length(unique(batch)), by = .(doc1, doc2) ][presentInBatch >= 3 ][, .(avgDistance = mean(distance)), by = .(doc1, doc2) ][, avgSimilarity := 1/avgDistance ][, .(doc1, doc2, avgSimilarity)] verticesDf <- data.frame(name = names(speechPosTagsClean), dateOfSpeech = as.character(ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechPosTagsClean)))), stringsAsFactors = FALSE) %>% mutate(speechCode = sprintf("%s - %d", "Speech", seq_along(name))) %>% setDT set.seed(1234) g1 <- graph_from_data_frame(d = simDf, directed = FALSE, vertices = verticesDf) g1 <- simplify(g1, edge.attr.comb="sum") wtc <- cluster_walktrap(g1) modularity(wtc); modularity(g1, membership(wtc)) #0.455 dfG1 <- data.frame(speeches = unclass(wtc)$names, groups = unclass(wtc)$membership, stringsAsFactors = FALSE) #this gives membership of each speech dfG2 <- inner_join(dfG1, verticesDf, by = c("speeches" = "name")) posNDf <- ggnetwork(g1, layout="fruchtermanreingold", arrow.gap=0, cell.jitter=0) %>% setDT posNDf[, dateOfSpeechInt := as.integer(posNDf$dateOfSpeech)] gcn_pos1 <- ggplot(data = posNDf) + geom_edges(aes(x = x, y = y, xend = xend, yend = yend, alpha = avgSimilarity), color = "grey50", curvature = 0.1, size = 1) + geom_nodes(aes(x = x, y = y, xend = xend, yend = yend, colour = dateOfSpeechInt), size = 5) + scale_color_distiller(palette = "Greens", direction = 1, guide = guide_colourbar(ticks = FALSE, barwidth = 10, barheight = .4, title = "Recency of Speech", title.position = "top", title.hjust = 1, title.vjust = 1, label.position = "top", direction = "horizontal", label = FALSE)) + geom_text_repel(data = unique(posNDf[, .(x, y, speechCode)]), aes(x = x, y = y, label = speechCode), colour = "grey50") + xlab("") + ylab("") + ggtitle("Consensus Network of the Prime Minister's Speeches") + theme(text = element_text(family = "Garamond"), axis.line = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank(), plot.background = element_rect(fill = "white"), legend.background = element_blank(), legend.key = element_blank(), legend.position = "top", plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2), colour = "grey30")) + guides(alpha = guide_legend(title = "Stylistic Similarity", title.position = "top", title.hjust = 1, title.vjust = 1, ticks = FALSE, direction = "horizontal", label = FALSE, keywidth = 3.1, keyheight = 1)) ggplot_with_subtitle(gcn_pos1, label = paste("Network developed using 5 batches with 30, 60, 90, 120, 150 MF POS bigrams.", "Darker edges represent stronger connections, whereas darker nodes represent recent speeches.", "", sep = "\n"), fontfamily = "Calibri", cex = 1.3, col = "black") ggsave(paste(basePath, "Graphs", "bcn_pos2a.png", sep = "/"), height = 8, width = 15, units='in', dpi = 400) ######### Most Frequent Function Words Unigrams speechListClean <- speechList %>% map(~ gsub("^\\s+|\\s+$", "", .x)) %>% # Removing leading & trailing whitespace map(~ gsub("\\t|\\r|\\n", " ", .x)) %>% # Removing tab/return/newline spaces map(~ gsub("Mr[.]*\\s*President", "", .x, ignore.case = TRUE)) %>% # Removing unrequired addresses map(~ gsub("Ladies and Gentlemen", "", .x, ignore.case = TRUE)) %>% map(~ gsub("Distinguished Guests*", "", .x, ignore.case = TRUE)) %>% map(~ gsub("Excellency|Excellencies", "", .x, ignore.case = TRUE)) %>% map(~ gsub("Quaid\\s*[eE]\\s*Azam", "quaideazam", .x, ignore.case = TRUE)) %>% map(~ gsub("Mr\\.", "Mr", .x, ignore.case = TRUE)) %>% map(~ gsub("I* Thank you", "", .x, ignore.case = TRUE)) %>% map(~ gsub("\\.", " fullstop ", .x)) %>% # Textualising a ., to later convert it back map(~ gsub(",", " comma ", .x)) %>% # Textualising a , to later convert it back map(~ gsub(";", " semicolon ", .x)) %>% # Textualising a ; map(~ gsub(":", " colon ", .x)) %>% # Textualising a : map(~ gsub("[^[:alnum:]+]", " ", .x)) %>% # Remove characters apart from alpha-numeric map(~ gsub("[[:digit:]]+", "", .x)) %>% # Remove digits map(~ gsub(pronounList, "", .x, ignore.case = TRUE)) %>% # Should probably remove some/all pronouns map(~ gsub(" fullstop ", " \\. ", .x)) %>% # Adding the . back in map(~ gsub(" comma ", " , ", .x)) %>% # Adding the , back in map(~ gsub(" semicolon ", " ; ", .x)) %>% # Adding the ; back in map(~ gsub(" colon ", " : ", .x)) %>% # Adding the : back in map(~ gsub("[.,;:]", " ", .x)) %>% # Now removing punct signs for enabling analyses map(~ gsub("\\s+", " ", .x)) %>% # Remove extra whitespace map(~ gsub("^\\s+|\\s+$", "", .x)) %>% # Removing leading & trailing whitespace again map(~ tolower(.x)) # Making text lowercase names(speechListClean) <- allFilesName speechLang <- speechListClean %>% map(~ substr(.x, start = 1, stop = 300)) %>% # Only using first 300 characters map(~ textcat(.x)) %>% # Identifying the language of each speech map_chr(1) # Extracting the first object from each item of list speechListCleanEn <- speechListClean[speechLang == "english"] notSpeeches <- c("20150106-21st Constitutional Amendment amendment in army act to prove vital in establishing peace harmony.txt", "20150108-PMAddress to Pakistani community in Bahrain.txt") speechListCleanEn <- speechListCleanEn[!names(speechListCleanEn) %in% notSpeeches] # Following words were found by sorting words for # 1. Number of times they appear in all speeches # 2. Number of speeches in which they appear wordsToRemove <- data.frame(word = unique( c("mr", "pakistan", "s", "mws", "mw", "e", "th", "economic", "development", "peace", "security", "countries", "country", "energy", "asia", "one", "cooperation", "international", "afghanistan", "national", "region", "investment", "two", "growth", "efforts", "terrorism", "sector", "years", "power", "trade", "foreign", "nuclear", "regional", "south", "global", "education", "business", "year", "china", "policy", "opportunities", "process", "economy", "support", "time", "afghan", "infrastructure", "president", "project", "future", "prosperity", "relations", "united", "india", "political", "challenges", "human", "resources", "state", "social", "peaceful", "stability", "nations", "gas", "potential", "vision", "major", "minister", "prime", "progress", "strategic", "un", "women", "connectivity", "billion", "financial", "issues", "opportunity", "community", "million", "mutual", "private", "water", "mr", "mister", "training", "address", "building", "common", "help", "promoting", "society", "past", "council", "dialogue", "youth", "build", "environment", "asian", "balochistan", "baluchistan", "goals", "commitment", "public", "states", "agenda", "conference", "forces", "military", "promote", "visit", "confident", "congratulate", "defence", "comprehensive", "ladies", "services", "special", "strengthen", "summit", "turkish", "assembly", "companies", "general", "increased", "interest", "priority", "strategy", "action", "bilateral", "challenge", "corridor", "extremism", "friendly", "initiatives", "market", "modern", "natural", "navy", "stable", "success", "thank", "capital", "policies", "programme", "access", "areas", "assure", "create", "culture", "leadership", "maritime", "government", "world", "central", "technology", "welcome", "projects", "partnership", "history", "initiative", "relationship", "sustainable", "democratic", "poverty", "population", "implementation", "ties", "objectives", "civil", "institutions", "principles", "cultural", "collaboration", "friendship", "gdp", "historic", "priorities", "sectors", "socio", "partners", "allah", "deficit", "decades", "facilities", "five", "health", "non", "inrerests", "pakistani", "resource", "vibrant", "capacity", "casa", "distinguished", "elected", "faith", "exchange", "launched", "market", "markets", "member", "members", "port", "responsibility", "strengthened", "zindabad", "august", "collective", "completion", "democracy", "economies", "governments", "integration", "islamabad", "karachi", "law", "promise", "regime", "system", "consensus", "standards", "dear", "gentlement", "ladies", "governance", "framework", "industrial", "industry", "oil", "paindabad", "information", "pipeline", "reconciliation", "students", "tax", "tradition", "traditions", "transparent", "armed", "budget", "city", "conclusion", "domestic", "decision", "decisions", "institutional", "operation", "media", "menace", "investors", "internal", "terrorist", "terrorists", "west", "thousands", "staff", "turkmenistan", "turkey", "xi", "ventures", "transformation", "technological", "technologies", "territorial", "strategies", "speech", "resolution", "reserves", "sovereignty", "regions", "religious", "repatriation", "refugees", "performance", "partner", "pak", "iran", "indicators", "indicator", "gwadar", "globalization", "hosting", "geographical", "europe", "dollars", "discrimination", "disarmement", "deterrence", "commerce", "brother", "brothers", "sisters", "sister", "boundaries", "association", "attacks", "soldiers", "violence", "scourge", "solidatory", "threats", "pakistanis", "neighbourhood", "fiscal", "fact", "tajikistan", "peacekeeping", "young", "army", "kashmir", "chinese", "investments", "gentlemen", "islamic", "friends", "percent", "people", "peoples", "provincial", "play", "saarc", "government", "development", "people", "am", "nation", "today", "important", "new", "between", "will", "both", "continue", "forward", "role", "wish", "including", "come", "strong", "towards")), stringsAsFactors = FALSE) # mw appears in mega watts, s is a leftover once apostraphes are removed, th is a left once numbers are removed # e appears in quaid e azam # e, th, s, mw, # s, e = 20130814-Prime Minister Muhammad Nawaz Sharifs address on Independence Day of Pakistan.txt # mw = 20130918-Prime Minister Muhammad Nawaz Sharifs speech for the business forum.txt # th = 20130921-PMs youth policy address 21st September 2013.txt topWordsFreq <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% anti_join(wordsToRemove) %>% group_by(word) %>% summarise(wordFreq = n()) %>% arrange(-wordFreq) %>% head(20) %>% select(word) %>% unlist %>% unname topWordsSpeech <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% anti_join(wordsToRemove) %>% group_by(word) %>% summarise(wordPresentInSpeeches = length(unique(name))) %>% arrange(-wordPresentInSpeeches) %>% filter(wordPresentInSpeeches/max(wordPresentInSpeeches) >= .5) %>% select(word) %>% unlist %>% unname wordsToUse <- data.frame(word = union(topWordsFreq, topWordsSpeech), stringsAsFactors = FALSE) stylDf <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% inner_join(wordsToUse) %>% group_by(name, dateOfSpeech, word) %>% summarise(totalCounts = n()) %>% ungroup %>% spread(word, totalCounts, fill = 0) %>% data.frame row.names(stylDf) <- 1:nrow(stylDf) cmd_df1 <- stylDf %>% select(-name, -dateOfSpeech) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% dist.cosine() %>% cmdscale(eig = TRUE) %>% autoplot(colour = "#005A31", size = 3) + ggtitle("Multi-Dimensional Scaling with Cosine Delta on Top 53 Most Frequent Words Unigrams") + theme_minimal() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1.4)), axis.text.x = element_text(size = rel(1.4))) ggsave(paste(basePath, "Graphs", "nss_mfwMds_g1.png", sep = "/"), plot = cmd_df1, height = 8, width = 15, units='in', dpi=400) hmap1 <- stylDf %>% select(-name, -dateOfSpeech) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% heatmap.2(., distfun = dist.cosine, scale = "none", trace = "none", main = "Heatmap of PM's Speeches using 53 Most Frequent Word Unigrams", col = magma(n = 100), dendrogram = "row", denscol = "steelblue") datMat <- stylDf %>% select(-name, -dateOfSpeech) %>% as.matrix row.names(datMat) <- stylDf$name distMat <- datMat %>% scale(center = TRUE, scale = TRUE) %>% dist.cosine %>% sqrt pam1 <- clusterboot(data = distMat, B = 500, bootmethod = "boot", clustermethod = pamkCBI, seed = 14551, count = FALSE) dfG2 <- data.frame(speeches = attr(pam1$result$result$pamobject$clustering, which = "names"), groups = pam1$result$result$pamobject$clustering, stringsAsFactors = FALSE) ##### Word bigrams wordsToRemove <- data.frame(word = unique(c("of pakistan", "in pakistan", "the world", "pakistan s", "the people", "peace and", "the government", "pakistan is", "peace and", "the government", "pakistan is", "pakistan and", "people of", "the country", "like to", "the region", "two countries", "prime minister", "continue to", "pakistan has", "the united", "south asia", "the international", "would like", "economic development", "government is", "and economic", "and security", "and prosperity", "development of", "in afghanistan", "wish to", "economic growth", "that pakistan", "the past", "opportunity to", "number of", "committed to", "united nations", "central asia", "development and", "the first", "confident that", "efforts to", "private sector", "the nation", "the two", "and stability", "the entire", "the global", "the national", "the un", "a peaceful", "government has", "pakistan in", "pakistan will", "role in", "to promote", "and development", "based on", "cooperation in", "nuclear security", "the energy", "to address", "economic and", "of peace", "to pakistan", "a strong", "commitment to", "forward to", "government of", "international community", "relations with", "stability in", "am confident", "of terrorism", "pakistan navy", "security and", "the security", "will continue", "economic corridor", "for all", "for pakistan", "government and", "growth and", "opportunities for", "socio economic", "the afghan", "and pakistan", "and social", "implementation of", "ladies and", "of government", "trade and", "a comprehensive", "a number", "believe that", "in south", "pakistan the", "the last", "this opportunity", "am sure", "for peace", "security council", "terrorism and", "the economy", "the pakistan", "the right", "to provide", "between pakistan", "government to", "in fact", "region and", "the new", "will be", "has been")), stringsAsFactors = FALSE) topWordsFreq <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech, token= "ngrams", n = 2) %>% anti_join(wordsToRemove) %>% group_by(word) %>% summarise(wordFreq = n()) %>% arrange(-wordFreq) %>% head(30) %>% select(word) %>% unlist %>% unname topWordsSpeech <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech, token = "ngrams", n = 2) %>% anti_join(wordsToRemove) %>% group_by(word) %>% summarise(wordPresentInSpeeches = length(unique(name))) %>% arrange(-wordPresentInSpeeches) %>% filter(wordPresentInSpeeches/max(wordPresentInSpeeches) >= .4) %>% select(word) %>% unlist %>% unname wordsToUse <- data.frame(word = union(topWordsFreq, topWordsSpeech), stringsAsFactors = FALSE) stylDf <- data.frame(name = names(speechListCleanEn), dateOfSpeech = ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn))), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech, token = "ngrams", n = 2) %>% inner_join(wordsToUse) %>% group_by(name, dateOfSpeech, word) %>% summarise(totalCounts = n()) %>% ungroup %>% spread(word, totalCounts, fill = 0) %>% data.frame row.names(stylDf) <- 1:nrow(stylDf) cmd_df1 <- stylDf %>% select(-name, -dateOfSpeech) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% dist.cosine() %>% cmdscale(eig = TRUE) %>% autoplot(colour = "#005A31", size = 3) + ggtitle("Multi-Dimensional Scaling with Cosine Delta on Top 33 Most Frequent Words Bigrams") + theme_minimal() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1.4)), axis.text.x = element_text(size = rel(1.4))) ggsave(paste(basePath, "Graphs", "nss_mfwMds_g2.png", sep = "/"), plot = cmd_df1, height = 8, width = 15, units='in', dpi=400) hmap2 <- stylDf %>% select(-name, -dateOfSpeech) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% heatmap.2(., distfun = dist.cosine, scale = "none", trace = "none", main = "Heatmap of PM's Speeches using\n33 Most Frequent Word Bigrams", col = magma(n = 100), dendrogram = "row", denscol = "steelblue") datMat <- stylDf %>% select(-name, -dateOfSpeech) %>% as.matrix row.names(datMat) <- stylDf$name distMat <- datMat %>% scale(center = TRUE, scale = TRUE) %>% dist.cosine %>% sqrt pam1 <- clusterboot(data = distMat, B = 500, bootmethod = "boot", clustermethod = pamkCBI, seed = 14551, count = FALSE) dfG2 <- data.frame(speeches = attr(pam1$result$result$pamobject$clustering, which = "names"), groups = pam1$result$result$pamobject$clustering, stringsAsFactors = FALSE) ###### Characters 4 grams stylDfRaw <- speechListCleanEn %>% map(~ quanteda::tokenize(x = .x, removePunct = TRUE, simplify = TRUE, what = "character")) %>% map(~ ngrams(x = ., n = 4, concatenator = "")) %>% map(~ table(.x)) %>% map(~ as.data.frame(.x)) %>% map2(.x = ., .y = as.list(names(speechListCleanEn)), .f = mutate) %>% rbindlist %>% rename(name = `.y[[i]]`, fourGramChar = `.x`, totalCounts = Freq) topCharsFreq <- stylDfRaw %>% group_by(fourGramChar) %>% summarise(wordFreq = sum(totalCounts)) %>% arrange(-wordFreq) %>% head(40) %>% select(fourGramChar) %>% unlist %>% unname topCharsDoc <- stylDfRaw %>% group_by(fourGramChar) %>% summarise(charPresentInSpeeches = n()) %>% arrange(-charPresentInSpeeches) %>% filter(charPresentInSpeeches/max(charPresentInSpeeches) >= .5) %>% select(fourGramChar) %>% unlist %>% unname charsToUse <- data.frame(fourGramChar = union(topCharsFreq, topCharsDoc), stringsAsFactors = FALSE) stylDf <- stylDfRaw %>% inner_join(charsToUse) %>% group_by(name, fourGramChar) %>% summarise(totalCounts = sum(totalCounts)) %>% ungroup %>% spread(fourGramChar, totalCounts, fill = 0) %>% data.frame row.names(stylDf) <- unique(stylDf$name) cmd_df1 <- stylDf %>% select(-name) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% dist.cosine() %>% cmdscale(eig = TRUE) %>% autoplot(colour = "#005A31", size = 3) + ggtitle("Multi-Dimensional Scaling with Cosine Delta on Top 1,080 Most Frequent Character 4-grams") + theme_minimal() + theme(text = element_text(family = "Calibri", colour = "grey30"), plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2)), panel.grid = element_blank(), panel.border = element_blank(), axis.text.y = element_text(size = rel(1.4)), axis.text.x = element_text(size = rel(1.4))) ggsave(paste(basePath, "Graphs", "nss_mfcMds_g1.png", sep = "/"), plot = cmd_df1, height = 8, width = 15, units='in', dpi=400) ##### Character 4grams Consensus Network # I need 100 features for use in dissimilarity calculation for 1st batch, cumulatively summing up to 1,080 total features for the final batch featToUse <- 90 noOfBatch <- nrow(charsToUse)/featToUse splitAt <- as.list(cumsum(rep(featToUse, times = noOfBatch))) # Function to split our data set into batches splitDfBatch <- function(toIncludeLen, chars, df){ colsInd <- seq(from = 1, by = 1, to = toIncludeLen) colsToUse <- gsub("\\s+", "\\.", chars[colsInd]) dfBatch <- df[, names(df) %in% colsToUse] return(dfBatch) } posBgDfBatches <- map(.x = splitAt, .f = splitDfBatch, chars = unlist(charsToUse), df = stylDf) %>% map(.x = ., .f = as.matrix) %>% map(.x = ., .f = scale, center = TRUE, scale = TRUE) %>% map(.x = ., .f = dist.cosine) %>% map(.x = ., .f = as.matrix) %>% map(.x = ., .f = melt) %>% map2(.x = ., .y = setNames(as.list(seq(from = 1, to = noOfBatch)), nm = rep("batch", times = noOfBatch)), .f = mutate) %>% map(.f = function(x) filter(x, Var1 != Var2)) %>% map(.f = function(x) group_by(.data = x, Var1)) %>% map(.f = function(x) top_n(x, n = -5, wt = value)) %>% rbindlist %>% rename(doc1 = Var1, doc2 = Var2, distance = value, batch = `.y[[i]]`) %>% arrange(doc1, batch, distance) %>% ungroup %>% setDT simDf <- posBgDfBatches[ ][, presentInBatch := length(unique(batch)), by = .(doc1, doc2) ][presentInBatch >= 3 ][, .(avgDistance = mean(distance)), by = .(doc1, doc2) ][, avgSimilarity := 1/avgDistance ][, .(doc1, doc2, avgSimilarity)] verticesDf <- data.frame(name = names(speechListCleanEn), dateOfSpeech = as.character(ymd(gsub("^([[:digit:]]+)-.+$", "\\1", names(speechListCleanEn)))), stringsAsFactors = FALSE) %>% mutate(speechCode = sprintf("%s - %d", "Speech", seq_along(name))) %>% setDT set.seed(1234) g1 <- graph_from_data_frame(d = simDf, directed = FALSE, vertices = verticesDf) g1 <- simplify(g1, edge.attr.comb = "sum") wtc <- cluster_walktrap(g1) modularity(wtc); modularity(g1, membership(wtc)) dfG1 <- data.frame(speeches = unclass(wtc)$names, groups = unclass(wtc)$membership, stringsAsFactors = FALSE) #this gives membership of each speech dfG2 <- inner_join(dfG1, verticesDf, by = c("speeches" = "name")) posNDf <- ggnetwork(g1, layout="fruchtermanreingold", arrow.gap=0, cell.jitter=0) %>% setDT posNDf[, dateOfSpeechInt := as.integer(posNDf$dateOfSpeech)] gcn_4c1 <- ggplot(data = posNDf) + geom_edges(aes(x = x, y = y, xend = xend, yend = yend, alpha = avgSimilarity), color = "grey50", curvature = 0.1, size = 1) + geom_nodes(aes(x = x, y = y, xend = xend, yend = yend, colour = dateOfSpeechInt), size = 5) + scale_color_distiller(palette = "Greens", direction = 1, guide = guide_colourbar(ticks = FALSE, barwidth = 10, barheight = .4, title = "Recency of Speech", title.position = "top", title.hjust = 1, title.vjust = 1, label.position = "top", direction = "horizontal", label = FALSE)) + geom_text_repel(data = unique(posNDf[, .(x, y, speechCode)]), aes(x = x, y = y, label = speechCode), colour = "grey50") + xlab("") + ylab("") + ggtitle("Consensus Network of the Prime Minister's Speeches using Character 4-grams") + theme(text = element_text(family = "Garamond"), axis.line = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank(), plot.background = element_rect(fill = "white"), legend.background = element_blank(), legend.key = element_blank(), legend.position = "top", plot.title = element_text(hjust = 0, family = "Garamond", size = rel(2), colour = "grey30")) + guides(alpha = guide_legend(title = "Stylistic Similarity", title.position = "top", title.hjust = 1, title.vjust = 1, ticks = FALSE, direction = "horizontal", label = FALSE, keywidth = 3.1, keyheight = 1)) ggsave(paste(basePath, "Graphs", "bcn_char4g_2.png", sep = "/"), height = 8, width = 15, units='in', dpi = 400) ##### Combining all all_df1 <- inner_join(stylDf, selm_df1) %>% select(-name, -dateOfSpeech) %>% as.matrix %>% scale(center = TRUE, scale = TRUE) %>% dist.cosine() %>% cmdscale %>% data.frame cmd_df1$name <- 1:nrow(cmd_df1) ggplot(data = cmd_df1, aes(x = X1, y = X2)) + geom_text(aes(label = name)) cmd_df1 <- stylDf %>% select(-name, -dateOfSpeech) %>% mutate_each(funs((. - min(.))/(max(.) - min(.)))) %>% as.matrix %>% dist.cosine %>% cmdscale %>% data.frame cmd_df1$name <- 1:nrow(cmd_df1) ggplot(data = cmd_df1, aes(x = X1, y = X2)) + geom_text(aes(label = name)) pca1 <- stylDf %>% select(-name, -dateOfSpeech) %>% PCA(ncp = 10) # Speech Level Measures # 1. Type-Token Ratio (Herdan's C) # Character n-grams for most frequent ones # Word n-grams for most frequent ones # Test these separately, combined, do voting to find clusters # Maybe after clusters are formed, apply arules on most frequent words to find any differences, or form a writeprint by combining # different features (all you mentioned above + mfw) and then use arules to find associations, as per this paper:http://www.sciencedirect.com/science/article/pii/S1742287615000572 # commas are not very indicative since they appear in numbers too # enron authorship attribution = is success due to ngrams? yes: https://www.uvic.ca/engineering/ece/isot/assets/docs/Authorship_Verification_for_Short_Messages_using_Stylometry.pdf character ngram (4) and word ngrams (2, 3) http://www.aicbt.com/authorship-attribution/ also read 5 (Authorship similarity detection from email messages), 16 (Authorship verification as a one-class classification problem) 2, 4, 3,5,6,7 8 Chaski 2001 # authorship attrubtion clustering # try clustering algos for DNAs # try tsne, mds, arules, pam # feature engineerning: #stylistic: POS tagging, length of sentences, avg length of words, richness of vocabulary, word n-grams, lexical diversity (unique no. of words), lexical density (Number of different words / Total number of words) x 100, comma in speech, comma per sentence also, do not take less common words The Fog Index is a readability test designed to show how easy or difficult a text is to read. It uses the following formula: Reading Level (Grade) = (Average No. of words in sentences + Percentage of words of three or more syllables) x 0.4 # simply using function words may not suffice, since our documents are not long enough ######## Humans acquire habits in their usage of language, which are exhibited in speeches, etc. Such stylistic and vocabulary patterns combine to form a fingerprint lexical density has also been graphed in your charts... most have > 40% but mostly less than 50% # scale_fill_manual(values = c("#98dbc6", "#e6d72a")) # scale_fill_manual(values = c("#f4cc70", "#20948b")) # scale_fill_manual(values = c("#5d535e", "#ec96a4")) # scale_fill_manual(values = c("#063852", "#e6df44")) # scale_fill_manual(values = c("#500805", "#bc6d4f")) ### Stage 4: Identify words that are specific to each speech in order to remove them --- we need stylistic patterns specWords <- data.frame(name = names(speechListCleanEn), speechNumber = 1:length(speechListCleanEn), speech = unlist(speechListCleanEn), stringsAsFactors = FALSE) %>% unnest_tokens(output = word, input = speech) %>% group_by(word, name) %>% summarise(countWithinSpeech = n()) %>% ungroup %>% group_by(word) %>% summarise(foundInSpeeches = length(unique(name)), overallCount = sum(countWithinSpeech)) speechListCleanEn %>% #unlist %>% data.frame(speech = unlist(.), stringsAsFactors = FALSE) %>% nrow