#' Import dependencies:
require(parallel)
require(RColorBrewer)
require(wordcloud)
require(wordcloud2)
require(htmlwidgets)
require(webshot)
webshot::install_phantomjs()

#' Usage (always printed, so the expected invocation shows up in the log):
message("\nUsage:\nRscript prot-scriber-word-cloud.R input-prot-scriber-table.txt output-files-name (the file-name without file-extension)\n")

#' Get arguments (only those given after the script name):
script.args <- commandArgs(trailingOnly = TRUE)
#' Validate: exactly two arguments are required. The explanation is handed to
#' stop() itself so that the raised error carries a message instead of being
#' empty.
if (length(script.args) != 2) {
    stop("Please provide two command line arguments. See above 'Usage'.",
        call. = FALSE)
}
#' First argument: the prot-scriber result table to read.
prot.scriber.inp.tbl <- script.args[[1]]
#' Second argument: the output file name without file-extension; the
#' extensions are appended further below.
output.files.name <- script.args[[2]]

#' Speed up computation: use all but one of the detected cores, but never
#' fewer than one. detectCores() can return 1 (single-core machine) or NA
#' (core count unknown); 'detectCores() - 1' would then be 0 or NA, neither
#' of which is a valid 'mc.cores' value.
options(mc.cores = max(1L, detectCores() - 1L, na.rm = TRUE))

#' For reproducibility of everything below that draws random numbers:
set.seed(1234)

#' Load prot-scriber result table (tab separated, with a header line).
#' Quote and comment characters are disabled because the descriptions are free
#' text: with read.table's defaults an apostrophe (e.g. "3'-5' exonuclease")
#' opens a quoted field and a '#' truncates the line, both of which corrupt
#' the parsed table.
ps.tbl <- read.table(prot.scriber.inp.tbl, sep = "\t",
    header = TRUE, stringsAsFactors = FALSE, quote = "",
    comment.char = "")

#' Fail early with a clear message if the description column is missing;
#' otherwise the filter below would silently yield an empty table.
if (!"Human.Readable.Description" %in% colnames(ps.tbl)) {
    stop("Input table '", prot.scriber.inp.tbl,
        "' has no column 'Human.Readable.Description'.",
        call. = FALSE)
}

#' Exclude 'unknown protein' or 'unknown sequence family'. 'drop = FALSE'
#' keeps the result a data.frame even if the table has a single column.
ps.tbl.clean <- ps.tbl[!(ps.tbl$Human.Readable.Description %in%
    c("unknown protein", "unknown sequence family")),
    , drop = FALSE]

#' Exclude the following words as non informative in terms of the word-cloud:
non.inf.words <- c("protein", "containing", "family",
    "domain", "isoform", "subunit", "associated", "duf",
    "factor", "dependent", "binding", "of", "and",
    "or")


#' Extract words: split every description on runs of whitespace, so that
#' repeated blanks do not produce empty-string "words".
all.words <- as.character(unlist(strsplit(
    as.character(ps.tbl.clean$Human.Readable.Description),
    split = "\\s+", perl = TRUE)))
#' Drop missing and empty tokens (the latter arise from leading whitespace):
all.words <- all.words[!is.na(all.words) & nzchar(all.words)]
#' Unique words in order of first appearance, without the non informative ones:
unq.wrds <- setdiff(unique(all.words), non.inf.words)
#' Exclude only numbers or single letters:
unq.wrds <- unq.wrds[!grepl("^(\\d+|[a-z])$", unq.wrds,
    perl = TRUE)]
#' Count the occurrences of each retained word in a single pass: match() maps
#' every token to its index in 'unq.wrds' (NA for excluded words, which
#' tabulate() ignores), so the frequencies line up with 'unq.wrds'.
words.df <- data.frame(word = unq.wrds, freq = as.numeric(tabulate(match(all.words,
    unq.wrds), nbins = length(unq.wrds))), stringsAsFactors = FALSE)
#' Without any word left there is nothing to plot; stop with an explanation
#' instead of failing inside the plotting code.
if (nrow(words.df) == 0) {
    stop("No informative words found in the human readable descriptions.",
        call. = FALSE)
}


#' Draw the first word-cloud into a PDF named after the output file name:
type.one.pdf <- paste0(output.files.name, "_word_cloud_type_one.pdf")
pdf(type.one.pdf)
#' The minimum frequency passed to wordcloud() is the 1/3 quantile of all
#' word frequencies; at most 200 words are requested.
min.word.freq <- quantile(words.df$freq, 1/3)
#' Eight colors of the RColorBrewer 'Dark2' palette:
cloud.palette <- brewer.pal(8, "Dark2")
wordcloud(
    words = words.df$word,
    freq = words.df$freq,
    min.freq = min.word.freq,
    max.words = 200,
    random.order = FALSE,
    rot.per = 0.35,
    colors = cloud.palette
)
#' Close the PDF device so that the file is written completely:
dev.off()


#' Generate word-cloud 2 (an HTML widget built from the word / frequency
#' table):
wrd.cld.2 <- wordcloud2(data = words.df, size = 1.6,
    color = "random-dark")
#' save as HTML. 'FALSE' is spelled out because the alias 'F' is an ordinary
#' variable that can be reassigned.
html.out <- paste0(output.files.name, "_word_cloud_type_two.html")
saveWidget(wrd.cld.2, html.out, selfcontained = FALSE)
#' and as PDF, by taking a 480 x 480 screenshot of the saved HTML page after
#' a delay of 5 (seconds, per the webshot documentation):
webshot(html.out, paste0(output.files.name, "_word_cloud_type_two.pdf"),
    delay = 5, vwidth = 480, vheight = 480)


#' The End
message("DONE")