##########################################################################################
## HarvardX Data Science Capstone CYOP Project Skillability.
##
## Name:           skillability.r
## Description:    This file provides the data exploration, visualisation, modeling,
##                 method and result analysis for the Stack Overflow data.
## Recommendation: A Unix-like environment is recommended to run the parallel multi-core
##                 implementations needed for caret::train(...) and parallel::mclapply(...)
## Requirements:   R 3.6.x installed with all required packages.
## Author:         Giovanni Azua Garcia
## Code License:   AGPL v3.0 https://www.gnu.org/licenses/agpl-3.0.en.html
## Data License:   CC BY-SA 4.0 https://creativecommons.org/licenses/by-sa/4.0/
## References:     https://projecteuclid.org/download/pdf_1/euclid.ss/1032280214
##                 http://statweb.stanford.edu/~ckirby/brad/papers/2018Automatic-Construction-BCIs.pdf
##                 http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/R/R-Manual/R-Manual16.html
##########################################################################################

# clean the environment
rm(list = ls())

# trigger garbage collection and free some memory if possible
gc(TRUE, TRUE, TRUE)

##########################################################################################
## Install and load required library dependencies
##########################################################################################
defaultRepos <- "http://cran.us.r-project.org"
if(!require(tidyverse)) install.packages("tidyverse", repos = defaultRepos)
if(!require(caret)) install.packages("caret", repos = defaultRepos)
if(!require(boot)) install.packages("boot", repos = defaultRepos)
if(!require(purrr)) install.packages("purrr", repos = defaultRepos)
if(!require(data.table)) install.packages("data.table", repos = defaultRepos)
if(!require(tictoc)) install.packages("tictoc", repos = defaultRepos)
if(!require(lubridate)) install.packages("lubridate", repos = defaultRepos)
if(!require(stringr)) install.packages("stringr", repos = defaultRepos)
if(!require(parallel)) install.packages("parallel", repos = defaultRepos)
if(!require(ggplot2)) install.packages("ggplot2", repos = defaultRepos)
if(!require(ggmap)) install.packages("ggmap", repos = defaultRepos)
if(!require(ggrepel)) install.packages("ggrepel", repos = defaultRepos)
if(!require(scales)) install.packages("scales", repos = defaultRepos)
if(!require(RColorBrewer)) install.packages("RColorBrewer", repos = defaultRepos)
if(!require(Metrics)) install.packages("Metrics", repos = defaultRepos)
if(!require(rstudioapi)) install.packages("rstudioapi", repos = defaultRepos)
if(!require(here)) install.packages("here", repos = defaultRepos)

##########################################################################################
## Setup initial global values
##########################################################################################
# check the platform OS type
if(.Platform$OS.type == "unix") {
  # use doMC when it's a Unix-like OS
  if(!require(doMC)) install.packages("doMC", repos = defaultRepos)

  # register cores for parallel processing
  ncores <- detectCores()
  registerDoMC(ncores)
} else {
  # NOTE! porting the multi-core code to Windows is still pending
  ncores <- 1
}
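# Quick sanity check (optional, a sketch): registerDoMC wires up the foreach backend
# that caret relies on when trainControl(allowParallel = TRUE); the number of
# registered workers can be inspected as follows.
if (FALSE) {
  foreach::getDoParWorkers()  # should equal ncores on Unix-like systems
}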
# best attempt to set the working path to this file's path
if (rstudioapi::isAvailable()) {
  currentPath <- dirname(rstudioapi::getActiveDocumentContext()$path)
} else {
  # not very accurate, will default to the home folder and not to this file's path
  currentPath <- here()
}
setwd(currentPath)
print(getwd())

##########################################################################################
## Define important reusable functions e.g. the portable.set.seed(...)
##########################################################################################
# Portable set.seed function (across R versions) implementation
#
# @param seed the seed number
portable.set.seed <- function(seed) {
  if (R.version$minor < "6") {
    set.seed(seed)
  } else {
    set.seed(seed, sample.kind="Rounding")
  }
}

# Returns the file path for the given object name.
#
# @param objectName the name of the object e.g. "Users"
# @param prefixDir the prefix directory where all data is stored e.g. "data"
# @param rdsDir the directory where the RDS files are located e.g. "rds"
# @param ext the extension for the RDS files i.e. ".rds"
# @returns the file path for the given dataset name.
filePathForObjectName <- function(objectName, prefixDir="data", rdsDir="rds", ext=".rds") {
  rdsFolder <- file.path(prefixDir, rdsDir)
  if (!dir.exists(rdsFolder)) {
    dir.create(rdsFolder, recursive = T)
  }
  fileName <- paste(objectName, ext, sep="")
  filePath <- file.path(rdsFolder, fileName)
  return(filePath)
}

# Returns the object (dataset or otherwise) by name. It will either load the object from
# an RDS file if it exists or download it from GitHub automatically. If downloaded, the
# file is created in the expected location so that it won't be downloaded again.
#
# @param objectName the name of the dataset e.g. "Users"
# @param prefixDir the prefix directory where all data is stored e.g. "data"
# @param rdsDir the directory where the RDS files are located e.g. "rds"
# @param ext the extension for the RDS files i.e. ".rds"
# @param baseUrl the base GitHub url where the data is located.
# @param userName the GitHub user name e.g. "bravegag"
# @param repoName the GitHub repository name e.g. "HarvardX-Skillability"
# @param branchName the GitHub branch name e.g. "master"
# @returns the dataset by name.
readObjectByName <- function(objectName, prefixDir="data", rdsDir="rds", ext=".rds",
                             userName="bravegag", repoName="HarvardX-Skillability",
                             branchName="master",
                             baseUrl="https://github.com/%s/%s/blob/%s/data/rds/%s?raw=true") {
  tryCatch({
    filePath <- filePathForObjectName(objectName = objectName, prefixDir = prefixDir,
                                      rdsDir = rdsDir, ext = ext)
    fileName <- basename(filePath)
    if (!file.exists(filePath)) {
      # download the file
      url <- sprintf(baseUrl, userName, repoName, branchName, fileName)
      cat(sprintf("downloading \"%s\"\n", url))
      download.file(url, filePath, extra="L", mode="wb")
    } else {
      cat(sprintf("object \"%s\" exists, skipping download ...\n", filePath))
    }
    return(readRDS(filePath))
  }, warning = function(w) {
    cat(sprintf("WARNING - attempting to access or download the %s data:\n%s\n", objectName, w))
    return(NULL)
  }, error = function(e) {
    cat(sprintf("ERROR - attempting to access or download the %s data:\n%s\n", objectName, e))
    return(NULL)
  }, finally = {
    # nothing to do here
  })
}
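# Minimal illustration of the caching behaviour (not part of the analysis): the first
# call downloads data/rds/Users.rds from the GitHub repository, any later call
# short-circuits to readRDS on the local copy.
if (FALSE) {
  users1 <- readObjectByName("Users") # downloads if data/rds/Users.rds is missing
  users2 <- readObjectByName("Users") # served from the local RDS cache
  stopifnot(identical(users1, users2))
}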
# Saves the object (dataset or otherwise) by name, the required folders will be
# created if they don't already exist.
#
# @param object the object e.g. tibble or data frame
# @param objectName the name of the object e.g. "Users"
# @param prefixDir the prefix directory where all data is stored e.g. "data"
# @param rdsDir the directory where the RDS files are located e.g. "rds"
# @param ext the extension for the RDS files i.e. ".rds"
saveObjectByName <- function(object, objectName, prefixDir="data", rdsDir="rds", ext=".rds") {
  filePath <- filePathForObjectName(objectName = objectName, prefixDir = prefixDir,
                                    rdsDir = rdsDir, ext = ext)
  saveRDS(object=object, file=filePath)
}

##########################################################################################
## Data exploration and visualization: Basic
##########################################################################################
# load the Users, Questions, Answers, Badges and Tags data files
users <- readObjectByName("Users")
questions <- readObjectByName("Questions")
answers <- readObjectByName("Answers")
badges <- readObjectByName("Badges")
tags <- readObjectByName("Tags")

# what's the question with the highest score? 11227809
# https://stackoverflow.com/questions/11227809/
questions %>% top_n(1, score) %>%
  select(questionId, score, answerCount, commentCount, favoriteCount, tags, creationDate)

# what's the answer with the highest score? the top answer to question 11227809
answers %>% top_n(1, score) %>%
  select(answerId, score, commentCount, creationDate)

# who's the top user? 22656
# https://stackoverflow.com/users/22656/jon-skeet
users %>% top_n(1, reputation) %>%
  select(userId, reputation, creationDate, location, upvotes, downvotes)

# what are the top ten tags / skills?
topTenSkills <- tags %>% top_n(10, count) %>%
  arrange(desc(count)) %>%
  rename(skill=tag)
topTenSkills

# what's the average user reputation?
users %>% summarise(mean=mean(reputation), median=median(reputation))

# what's the average number of questions per user? 5.35
questions %>% group_by(userId) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  summarise(mean=mean(n), median=median(n))

# what's the average number of answers per user? 26.3
answers %>% group_by(userId) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  summarise(mean=mean(n), median=median(n))

# what's the average number of answers per question? 2.27
questions %>% summarise(mean=mean(answerCount), median=median(answerCount))
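# Note on the queries above: dplyr::top_n() keeps ties, so top_n(1, score) may return
# more than one row whenever several posts share the maximum score.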
# histogram of the log10-transformed users' reputation: positively skewed!
# NOTE! users with reputation below 1000 are excluded
users %>% filter(reputation > 999) %>%
  mutate(reputation=log10(reputation)) %>%
  ggplot(aes(reputation)) +
  geom_histogram(bins = 200, colour="#377EB8", fill="#377EB8") +
  ggtitle("Users log10 reputation histogram") +
  xlab("log10 reputation") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.text = element_text(size=12),
        axis.text.x = element_text(angle = 45, hjust = 1))

# histogram of the log10-transformed users' reputation per badge,
# again excluding users with reputation below 1000
users %>% filter(reputation > 999) %>%
  mutate(reputation=log10(reputation)) %>%
  inner_join(badges %>% select(userId, badge) %>%
               filter(badge %in% c("Populist", "Great Answer", "Guru", "Great Question",
                                   "Good Answer", "Good Question", "Nice Answer",
                                   "Nice Question")), by="userId") %>%
  mutate(badge=factor(badge, levels=c("Populist", "Great Answer", "Guru", "Great Question",
                                      "Good Answer", "Good Question", "Nice Answer",
                                      "Nice Question"))) %>%
  ggplot(aes(reputation, group=badge, color=badge, fill=badge)) +
  geom_histogram(bins = 200) +
  ggtitle("Users log10 reputation histogram") +
  xlab("log10 reputation") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.text = element_text(size=12)) +
  facet_wrap(~badge)

##########################################################################################
## Data exploration and visualization: Top skills & technology trends
##########################################################################################
# select the top 2k tags/skills by count
mainSkills <- tags %>% top_n(2000, count) %>%
  rename(skill=tag) %>%
  arrange(desc(count))

# what's their proportion of the total tag count? 82.3%
100*sum(mainSkills$count)/sum(tags$count)

# smaller subset of questions matching the main tags
questionSkills <- questions %>% filter(score > 9 & viewCount > 99 & answerCount > 1)

# takes ~35s
tic(sprintf('separating rows with %d', nrow(questionSkills)))
questionSkills <- questionSkills %>% select(questionId, tags) %>%
  separate_rows(tags, sep="\\|") %>%
  rename(skill=tags) %>%
  inner_join(mainSkills, by="skill") %>%
  arrange(desc(count)) %>%
  select(questionId, skill)
str(questionSkills)
toc()

# takes ~15m; flip the guard to TRUE to recompute the co-occurrence matrix
if (FALSE) {
  tic(sprintf('computing co-occurrence matrix with %d question-skill', nrow(questionSkills)))
  X <- crossprod(table(questionSkills[1:2]))
  diag(X) <- 0
  toc()
  saveObjectByName(X, "XCo-occurrence")
}
X <- readObjectByName("XCo-occurrence")
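# Aside: a tiny, self-contained illustration (made-up data) of the co-occurrence trick
# used above. crossprod(table(...)) on a long (id, tag) table counts, for every pair of
# tags, how many ids they share; the diagonal (a tag paired with itself) is zeroed out.
if (FALSE) {
  toy <- data.frame(questionId = c(1, 1, 2, 2, 2, 3),
                    skill      = c("r", "dplyr", "r", "ggplot2", "dplyr", "r"))
  XToy <- crossprod(table(toy))
  diag(XToy) <- 0
  XToy # e.g. XToy["r", "dplyr"] == 2: the tags co-occur in questions 1 and 2
}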
# how sparse is it?
sum(X == 0)/(dim(X)[1]^2)

# compute PCA
pca <- prcomp(X)

# plot the variability of each of the vectors
pc <- 1:50
qplot(pc, pca$sdev[pc])

# plot the variability explained
var_explained <- cumsum(pca$sdev^2 / sum(pca$sdev^2))
qplot(pc, var_explained[pc])

# create a tibble containing the first two principal components
pcs <- tibble(skill = rownames(pca$rotation),
              PC1=pca$rotation[,"PC1"],
              PC2=pca$rotation[,"PC2"])

# highlight the top ten tags
pcs <- pcs %>% mutate(fontface=ifelse(skill %in% (topTenSkills %>% pull(skill)),
                                      'bold', 'plain'))

technologies <- c("Blockchain, Cloud, Build & Data Viz", "Full Stack",
                  "Web Frontend & Mobile", "Microsoft Stack", "Python & C++",
                  "Software Engineering", "Javascript", "iOS Stack", "Other")

# choose 9 colors: 2x4 components plus everything else
colorPalette <- RColorBrewer::brewer.pal(name='Set1', n=9)
colorSpec <- colorPalette[1:9]
names(colorSpec) <- technologies

# maximum number of tags to choose in each direction
M <- 25
highlight <- pcs %>% arrange(PC1) %>% slice(1:M) %>%
  mutate(Technology=technologies[1], pc=1)
highlight <- pcs %>% anti_join(highlight, by="skill") %>%
  arrange(desc(PC1)) %>% slice(1:M) %>%
  mutate(Technology=technologies[2], pc=1) %>%
  bind_rows(highlight)
highlight <- pcs %>% anti_join(highlight, by="skill") %>%
  arrange(PC2) %>% slice(1:M) %>%
  mutate(Technology=technologies[3], pc=2) %>%
  bind_rows(highlight)
highlight <- pcs %>% anti_join(highlight, by="skill") %>%
  arrange(desc(PC2)) %>% slice(1:M) %>%
  mutate(Technology=technologies[4], pc=2) %>%
  bind_rows(highlight)
nonHighlight <- pcs %>% anti_join(highlight, by="skill") %>%
  mutate(Technology=technologies[9])

# switch to the 3rd and 4th principal components
pcs <- tibble(skill = rownames(pca$rotation),
              PC3=pca$rotation[,"PC3"],
              PC4=pca$rotation[,"PC4"])

# highlight the top ten tags
pcs <- pcs %>% mutate(fontface=ifelse(skill %in% (topTenSkills %>% pull(skill)),
                                      'bold', 'plain'))
highlight <- pcs %>% anti_join(highlight, by="skill") %>%
  arrange(PC3) %>% slice(1:M) %>%
  mutate(Technology=technologies[5], pc=3) %>%
  bind_rows(highlight)
highlight <- pcs %>% anti_join(highlight, by="skill") %>%
  arrange(desc(PC3)) %>% slice(1:M) %>%
  mutate(Technology=technologies[6], pc=3) %>%
  bind_rows(highlight)
highlight <- pcs %>% anti_join(highlight, by="skill") %>%
  arrange(PC4) %>% slice(1:M) %>%
  mutate(Technology=technologies[7], pc=4) %>%
  bind_rows(highlight)
highlight <- pcs %>% anti_join(highlight, by="skill") %>%
  arrange(desc(PC4)) %>% slice(1:M) %>%
  mutate(Technology=technologies[8], pc=4) %>%
  bind_rows(highlight)

# plot the components in log scale
highlightLog <- highlight %>% mutate(PC1=sign(PC1)*log10(abs(PC1)),
                                     PC2=sign(PC2)*log10(abs(PC2)))
nonHighlightLog <- nonHighlight %>% mutate(PC1=sign(PC1)*log10(abs(PC1)),
                                           PC2=sign(PC2)*log10(abs(PC2)))
portable.set.seed(1)
highlightLog %>% filter(pc %in% c(1, 2)) %>%
  ggplot(aes(PC1, PC2, label=skill, colour=Technology)) +
  geom_jitter(alpha = 0.4, size = 3) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust = 0.5),
        legend.text=element_text(size=12)) +
  guides(fill = guide_legend(nrow=2)) +
  xlab(sprintf("sign(PC1) x log10|PC1| - Variance explained %d%%",
               round(100*pca$sdev[1]^2 / sum(pca$sdev^2)))) +
  ylab(sprintf("sign(PC2) x log10|PC2| - Variance explained %d%%",
               round(100*pca$sdev[2]^2 / sum(pca$sdev^2)))) +
  ggtitle("Top Tags (i.e. Skills) in each direction of the first two Principal Components PC1 and PC2") +
  geom_text_repel(aes(fontface=fontface), segment.alpha = 0.3, size = 5, force = 7,
                  nudge_x = 0.1, nudge_y = 0.1, seed = 1, family="Arial") +
  scale_colour_manual(values = colorSpec) +
  scale_x_continuous(limits=c(-8, 8)) +
  scale_y_continuous(limits=c(-8, 8)) +
  geom_jitter(data = nonHighlightLog, aes(PC1, PC2), alpha = 0.05, size = 2)

##########################################################################################
## Data exploration and visualization: Geo-context
##########################################################################################
# try to access or download the data file first
usersCh <- readObjectByName("UsersCH")
# do this only if the file isn't there, to avoid costly Google geocoding calls
if (is.null(usersCh) && !file.exists(filePathForObjectName("UsersCH"))) {
  # the environment variable GOOGLE_API_KEY is required, or simply copy-paste your
  # Google key instead. To obtain a Google key, follow the steps outlined here:
  # https://developers.google.com/maps/documentation/javascript/get-api-key
  register_google(key=Sys.getenv("GOOGLE_API_KEY"))

  # get users whose location is Switzerland only
  usersCh <- users %>%
    filter(str_detect(tolower(location),
                      "(\\bch\\b|switzerland|schweiz|svizzera|suisse|svizra)")) %>%
    arrange(desc(reputation))

  # get the unique locations so that we avoid duplicate calls e.g. "Zurich, Switzerland"
  swissLocations <- usersCh %>% select(location) %>% unique()

  # WARNING! this code paired with a valid GOOGLE_API_KEY may cost money!
  swissLocations <- mutate_geocode(swissLocations, location = location)
  usersCh <- usersCh %>% left_join(swissLocations, by="location")

  # write the usersCh to disk
  saveObjectByName(usersCh, "UsersCH")
}
# expected number of users located in Switzerland
stopifnot(nrow(usersCh) == 4258)
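# Aside: the location filter above hinges on the word boundaries around "ch" so that
# e.g. "Zurich, CH" matches but "Chicago" does not. A quick illustration:
if (FALSE) {
  str_detect(tolower(c("Zurich, CH", "Chicago, IL", "Suisse")),
             "(\\bch\\b|switzerland|schweiz|svizzera|suisse|svizra)")
  # expected: TRUE FALSE TRUE
}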
# get the top answer skills for users located in Switzerland
topAnswerTags <- answers %>% semi_join(usersCh, by="userId") %>%
  group_by(userId) %>%
  summarise(score=max(score)) %>%
  ungroup() %>%
  inner_join(answers %>% select(userId, score, questionId), by=c("userId", "score")) %>%
  inner_join(questions %>% select(questionId, tags), by="questionId") %>%
  group_by(userId) %>%
  summarise(questionId=first(questionId), score=first(score), tags=first(tags)) %>%
  ungroup() %>%
  mutate(type='answer') %>%
  arrange(desc(score))

# testing the results
nrow(topAnswerTags)
topAnswerTags %>% head(20)

# double-check results on my userId
topAnswerTags %>% filter(userId == 1142881)

# check for duplicates
topAnswerTags %>% group_by(userId) %>% filter(n() > 1)

# otherwise get the top questions for users located in Switzerland
topQuestionTags <- questions %>% semi_join(usersCh, by="userId") %>%
  anti_join(topAnswerTags, by="userId") %>%
  group_by(userId) %>%
  summarise(score=max(score)) %>%
  ungroup() %>%
  inner_join(questions %>% select(userId, score, questionId, tags),
             by=c("userId", "score")) %>%
  group_by(userId) %>%
  summarise(questionId=first(questionId), score=first(score), tags=first(tags)) %>%
  ungroup() %>%
  mutate(type='question') %>%
  arrange(desc(score))

# testing the results
nrow(topQuestionTags)
topQuestionTags %>% head(20)

# check for duplicates
topQuestionTags %>% group_by(userId) %>% filter(n() > 1)

# merge the two datasets
usersChTop <- topAnswerTags %>% bind_rows(topQuestionTags) %>%
  mutate(type=as.factor(type)) %>%
  left_join(usersCh %>% select(userId, location, lon, lat), by="userId") %>%
  separate_rows(tags, sep="\\|") %>%
  rename(skill=tags) %>%
  inner_join(mainSkills, by="skill") %>%
  select(questionId, userId, score, skill, type, location, lon, lat)
str(usersChTop)

# link to the principal component highlights, remove all others
usersChTop <- usersChTop %>%
  left_join(highlight %>% select(skill, pc, Technology), by=c("skill")) %>%
  select(questionId, userId, score, skill, Technology, type, location, lon, lat) %>%
  filter(!is.na(Technology) & Technology != technologies[9]) %>%
  arrange(desc(score))
str(usersChTop)

# double-check results on my userId
usersChTop %>% filter(userId == 1142881)

# double-check the top scores
usersChTop %>% arrange(desc(score))

# try to access or download it first
map <- readObjectByName("SwissMap")
# do this only if the file isn't there, to avoid costly Google map calls
if (is.null(map) && !file.exists(filePathForObjectName("SwissMap"))) {
  # the environment variable GOOGLE_API_KEY is required, or simply copy-paste your
  # Google key instead. To obtain a Google key, follow the steps outlined here:
  # https://developers.google.com/maps/documentation/javascript/get-api-key
  register_google(key=Sys.getenv("GOOGLE_API_KEY"))

  # get a Google map of Switzerland
  center <- c(lon = 8.227512, lat = 46.818188)
  map <- get_googlemap(center = center, zoom = 7, color = "bw", maptype = "terrain",
                       style = paste("feature:road|visibility:off&style=element:labels|",
                                     "visibility:off&style=feature:administrative|visibility:on|lightness:60",
                                     sep=""))
  saveObjectByName(map, "SwissMap")
}

# plot the top technology trends in geo-context in Switzerland
ggmap(map) + scale_colour_manual(values = colorSpec) +
  geom_point(data=usersChTop, aes(x=lon, y=lat, colour=Technology, size=score),
             position = position_jitterdodge(jitter.width=0.01, jitter.height=0.01,
                                             seed=1)) +
  ggtitle("SO Users located in Switzerland and their matching technology trends, weighted by score") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.text=element_text(size=12))

# reveal the details of the top ten most prominent data points
usersChTop %>% top_n(10, score) %>% arrange(desc(score))

##########################################################################################
## Data exploration and visualization: Badges, Answers and Questions
##########################################################################################
# Function that accumulates all badges into a final dataset. Each subsequent call
# should pass the previously accumulated results so that they can be excluded from
# the selection.
#
# @param aBadge the badge to filter for.
# @param acc the accumulated results (result of a previous call to this function).
# @param N the top N values to pick e.g. 1500
#
accumulateBadges <- function(aBadge, acc=NULL, N=1500) {
  cat(sprintf("adding \"%s\" to the cumulative badges\n", aBadge))
  res <- NULL
  # handle this non-standard case separately
  if (aBadge == "Other Answers") {
    res <- users %>% anti_join(acc %>% select(userId) %>% unique(), by="userId") %>%
      semi_join(answers %>% filter(0 <= score & score < 10) %>%
                  select(userId) %>% unique(), by="userId") %>%
      mutate(class="bronze", badge=aBadge) %>%
      select(userId, class, badge, reputation) %>%
      bind_rows(acc)
  } else {
    # this is the case for the first time
    if (is.null(acc)) {
      res <- users %>% inner_join(badges %>% filter(badge == aBadge) %>%
                                    select(userId, class, badge) %>% unique(),
                                  by="userId") %>%
        select(userId, class, badge, reputation)
    } else {
      res <- users %>% anti_join(acc %>% select(userId) %>% unique(), by="userId") %>%
        inner_join(badges %>% filter(badge == aBadge) %>%
                     select(userId, class, badge) %>% unique(), by="userId") %>%
        select(userId, class, badge, reputation) %>%
        bind_rows(acc)
    }
  }
  # sort by factor ordering
  res$class <- factor(as.character(res$class), levels=c("gold", "silver", "bronze"))
  return(res)
}

# let's check whether the badge system provides a qualitatively significant segregation
# of users w.r.t. reputation using answers and questions. Compare the users across
# gold vs silver vs bronze badges.
N <- 1500
badgesOrder <- c("Populist", "Great Answer", "Great Question", "Guru", "Good Answer",
                 "Nice Answer", "Other Answers", "Good Question", "Nice Question")

# accumulate all badge selections into a complete set
for (i in 1:length(badgesOrder)) {
  if (i == 1) {
    comp <- accumulateBadges(badgesOrder[i])
  } else {
    comp <- accumulateBadges(badgesOrder[i], comp)
  }
}
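# Equivalent, more functional phrasing of the accumulation loop above (a sketch, not
# used further below): fold accumulateBadges over the remaining badges, starting from
# the accumulation of the first one.
if (FALSE) {
  comp2 <- Reduce(function(acc, aBadge) accumulateBadges(aBadge, acc),
                  badgesOrder[-1], accumulateBadges(badgesOrder[1]))
  stopifnot(identical(comp2, comp))
}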
# show how many users per class and badge
comp %>% group_by(class, badge) %>% summarise(n = n())

# set the seed again
portable.set.seed(1)

# log10-transform reputation and sort by factor ordering
comp <- comp %>% filter(reputation > 999) %>%
  group_by(class, badge) %>%
  sample_n(N) %>%
  ungroup() %>%
  mutate(reputation=log10(reputation)) %>%
  mutate(badge = factor(badge, levels=badgesOrder))

# check out the median reputation ordering by badge to get an idea, though this is
# not the exact final ordering used, due to the exclusion system.
users %>% inner_join(badges %>% select(userId, class, badge) %>% unique(), by="userId") %>%
  filter(badge %in% badgesOrder) %>%
  group_by(class, badge) %>%
  summarise(avg_reputation=median(reputation)) %>%
  arrange(desc(avg_reputation))

# create a color specification for the different badge classes
colorSpec <- c("#f9a602", "#c0c0c0", "#cd7f32")
names(colorSpec) <- c("gold", "silver", "bronze")

selectedBadges <- c("Great Answer", "Good Answer", "Nice Answer")
summaryRep <- comp %>% group_by(class, badge) %>% summarise(median=median(reputation))
comp %>% filter(badge %in% selectedBadges) %>%
  ggplot(aes(reputation, colour = class, fill = class, group = badge)) +
  geom_histogram(position = "dodge", bins = 40) +
  scale_fill_manual(values = colorSpec) +
  xlab("log10 reputation") +
  scale_colour_manual(values = colorSpec) +
  geom_vline(data=summaryRep %>% filter(badge %in% selectedBadges),
             aes(xintercept=median, color=class), linetype="dashed")

# bootstrap the median using B = 2N replications
B <- 2*N

# extend ggplot with a new boxplot stat_summary function that uses bootstrapped BCa
# confidence intervals for the notches
bootNotch <- function(values) {
  # usual quantile values
  res = data.frame(t(boxplot(values, plot=FALSE)$stats))
  colnames(res) = c("ymin","lower","middle","upper","ymax")
  # bootstrap and get lower + upper notches
  ci <- boot.ci(boot(values, statistic = function(x, index) median(x[index]), R=B),
                type="bca")
  res$notchlower = ci$bca[4]
  res$notchupper = ci$bca[5]
  return(res)
}

# since we're extending the boxplot, we need to provide the outlier calculation as well
outlierNotch <- function(values) {
  return(boxplot(values, plot=FALSE)$out)
}

# compute a summary frame of the data containing the mean and median
summaryRep <- comp %>% group_by(badge, class) %>%
  summarise(mean=mean(reputation), median=median(reputation))
summaryRep

# set the seed again (we need it here to get reproducible bootstrap results)
portable.set.seed(1)

# plot the badge & class combinations to match an ordering for the ratings
comp %>% left_join(summaryRep, by=c("class", "badge")) %>%
  ggplot(aes(x=badge, y=reputation, colour=class, group=badge)) +
  ggtitle("Reputation for random samples of 1.5k users in each badge & class combination") +
  ylab(label = "log10 reputation") +
  theme(legend.position="bottom",
        plot.title = element_text(hjust = 0.5),
        legend.text=element_text(size=12),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  stat_summary(fun.data = bootNotch, geom = "boxplot", notch = T) +
  stat_summary(fun.y = outlierNotch, geom = "point") +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 5) +
  scale_colour_manual(values = colorSpec) +
  geom_hline(data=summaryRep, aes(yintercept = median, color = class),
             linetype = "dashed") +
  geom_jitter(alpha=0.05)
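# Aside: what bootNotch computes for a single group, in isolation. A minimal sketch of
# a BCa bootstrap confidence interval for the median of a made-up skewed sample; in the
# object returned by boot.ci, positions 4 and 5 of $bca hold the interval endpoints.
if (FALSE) {
  portable.set.seed(1)
  toyValues <- rexp(500)
  toyCi <- boot.ci(boot(toyValues, statistic = function(x, index) median(x[index]),
                        R = 2000), type = "bca")
  c(lower = toyCi$bca[4], upper = toyCi$bca[5])
}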
# generate all possible pair-wise badge combinations
c <- combn(badgesOrder, m=2)

# run the Wilcoxon rank-sum test for independent samples i.e. test the significance of
# the difference in population median between non-normally distributed unpaired groups.
# Print the pairs whose p-value is greater than 0.05 i.e. those where the median
# difference between groups is not significant at the 95% confidence level
for (i in 1:ncol(c)) {
  res <- comp %>% filter(badge == c[1,i] | badge == c[2,i]) %>%
    wilcox.test(reputation~badge, data=., paired=F, conf.int=T)
  if (res$p.value > 0.05) {
    cat(sprintf(paste0("The median rep. for users with badge ",
                       "'%s' and '%s' is NOT significantly ",
                       "diff. with p-value=%.6f\n"),
                c[1,i], c[2,i], res$p.value))
  }
}

##########################################################################################
## Load and explore the ratings dataset.
##########################################################################################
# free a bit of memory if possible
gc(TRUE, TRUE, TRUE)

# read the ratings dataset
ratings <- readObjectByName("Ratings")

# check the averages
ratings %>% summarise(median = median(rating), mean = mean(rating), sd = sd(rating))

# check out the ratings histogram, it's bell-shaped
ratings %>% ggplot(aes(rating, fill=..x..)) +
  geom_histogram() +
  scale_x_continuous(breaks = seq(1.5, 5, by=0.5)) +
  ggtitle("Histogram of the user skill ratings") +
  theme(legend.position="top",
        plot.title = element_text(hjust = 0.5),
        legend.text=element_text(size=12)) +
  scale_fill_gradient("Legend", low = "#E41A1C", high = "#4DAF4A") +
  theme(legend.position="bottom", legend.title = element_blank())

# set the seed again
portable.set.seed(1)

# repeat the analysis now on the ratings, using BCa again
comp <- ratings %>%
  left_join(users %>% select(userId, reputation) %>% unique(), by="userId") %>%
  mutate(reputation=log10(reputation)) %>%
  group_by(rating) %>%
  sample_n(N) %>%
  ungroup()

# compute the data summary
summaryRep <- comp %>% group_by(rating) %>%
  summarise(mean=mean(reputation), median=median(reputation))

# set the seed again
portable.set.seed(1)
comp %>% ggplot(aes(x=rating, y=reputation, colour=rating, group=rating)) +
  ggtitle("Top 1.5k unique users in each rating group") +
  ylab(label = "log10 reputation") +
  theme(legend.position="bottom",
        plot.title = element_text(hjust = 0.5),
        legend.text=element_text(size=12),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  stat_summary(fun.data = bootNotch, geom = "boxplot", notch = T) +
  stat_summary(fun.y = outlierNotch, geom = "point") +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 5) +
  scale_color_gradient("Legend", low = "#E41A1C", high = "#4DAF4A") +
  geom_hline(data=summaryRep, aes(yintercept = median, color = rating),
             linetype = "dashed", alpha=0.5) +
  geom_jitter(alpha=0.1)

# generate all possible pair-wise rating combinations
c <- combn(seq(1.5, 5.0, by=0.5), m=2)

# run the Wilcoxon rank-sum test for independent samples again. Print the pairs whose
# p-value is greater than 0.05 i.e. those where the median difference between groups
# is not significant at the 95% confidence level
for (i in 1:ncol(c)) {
  res <- comp %>% filter(rating == c[1,i] | rating == c[2,i]) %>%
    wilcox.test(reputation~rating, data=., paired=F, conf.int=T)
  if (res$p.value > 0.05) {
    cat(sprintf(paste0("The median rep. for users with rating ",
                       "%.1f and %.1f is NOT significantly ",
                       "diff. with p-value=%.6f\n"),
                c[1,i], c[2,i], res$p.value))
  }
}

##########################################################################################
## Split the ratings dataset into training and test sets.
##########################################################################################
# we shouldn't need these anymore
rm(users, questions, answers, badges, tags)

# free a bit of memory if possible
gc(TRUE, TRUE, TRUE)
# what's the number of unique users and skills, and how densely filled is the matrix?
ratings %>% summarise(users = n_distinct(userId), skills = n_distinct(skill), n = n()) %>%
  mutate(sparsity=sprintf("%.1f%%", 100*n / (users*skills))) %>%
  select(users, skills, sparsity)

# split the ratings dataset into separate train and test sets
portable.set.seed(1)
testIndex <- createDataPartition(y = ratings$rating, times = 1, p = 0.1, list = FALSE)
trainSet <- ratings[-testIndex,]
tmp <- ratings[testIndex,]

# make sure userId and skill in the test set are also in the train set
testSet <- tmp %>%
  semi_join(trainSet %>% select(userId) %>% unique(), by="userId") %>%
  semi_join(trainSet %>% select(skill) %>% unique(), by="skill")
testSet

# add the rows removed from the test set back into the train set
removed <- tmp %>% anti_join(testSet, by=c("userId", "skill"))
trainSet <- trainSet %>% bind_rows(removed)
rm(tmp, removed)

# test the results, the two sets must add up
stopifnot(nrow(trainSet) + nrow(testSet) == nrow(ratings))

##########################################################################################
## Explore the simpler baseline model first.
##########################################################################################
# global average
mu <- mean(trainSet$rating)

# compute predictions and RMSE
rmseResults <- tibble(method = "Just the average",
                      RMSE = Metrics::rmse(trainSet$rating, mu))
rmseResults

lambda <- 4

# compute the regularized user effects
userEffects <- trainSet %>% group_by(userId) %>%
  summarize(b_i = sum(rating - mu)/(n() + lambda))

# compute predictions and RMSE
predictedRatings <- trainSet %>% left_join(userEffects, by='userId') %>%
  mutate(pred=mu + b_i) %>%
  pull(pred)
rmseResults <- bind_rows(rmseResults,
                         tibble(method="Regularized User Effects",
                                RMSE = Metrics::rmse(predictedRatings, trainSet$rating)))
rmseResults

# compute the regularized skill effects
skillEffects <- trainSet %>% left_join(userEffects, by='userId') %>%
  group_by(skill) %>%
  summarize(b_j = sum(rating - (mu + b_i))/(n() + lambda))

# compute predictions and RMSE
predictedRatings <- trainSet %>% left_join(userEffects, by='userId') %>%
  left_join(skillEffects, by='skill') %>%
  mutate(pred=mu + b_i + b_j) %>%
  pull(pred)
rmseResults <- bind_rows(rmseResults,
                         tibble(method="Regularized User + Skill Effects",
                                RMSE = Metrics::rmse(predictedRatings, trainSet$rating)))
rmseResults

# show the effect of the number of week-blocks since a user's first post
weeksBlock <- 30

# one 30-week block corresponds approximately to 7 months
round(weeksBlock / 4.34524)

# show the "gaining experience over time" effect
trainSet %>% left_join(skillEffects, by='skill') %>%
  left_join(userEffects, by='userId') %>%
  mutate(residual=rating - (mu + b_i + b_j)) %>%
  mutate(week_block_30 = ceiling(as.duration(firstPostDate %--% creationDate) /
                                   dweeks(weeksBlock))) %>%
  arrange(desc(week_block_30)) %>%
  group_by(week_block_30) %>%
  summarise(b_e_Effect=mean(residual)) %>%
  ggplot(aes(week_block_30, b_e_Effect)) +
  geom_point() +
  geom_smooth(color="red", span=0.3, method.args=list(degree=2))

# fit a loess smoothing to model the "experience gained" temporal effect
weeksBlockFit <- trainSet %>% left_join(userEffects, by='userId') %>%
  left_join(skillEffects, by='skill') %>%
  mutate(residual=rating - (mu + b_i + b_j)) %>%
  mutate(week = ceiling(as.duration(firstPostDate %--% creationDate) /
                          dweeks(weeksBlock))) %>%
  group_by(week) %>%
  summarise(residual=mean(residual)) %>%
  loess(residual~week, data=., span=0.3, degree=2)
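# Note (illustrative): the loess fit is keyed on 30-week blocks since a user's first
# post, so evaluating the "experience" effect b_e at given blocks is simply:
if (FALSE) {
  predict(weeksBlockFit, tibble(week = 1:5)) # b_e estimates for the first 5 blocks
}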
# compute predictions and RMSE
predictedRatings <- trainSet %>% left_join(userEffects, by='userId') %>%
  left_join(skillEffects, by='skill') %>%
  mutate(week = ceiling(as.duration(firstPostDate %--% creationDate) /
                          dweeks(weeksBlock))) %>%
  mutate(pred=mu + b_i + b_j + predict(weeksBlockFit, .)) %>%
  pull(pred)
rmseResults <- bind_rows(rmseResults,
                         tibble(method="Regularized User + Skill + Experience Effects",
                                RMSE = Metrics::rmse(predictedRatings, trainSet$rating)))
rmseResults

##########################################################################################
## Compute the RMSE for the baseline model on the test set.
##########################################################################################
## TEST SET ACCESS ALERT! accessing the test set to compute the RMSE.
predictedRatings <- testSet %>% left_join(userEffects, by='userId') %>%
  left_join(skillEffects, by='skill') %>%
  mutate(week = ceiling(as.duration(firstPostDate %--% creationDate) /
                          dweeks(weeksBlock))) %>%
  mutate(pred=mu + b_i + b_j + predict(weeksBlockFit, .)) %>%
  pull(pred)
rmseValue <- Metrics::rmse(predictedRatings, testSet$rating)
cat(sprintf("baseline RMSE on test data is %.9f\n", rmseValue))

# check that we get reproducible results
stopifnot(abs(rmseValue - 0.653410415) < 1e-9)

##########################################################################################
## Create the CF Low-Rank Matrix Factorization (lrmf) model integrated with the caret
## package. The model employs low-rank matrix factorization trained using SGD.
##
## This caret model specification follows the implementation details described here:
## https://topepo.github.io/caret/using-your-own-model-in-train.html
##########################################################################################
# Define the model lrmf (Collaborative Filtering Low-Rank Matrix Factorization)
lrmf <- list(type = "Regression", library = NULL, loop = NULL, prob = NULL, sort = NULL)

# Define the model parameters. Four different tuning parameters are supported.
#
# @param K the number of latent dimensions.
# @param maxGamma the maximum learning rate.
# @param lambda the regularization parameter applied to the different effects.
# @param sigma the standard deviation of the initial values.
#
lrmf$parameters <- data.frame(parameter = c("K", "maxGamma", "lambda", "sigma"),
                              class = c(rep("numeric", 4)),
                              label = c("K-Latent dimensions", "Max. Learning rate",
                                        "Lambda", "Sigma of initial values"))

# Define the required grid function, which is used to create the tuning grid (unless the
# user gives the exact values of the parameters via tuneGrid)
lrmf$grid <- function(x, y, len = NULL, search = "grid") {
  K <- c(10, 11, 12)
  maxGamma <- c(0.05, 0.1, 0.11)
  lambda <- seq(0.05, 0.1, by=0.01)
  sigma <- c(0.05, 0.1, 0.15)

  # use grid search
  out <- expand.grid(K = K, maxGamma = maxGamma, lambda = lambda, sigma = sigma)
  if(search == "random") {
    # random search simply samples at random from the expanded grid
    out <- out %>% sample_n(100)
  }
  return(out)
}

# Define the fit function so we can fit our model to the data.
#
# @param P the initial P matrix.
# @param Q the initial Q matrix.
# @param batchSize the number of samples to train with at every step.
# @param trackConv whether to track the RMSE convergence of the algorithm.
# @param perTrack the fraction of the samples used to track RMSE convergence.
# @param thresRatio threshold for the ratio of initial gamma to gamma.
# @param iterBreaks number of steps before tracking RMSE convergence.
# @param verbose whether to output extra convergence information.
# @param ncores the number of cores to use for parallel batches.
# @param maxIter maximum number of outer iterations.
# @param batchIter number of batch iterations of the SGD algorithm.
#
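# For reference, a sketch of the objective that the SGD updates in lrmf$fit descend
# (notation follows the code): with z-scored ratings r_ij, p_i the i-th column of P and
# q_j the j-th column of Q, the regularized squared loss over the observed (i, j) pairs is
#
#   J(P, Q) = sum_ij (r_ij - p_i'q_j)^2 + lambda*(||p_i||^2 + ||q_j||^2)
#
# and its (half-)gradients yield the update rules implemented below:
#
#   e_ij = r_ij - p_i'q_j
#   p_i <- p_i + gamma*(e_ij*q_j - lambda*p_i)
#   q_j <- q_j + gamma*(e_ij*p_i - lambda*q_j)
#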
lrmf$fit <- function(x, y, wts, param, lev, last, weights, classProbs,
                     P=NULL, Q=NULL, batchSize=1000*param$K, trackConv=F, perTrack=1,
                     thresRatio=1000, iterBreaks=1, verbose=F, ncores=detectCores(),
                     maxIter=250, batchIter=ifelse(ncores == 1, 108, ncores^2*3), ...) {
  # check whether we have a correct x
  stopifnot("userId" %in% colnames(x))
  stopifnot("skill" %in% colnames(x))
  stopifnot("rating" %in% colnames(x))
  stopifnot(all(x$rating == y))

  # save some parameters
  gamma <- param$maxGamma # learning rate
  lambda <- param$lambda  # regularization parameter
  K <- param$K            # number of latent dimensions
  N <- nrow(x %>% select(userId) %>% unique()) # number of users
  M <- nrow(x %>% select(skill) %>% unique())  # number of skills

  # compute and save the global mean and sd
  globalMu <- mean(x$rating)
  globalSigma <- sd(x$rating)

  # compute the z-score
  x <- x %>% select(userId, skill, rating) %>%
    mutate(rating_z = (rating - globalMu) / globalSigma)

  # compute the user and skill effects
  skillEffects <- x %>% group_by(skill) %>% summarise(b_s = mean(rating_z))
  userEffects <- x %>% group_by(userId) %>% summarise(b_u = mean(rating_z))

  # indexing for users and skills
  userIndex <- x %>% distinct(userId) %>% arrange(userId) %>% mutate(i = row_number())
  skillIndex <- x %>% distinct(skill) %>% arrange(skill) %>% mutate(j = row_number())

  # create the actual x
  x <- x %>% left_join(userIndex , by="userId") %>%
    left_join(skillIndex, by="skill") %>%
    select(i, j, rating, rating_z)

  # initialize P and Q to have the skill and user effects already encoded;
  # the columns are laid out so that the computation is column-major aligned
  if (is.null(P) || is.null(Q)) {
    P <- matrix(0, nrow = K, ncol = N)
    P[1,] <- matrix(1, nrow = 1, ncol = N)
    P[2,] <- as.matrix(userIndex %>% left_join(userEffects, by="userId") %>% select(b_u))
    P <- P + matrix(rnorm(K*N, mean = 0, sd = param$sigma), nrow = K, ncol = N)

    Q <- matrix(0, nrow = K, ncol = M)
    Q[1,] <- as.matrix(skillIndex %>% left_join(skillEffects, by="skill") %>% select(b_s))
    Q[2,] <- matrix(1, nrow = 1, ncol = M)
    Q <- Q + matrix(rnorm(K*M, mean = 0, sd = param$sigma), nrow = K, ncol = M)
  }

  # double-check the matrix dimensions
  stopifnot(nrow(P) == K)
  stopifnot(ncol(P) == N)
  stopifnot(nrow(Q) == K)
  stopifnot(ncol(Q) == M)

  # convenience function to compute the RMSE for a subset of the samples
  computeRMSE <- function(subsetSamples) {
    predicted <- globalMu +
      globalSigma*colSums(P[,subsetSamples$i]*Q[,subsetSamples$j])
    return(Metrics::rmse(predicted, subsetSamples$rating))
  }

  # prepare for parallel computation, blocks contain ncores number of groups
  # that can each be executed in parallel; each group trains a mutually
  # exclusive permutation of the training data tuples
  if (ncores > 1) {
    if (verbose) {
      cat(sprintf('running parallel version with %d cores\n', ncores))
    }
    bi <- rep(0:(ncores-1), ncores)
    bj <- (bi + rep(0:(ncores-1), 1, each = ncores)) %% ncores
    blocks <- tibble(bi=bi, bj=bj)

    # how many iterations each core will do
    parIter <- batchIter / ncores

    # include the block indexes bi and bj for parallel processing
    x <- x %>% mutate(bi = i %% ncores, bj = j %% ncores)

    # correct the batchSize if needed, the lower bound is the minimum
    # number of elements within each group
    lowerBound <- x %>% group_by(bi, bj) %>%
      summarise(n=n()) %>%
      ungroup() %>%
      summarise(n=min(n)) %>%
      pull(n)
    if (lowerBound < batchSize) {
      batchSize <- lowerBound
      if (verbose) {
        cat(sprintf('corrected the batch size to %d\n', batchSize))
      }
    }
  } else {
    if (verbose) {
      cat('running sequential version\n')
    }
  }
  # track convergence on these samples
  subsetSamples <- x %>% sample_n(nrow(x)*perTrack)
  rmseValue <- computeRMSE(subsetSamples)
  if (verbose) {
    cat(sprintf('the training RMSE at iter=0 is %.9f\n', rmseValue))
  }
  if (trackConv) {
    rmseHist <- tibble(iter=0, K=K, rmse=rmseValue)
  } else {
    rmseHist <- NULL
  }
  for (iter in 1:maxIter) {
    # for performance reasons the SGD update code is duplicated in both
    # implementations; using a function would defeat the purpose due to
    # R's pass-by-value semantics and the undesirable cost of copying the matrices
    if (ncores == 1) {
      for (iter2 in 1:batchIter) {
        # choose a random batch of samples
        samples <- x %>% sample_n(batchSize)

        # get hold of the indexes
        i <- samples$i
        j <- samples$j

        # compute the residuals
        epsilon <- samples$rating_z - colSums(P[,i]*Q[,j])

        # partial derivatives w.r.t. P and Q
        P_upd <- (P[,i] + gamma*(Q[,j]*epsilon[col(Q[,j])] - lambda*P[,i]))
        Q[,j] <- (Q[,j] + gamma*(P[,i]*epsilon[col(P[,i])] - lambda*Q[,j]))
        P[,i] <- P_upd
      }
    } else {
      stopifnot(nrow(blocks) == ncores^2)

      # pick a random block group
      g <- sample(0:(ncores-1), 1)

      # process the selected block group
      res <- mclapply((g*ncores + 1):((g + 1)*ncores), mc.cores = ncores,
                      mc.set.seed = TRUE, FUN = function(b) {
        # select the subset of samples corresponding to this block
        blockSamples <- x %>% filter(bi == blocks[b,]$bi & bj == blocks[b,]$bj)
        stopifnot(nrow(blockSamples) > 0)

        # keep track of the updated columns
        ii <- NULL
        jj <- NULL

        # run multiple batches on this block
        for (iter2 in 0:(parIter-1)) {
          samples <- blockSamples %>% sample_n(batchSize)

          # get hold of the indexes
          i <- samples$i
          j <- samples$j

          # compute the residuals
          epsilon <- samples$rating_z - colSums(P[,i]*Q[,j])

          # partial derivatives w.r.t. P and Q
          P_upd <- (P[,i] + gamma*(Q[,j]*epsilon[col(Q[,j])] - lambda*P[,i]))
          Q[,j] <- (Q[,j] + gamma*(P[,i]*epsilon[col(P[,i])] - lambda*Q[,j]))
          P[,i] <- P_upd

          # accumulate the changed indexes
          if (is.null(ii)) {
            ii <- i
          } else {
            ii <- c(ii, i)
          }
          if (is.null(jj)) {
            jj <- j
          } else {
            jj <- c(jj, j)
          }
        }
        ii <- sort(unique(ii))
        jj <- sort(unique(jj))

        # output the updated user and skill columns
        return(list(Pii=P[,ii], Qjj=Q[,jj], ii=ii, jj=jj))
      })

      # consolidate the updates into the P and Q matrices
      for (k in 1:length(res)) {
        l <- res[[k]]
        P[,l$ii] <- l$Pii
        Q[,l$jj] <- l$Qjj
      }
    }

    # check the RMSE
    rmsePrevious <- rmseValue
    rmseValue <- computeRMSE(subsetSamples)

    # track convergence every iterBreaks steps
    if (trackConv && iter %% iterBreaks == 0) {
      if (verbose) {
        cat(sprintf('the training RMSE at iter=%d is %.9f\n', iter, rmseValue))
      }
      rmseHist <- rmseHist %>% add_row(iter=iter, K=K, rmse=rmseValue)
    }

    # check whether the RMSE improved, if not then halve gamma
    if (rmsePrevious < rmseValue) {
      gamma <- gamma / 2
      if (verbose) {
        cat(sprintf("decreased the learning rate to: %.9f\n", gamma))
      }
    } else {
      if (gamma < param$maxGamma) {
        # increase the learning rate more slowly
        gamma <- min(param$maxGamma, gamma*3/2)
        if (verbose) {
          cat(sprintf("increased the learning rate to: %.9f\n", gamma))
        }
      }
    }

    # if the threshold ratio is exceeded then bounce gamma back towards its previous value
    if (param$maxGamma / gamma > thresRatio) {
      gamma <- gamma*2
      if (verbose) {
        cat(sprintf("bounced the learning rate to: %.9f\n", gamma))
      }
    }
  }

  # return the model fit as a list
  return(list(globalMu=globalMu, globalSigma=globalSigma, skillEffects=skillEffects,
              userEffects=userEffects, userIndex=userIndex, skillIndex=skillIndex,
              P=P, Q=Q, rmseHist=rmseHist, params=param))
}
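# Aside on the parallel scheme in lrmf$fit: for ncores = 3 the (bi, bj) pairs form 3
# groups of 3 blocks each; within a group no two blocks share a row index bi or a
# column index bj, so the workers update disjoint columns of P and Q without write
# conflicts. A sketch that just prints the schedule:
if (FALSE) {
  nc <- 3
  bi <- rep(0:(nc-1), nc)
  bj <- (bi + rep(0:(nc-1), 1, each = nc)) %% nc
  split(sprintf("(%d,%d)", bi, bj), rep(0:(nc-1), each = nc))
}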
# Define the predict function that produces a vector of predictions
lrmf$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL) {
  # check whether we have a correct newdata
  stopifnot("userId" %in% colnames(newdata))
  stopifnot("skill" %in% colnames(newdata))

  # link the newdata to the indexes per user and skill
  newdata <- newdata %>% left_join(modelFit$userIndex , by="userId") %>%
    left_join(modelFit$skillIndex, by="skill") %>%
    select(i, j)
  predicted <- modelFit$globalMu + modelFit$globalSigma*
    colSums(modelFit$P[,newdata$i]*modelFit$Q[,newdata$j])
  return(predicted)
}

##########################################################################################
## Build a representative calibration (cross validation) subset of the training set.
##########################################################################################
N <- 240
tic(sprintf("preparing a calibration set of %d random users", N))

# set the seed again
portable.set.seed(1)
usersSel <- trainSet %>% group_by(rating) %>%
  select(userId, rating) %>%
  unique() %>%
  sample_n(N / 8)

# all ratings for those users
calibrationSet <- trainSet %>% semi_join(usersSel, by="userId")
toc()
rm(usersSel)

# how many rows, distinct skills and users are in the calibration set?
cat(sprintf("The calibration set contains %d rows, %d unique users and %d skills\n",
            nrow(calibrationSet), length(unique(calibrationSet$userId)),
            length(unique(calibrationSet$skill))))

# check the ratings distribution of the calibration set
calibrationSet %>% group_by(rating) %>% count()
calibrationSet %>% group_by(skill) %>% summarise(n = n()) %>% arrange(desc(n))
calibrationSet %>% group_by(userId) %>% summarise(n = n()) %>% arrange(desc(n))

##########################################################################################
## Calibrate the "lrmf" model on the calibration set (subset of the training set). Here
## we look for the best hyper-parameters that fit the model on a small subset of the
## training data.
##########################################################################################
tic('calibrating the LRMF model')

# set the seed again
portable.set.seed(1)
control <- trainControl(method = "cv", search = "grid",
                        number = 10,       # use 10 K-folds in cross validation
                        p = .9,            # use 90% for training and 10% for testing
                        allowParallel = T, # execute CV folds in parallel
                        verboseIter = T)
cvFit <- train(x = calibrationSet, y = calibrationSet$rating, method = lrmf,
               trControl = control, ncores = 1, maxIter = 100, batchIter = 50,
               batchSize = 100)
toc()

## The bestTune model found is:
stopifnot(cvFit$bestTune$K == 11)
stopifnot(cvFit$bestTune$maxGamma == 0.05)
stopifnot(cvFit$bestTune$lambda == 0.1)
stopifnot(cvFit$bestTune$sigma == 0.15)
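# Optional sanity sketch: the caret-integrated model predicts straight from
# (userId, skill) pairs, so a quick spot check against the calibration set is:
if (FALSE) {
  head(predict(cvFit, newdata = calibrationSet %>% select(userId, skill)))
}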
##########################################################################################
## Fit the best model found to the complete training set.
##########################################################################################
tic('training LRMF on the full training set - parallel')

# set the seed again
portable.set.seed(1)
fitPar <- train(x = trainSet, y = trainSet$rating, method = lrmf,
                trControl = trainControl(method = "none"), tuneGrid = cvFit$bestTune,
                trackConv = T, iterBreaks = 1, verbose = F, ncores = ncores,
                maxIter = 125, batchIter = 216)
ticTocTimes <- toc()
elapsedPar <- ticTocTimes$toc[[1]] - ticTocTimes$tic[[1]]

# save the RMSE history
rmseHist <- fitPar$finalModel$rmseHist %>%
  add_column(method=sprintf("Parallel - %d cores", ncores))

tic('training LRMF on the full training set - classic')

# set the seed again
portable.set.seed(1)
fitSeq <- train(x = trainSet, y = trainSet$rating, method = lrmf,
                trControl = trainControl(method = "none"), tuneGrid = cvFit$bestTune,
                trackConv = T, iterBreaks = 1, verbose = F, ncores = 1,
                maxIter = 250, batchIter = 108)
ticTocTimes <- toc()
elapsedSeq <- ticTocTimes$toc[[1]] - ticTocTimes$tic[[1]]

# save the RMSE history
rmseHist <- rmseHist %>%
  bind_rows(fitSeq$finalModel$rmseHist %>% add_column(method="Classic"))

# plot the convergence of the model on the full training data
colorSpec <- c("salmon", "turquoise3")
names(colorSpec) <- c("Classic", sprintf("Parallel - %d cores", ncores))
rmseHist %>% ggplot(aes(iter, rmse, color=method, group=method)) +
  geom_point(aes(shape=method), size=2) +
  geom_line() +
  scale_colour_manual(values = colorSpec) +
  theme(plot.title = element_text(hjust = 0.5),
        legend.text=element_text(size=12)) +
  xlab("Iterations") + ylab("RMSE") +
  annotate("text", x = 125, colour = colorSpec[2],
           y = rmseHist %>% filter(method == sprintf("Parallel - %d cores", ncores)) %>%
             last() %>% pull(rmse) - 0.004,
           label = sprintf("%.2f sec", elapsedPar)) +
  annotate("text", x = 250, colour = colorSpec[1],
           y = rmseHist %>% filter(method == "Classic") %>%
             last() %>% pull(rmse) - 0.004,
           label = sprintf("%.2f sec", elapsedSeq)) +
  ggtitle("Users-skills rating prediction using LRMF - Parallel vs. classic method")

##########################################################################################
## Compute the RMSE for the model on the test set.
##########################################################################################
## TEST SET ACCESS ALERT! accessing the test set to compute the RMSE.
predictedRatings <- predict(fitPar, testSet)
rmseValue <- Metrics::rmse(predictedRatings, testSet$rating)
cat(sprintf("RMSE on test data is %.9f\n", rmseValue))

# check that we get reproducible results
stopifnot(abs(rmseValue - 0.659181944) < 1e-9)

## TEST SET ACCESS ALERT! accessing the test set to compute the RMSE.
predictedRatings <- predict(fitSeq, testSet)
rmseValue <- Metrics::rmse(predictedRatings, testSet$rating)
cat(sprintf("RMSE on test data is %.9f\n", rmseValue))

# check that we get reproducible results
stopifnot(abs(rmseValue - 0.650566815) < 1e-9)

# using only ~0.5% of the ratings data
percData <- 100*(125*216)/nrow(ratings)
percData
stopifnot(abs(percData - 0.4960501) < 1e-7)

##########################################################################################
## Finally, predict how the author would be rated on skills for which there is no
## evidence.
##########################################################################################
# where is the author top rated?
ratings %>% filter(userId == 1142881) %>% arrange(desc(rating))
# find the skills for which the author has no ratings i.e. there is no evidence
noEvidenceSkills <- mainSkills %>%
  anti_join(ratings %>% filter(userId == 1142881) %>%
              select(skill) %>% unique(), by="skill") %>%
  arrange(desc(count))
noEvidenceSkills

# compute the ratings average for each of those skills
avgNoEvidenceSkills <- ratings %>% group_by(skill) %>%
  summarise(avg=mean(rating)) %>%
  semi_join(noEvidenceSkills, by="skill")
avgNoEvidenceSkills

# create the new data for prediction
newdata <- noEvidenceSkills %>% select(skill, count) %>%
  mutate(userId=1142881) %>%
  inner_join(avgNoEvidenceSkills, by="skill") %>%
  select(userId, skill, count, avg)

# compute the skill rating predictions
newdata$predicted <- predict(fitSeq, newdata)

# show how the author would be rated for the following technologies
newdata %>% filter(skill %in% c("tableau", "google-maps", "c++11", "c++17", "ejb",
                                "java-stream", "teradata", "itext", "blockchain",
                                "apache-kafka", "haskell", "go")) %>%
  arrange(desc(predicted))

# show the top 20 skills where the predicted rating is above the average
newdata %>% filter(predicted > avg) %>%
  top_n(20, predicted) %>%
  arrange(desc(predicted))