# ---
# "Useful one-function R packages, big data solutions, and a message from Yoda"
# Eduardo García-Portugués
# ---

### Abstract

# As the title reads, in this heterogeneous session we will see three topics
# of independent interest. The first is a collection of three simple and
# useful one-function R packages that I use regularly in my coding workflow.
# The second collects some approaches to handling and performing linear
# regression with big data. The third brings in the freaky component: it
# presents tools to display graphical information in plain ASCII, from
# bivariate contours to messages from Yoda!

### Required packages

install.packages(c("viridis", "microbenchmark", "multcomp", "manipulate",
                   "ffbase", "biglm", "leaps", "txtplot", "NostalgiR",
                   "cowsay"), dependencies = TRUE)

# 1. Some simple and useful R packages

## 1.1 Color palettes with viridis

### Chunk 1

# MATLAB's color palette
jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan",
                                 "#7FFF7F", "yellow", "#FF7F00", "red",
                                 "#7F0000"))

# Plot for palettes
testPalette <- function(col, n = 200, ...) {
  image(1:n, 1, as.matrix(1:n), col = get(col)(n = n, ...),
        main = col, xlab = "", ylab = "", axes = FALSE)
}

# Color palettes comparison
par(mfrow = c(2, 3))
res <- sapply(c("rainbow", "topo.colors", "terrain.colors", "heat.colors",
                "cm.colors", "jet.colors"), testPalette)

### Chunk 2

library(viridisLite)

# Color palettes comparison
par(mfrow = c(2, 3))
res <- sapply(c("viridis", "magma", "inferno", "plasma", "cividis"),
              testPalette)

### Chunk 3

# Reverse palette
par(mfrow = c(1, 2))
testPalette("viridis", direction = 1)
testPalette("viridis", direction = -1)

# Truncate color range
par(mfrow = c(1, 2))
testPalette("viridis", begin = 0, end = 1)
testPalette("viridis", begin = 0.25, end = 0.75)

# Adjust transparency
par(mfrow = c(1, 2))
testPalette("viridis", alpha = 1)
testPalette("viridis", alpha = 0.5)

### Chunk 4

library(viridis)
library(ggplot2)

# Using scale_color_viridis
ggplot(mtcars, aes(wt, mpg)) +
  geom_point(size = 4, aes(color = factor(cyl))) +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

# Using scale_fill_viridis
dat <- data.frame(x = rnorm(1e4), y = rnorm(1e4))
ggplot(dat, aes(x = x, y = y)) +
  geom_hex() + coord_fixed() +
  scale_fill_viridis() + theme_bw()

## 1.2 Benchmarking with microbenchmark

### Chunk 1

# Using proc.time
time <- proc.time()
for (i in 1:100) rnorm(100)
time <- proc.time() - time
time # "elapsed" is the 'real' elapsed time since the process was started

# Using system.time - a wrapper for the above code
system.time({for (i in 1:100) rnorm(100)})

### Chunk 2

# Data and mean
n <- 3
m <- 10
X <- matrix(1:(n * m), nrow = n, ncol = m)
mu <- colMeans(X)
# We assume mu is given in the time comparisons

### Chunk 3

# Some approaches:

# 1) sweep
Y1 <- sweep(x = X, MARGIN = 2, STATS = mu, FUN = "-")

# 2) apply (+ transpose to maintain the arrangement of X)
Y2 <- t(apply(X, 1, function(x) x - mu))

# 3) scale
Y3 <- scale(X, center = mu, scale = FALSE)

# 4) loop
Y4 <- matrix(0, nrow = n, ncol = m)
for (j in 1:m) Y4[, j] <- X[, j] - mu[j]

# 5) magic (my favourite!)
Y5 <- t(t(X) - mu)
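
# Why does 5) work? Because of R's column-major recycling: t(X) is m x n and
# mu has as many entries as t(X) has rows, so t(X) - mu subtracts mu[i] from
# the i-th row of t(X), that is, from the i-th column of X. The following
# transpose-free version exploits the same recycling idea (Y6 is just an
# illustrative name):
Y6 <- X - rep(mu, each = n)

# Quick sanity check that all the approaches agree (scale() adds attributes,
# hence we compare values only)
all(sapply(list(Y2, Y3, Y4, Y5, Y6),
           function(Y) max(abs(Y - Y1)) < 1e-15))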
### Chunk 4

# Test speed
library(microbenchmark)
bench <- microbenchmark(
  sweep(x = X, MARGIN = 2, STATS = mu, FUN = "-"),
  t(apply(X, 1, function(x) x - mu)),
  scale(X, center = mu, scale = FALSE),
  {for (j in 1:m) Y4[, j] <- X[, j] - mu[j]; Y4},
  t(t(X) - mu)
)

# Printing a microbenchmark object gives a numerical summary
bench
# Notice the last column. It only appears if the multcomp package is
# installed. It provides a statistical ranking (accounting for which method
# is significantly slower or faster) using multcomp::cld

# Adjusting display
print(bench, unit = "ms", signif = 2)
# unit = "ns" (nanosecs), "us" (microsecs), "ms" (millisecs), "s" (secs)

# Graphical methods for the microbenchmark object
# Raw time data
par(mfrow = c(1, 1))
plot(bench, names = c("sweep", "apply", "scale", "for", "recycling"))
# Employs time logarithms
boxplot(bench, names = c("sweep", "apply", "scale", "for", "recycling"))
# Violin plots
autoplot(bench)

# microbenchmark uses 100 evaluations and 2 warmup evaluations (these are not
# measured) by default. Here is how to change these defaults:
bench2 <- microbenchmark(
  sweep(x = X, MARGIN = 2, STATS = mu, FUN = "-"),
  t(apply(X, 1, function(x) x - mu)),
  scale(X, center = mu, scale = FALSE),
  {for (j in 1:m) Y4[, j] <- X[, j] - mu[j]; Y4},
  t(t(X) - mu),
  times = 1000, control = list(warmup = 10)
)
bench2

# Let's check the accuracy of the methods as well as their timings. For that
# we need to define a check function that takes as input a LIST with each slot
# representing the output of each method. The check must return TRUE or FALSE
check1 <- function(results) {
  all(sapply(results[-1], function(x) {
    max(abs(x - results[[1]]))
  }) < 1e-15) # Compares all the results against the first one
}

# Example with check function
bench3 <- microbenchmark(
  sweep(x = X, MARGIN = 2, STATS = mu, FUN = "-"),
  t(apply(X, 1, function(x) x - mu)),
  scale(X, center = mu, scale = FALSE),
  {for (j in 1:m) Y4[, j] <- X[, j] - mu[j]; Y4},
  t(t(X) - mu),
  check = check1
)
bench3

## 1.3 Quick animations with manipulate

### Chunk 1

library(manipulate)

# A simple example
manipulate({
  plot(1:n, sin(2 * pi * (1:n) / 100), xlim = c(1, 100), ylim = c(-1, 1),
       type = "o", xlab = "x", ylab = "y")
  lines(1:100, sin(2 * pi * 1:100 / 100), col = 2)
}, n = slider(min = 1, max = 100, step = 1, ticks = TRUE))

### Chunk 2

# Illustrating several types of controls using the kernel density estimator
manipulate({

  # Sample data
  if (newSample) {
    set.seed(sample(1:1000, size = 1))
  } else {
    set.seed(12345678)
  }
  samp <- rnorm(n = n, mean = 0, sd = 1.5)

  # Density estimate
  plot(density(x = samp, bw = h, kernel = kernel),
       xlim = c(-x.rang, x.rang) / 2, ylim = c(0, y.max))

  # Show data
  if (rugSamp) {
    rug(samp)
  }

  # Add reference density
  if (realDensity) {
    tt <- seq(-x.rang / 2, x.rang / 2, l = 200)
    lines(tt, dnorm(x = tt, mean = 0, sd = 1.5), col = 2)
  }

}, n = slider(min = 1, max = 250, step = 5, initial = 10),
h = slider(0.05, 2, initial = 0.5, step = 0.05),
x.rang = slider(1, 10, initial = 6, step = 0.5),
y.max = slider(0.1, 2, initial = 0.5, step = 0.1),
kernel = picker("gaussian", "epanechnikov", "rectangular", "triangular",
                "biweight", "cosine", "optcosine"),
rugSamp = checkbox(TRUE, "Show rug"),
realDensity = checkbox(TRUE, "Draw real density"),
newSample = button("New sample!")
)

### Chunk 3

# Another example: rotating 3D graphs without using rgl

# Mexican hat
x <- seq(-10, 10, l = 50)
y <- x
f <- function(x, y) {r <- sqrt(x^2 + y^2); 10 * sin(r) / r}
z <- outer(x, y, f)

# Colors
nrz <- nrow(z)
ncz <- ncol(z)
zfacet <- z[-1, -1] + z[-1, -ncz] + z[-nrz, -1] + z[-nrz, -ncz]
col <- viridis(100)
facetcol <- cut(zfacet, length(col))
col <- col[facetcol]

# 3D plot
manipulate(
  persp(x, y, z, theta = theta, phi = phi, expand = 0.5, col = col,
        ticktype = "detailed", xlab = "X", ylab = "Y", zlab = "Sinc(r)"),
  theta = slider(-180, 180, initial = 30, step = 5),
  phi = slider(-90, 90, initial = 30, step = 5))
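
# As a small exercise combining viridis (Section 1.1) and manipulate
# (Section 1.3), here is the same perspective plot with an extra picker that
# switches the color palette. This is a sketch reusing x, y, z, and zfacet
# from above; palName is just an illustrative control name
manipulate({
  pal <- get(palName)(100) # Resolve the palette function from its name
  persp(x, y, z, theta = theta, phi = phi, expand = 0.5,
        col = pal[cut(zfacet, 100)], ticktype = "detailed",
        xlab = "X", ylab = "Y", zlab = "Sinc(r)")
}, theta = slider(-180, 180, initial = 30, step = 5),
phi = slider(-90, 90, initial = 30, step = 5),
palName = picker("viridis", "magma", "inferno", "plasma", "cividis"))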
# 2. Handling big data in R

## 2.1 The ff and ffbase packages

### Chunk 1

# Not really "big data", but for the sake of illustration
set.seed(12345)
n <- 1e6
p <- 10
beta <- seq(-1, 1, length.out = p)^5

# Data for linear regression
x1 <- matrix(rnorm(n * p), nrow = n, ncol = p)
x1[, p] <- 2 * x1[, 1] + rnorm(n, sd = 0.1)
x1[, p - 1] <- 2 - x1[, 2] + rnorm(n, sd = 0.5)
y1 <- 1 + x1 %*% beta + rnorm(n)
bigData1 <- data.frame("resp" = y1, "pred" = x1)

# Data for logistic regression
x2 <- matrix(rnorm(n * p), nrow = n, ncol = p)
y2 <- rbinom(n = n, size = 1, prob = 1 / (1 + exp(-(1 + x2 %*% beta))))
bigData2 <- data.frame("resp" = y2, "pred" = x2)

# Sizes of objects
print(object.size(bigData1), units = "Mb")
print(object.size(bigData2), units = "Mb")

# Save files to disk to emulate the situation with big data
write.csv(x = bigData1, file = "bigData1.csv", row.names = FALSE)
write.csv(x = bigData2, file = "bigData2.csv", row.names = FALSE)

# Read files using ff
library(ffbase) # Imports ff
bigData1ff <- read.table.ffdf(file = "bigData1.csv", header = TRUE, sep = ",")
bigData2ff <- read.table.ffdf(file = "bigData2.csv", header = TRUE, sep = ",")

# Recall: the *.csv files are not copied into RAM!
print(object.size(bigData1ff), units = "Kb")
print(object.size(bigData2ff), units = "Kb")

# Delete the csv files on disk
file.remove(c("bigData1.csv", "bigData2.csv"))

### Chunk 2

# Operations on ff objects work "almost" as with regular data.frames
class(bigData1ff)
class(bigData1ff[, 1])
bigData1ff[1:5, 1] <- rnorm(5)
# See ?ffdf for more allowed operations

# Filter of data frames
ffwhich(bigData1ff, bigData1ff$resp > 5)

## 2.2 Regression using biglm and friends

### Chunk 1

library(biglm)

# bigglm.ffdf has a very similar syntax to glm - but the formula interface
# does not always work as expected:
# bigglm.ffdf(formula = resp ~ ., data = bigData1ff) # Does not work
# bigglm.ffdf(formula = resp ~ pred.1 + pred.2, data = bigData1ff) # Does
# work, but it is not very convenient for a large number of predictors

# Hack for automatic inclusion of all the predictors
f <- formula(paste("resp ~", paste(names(bigData1ff)[-1], collapse = " + ")))

# bigglm.ffdf's call
biglmMod <- bigglm.ffdf(formula = f, data = bigData1ff, family = gaussian())
class(biglmMod)

# lm's call
lmMod <- lm(formula = resp ~ ., data = bigData1)

# The reduction in size of the resulting object is remarkable
print(object.size(biglmMod), units = "Kb")
print(object.size(lmMod), units = "Mb")

# Summaries
s1 <- summary(biglmMod)
s2 <- summary(lmMod)
s1
s2
# The summary of a biglm object yields slightly different significances for
# the coefficients than for lm. The reason is that biglm employs N(0, 1)
# approximations for the distributions of the t-tests instead of the exact
# t distribution. Obviously, if n is large, the differences are negligible.

# Extract coefficients
coef(biglmMod)

# AIC
AIC(biglmMod, k = 2)

# Prediction works "as usual"
predict(biglmMod, newdata = bigData1[1:5, ])
# newdata must contain a column for the response!
# predict(biglmMod, newdata = bigData2[1:5, -1]) # Error

# Update the model with more training data - this is key for chunking the data
update(biglmMod, moredata = bigData1[1:100, ])
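
# update() is what enables fitting models on datasets that do not fit in
# memory: fit on a first chunk and feed the remaining chunks one at a time.
# A minimal sketch of the idea with in-memory chunks of bigData1 (chunkSize
# and chunkMod are illustrative names; with truly big data each chunk would
# be read from disk instead)
chunkSize <- 1e4
chunkMod <- biglm(formula = f, data = bigData1[1:chunkSize, ])
for (i in seq(chunkSize + 1, n, by = chunkSize)) {
  rows <- i:min(i + chunkSize - 1, n)
  chunkMod <- update(chunkMod, moredata = bigData1[rows, ])
}
# The chunked fit recovers the coefficients of the full lm fit
max(abs(coef(chunkMod) - coef(lmMod)))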
### Chunk 2

# Model selection adapted to big data models
library(leaps)
reg <- regsubsets(biglmMod, nvmax = p, method = "exhaustive")
plot(reg)
# Plots from the best model (top row) to the worst model (bottom row). Black
# color means that the predictor is included, white stands for an excluded
# predictor

# Get the model with lowest BIC
subs <- summary(reg)
subs$which
subs$bic
subs$which[which.min(subs$bic), ]

### Chunk 3

# Same comments for the formula framework - this is the hack for automatic
# inclusion of all the predictors
f <- formula(paste("resp ~", paste(names(bigData2ff)[-1], collapse = " + ")))

# bigglm.ffdf's call
bigglmMod <- bigglm.ffdf(formula = f, data = bigData2ff, family = binomial())

# glm's call
glmMod <- glm(formula = resp ~ ., data = bigData2, family = binomial())

# Compare sizes
print(object.size(bigglmMod), units = "Kb")
print(object.size(glmMod), units = "Mb")

# Summaries
s1 <- summary(bigglmMod)
s2 <- summary(glmMod)
s1
s2

# AIC
AIC(bigglmMod, k = 2)

# Prediction works "as usual"
predict(bigglmMod, newdata = bigData2[1:5, ], type = "response")
# predict(bigglmMod, newdata = bigData2[1:5, -1]) # Error

# 3. ASCII fun in R

## 3.1 Text-based graphs with txtplot

### Chunk 1

library(txtplot) # txt* functions

# Generate common data
x <- rnorm(100)
y <- 1 + x + rnorm(100, sd = 1)
let <- as.factor(sample(letters, size = 1000, replace = TRUE))

# txtplot
plot(x, y)
txtplot(x, y)

# txtboxplot
boxplot(x, horizontal = TRUE)
txtboxplot(x)

# txtbarchart
barplot(table(let))
txtbarchart(x = let)

# txtcurve
curve(expr = sin(x), from = 0, to = 2 * pi)
txtcurve(expr = sin(x), from = 0, to = 2 * pi)

# txtacf
acf(x)
txtacf(x)

### Chunk 2

library(NostalgiR) # nos.* functions

# Mexican hat
xx <- seq(-10, 10, l = 50)
yy <- xx
f <- function(x, y) {r <- sqrt(x^2 + y^2); 10 * sin(r) / r}
zz <- outer(xx, yy, f)

# nos.density
plot(density(x)); rug(x)
nos.density(x)

# nos.hist
hist(x)
nos.hist(x)

# nos.ecdf
plot(ecdf(x))
nos.ecdf(x)

# nos.qqnorm
qqnorm(x); qqline(x)
nos.qqnorm(x)

# nos.qqplot
qqplot(x, y)
nos.qqplot(x, y)

# nos.contour
contour(xx, yy, zz)
nos.contour(data = zz, xmin = min(xx), xmax = max(xx), ymin = min(yy),
            ymax = max(yy))

# nos.image
image(zz, col = viridis(50))
nos.image(data = zz, xmin = min(xx), xmax = max(xx), ymin = min(yy),
          ymax = max(yy))

## 3.2 Cute animals with cowsay

### Chunk 1

library(cowsay)

# Random fortune
set.seed(363)
say("fortune", by = "random")

# All animals
# sapply(names(animals), function(x) say(x, by = x))

# Annoy the users of your package with good old clippy
say(what = "It looks like you're writing a letter!",
    by = "clippy", type = "warning")

# A message from Yoda
say("Participating in the Coding Club UC3M you must. Yes, hmmm.",
    by = "yoda")
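
# Since say() emits plain ASCII through message() by default, its output can
# be captured as character lines and reused, e.g. in a package startup
# message. A minimal sketch with base R's capture.output(); type = "message"
# is needed precisely because say() does not print to the standard output
yoda <- capture.output(say("Do or do not. There is no try.", by = "yoda"),
                       type = "message")
writeLines(yoda)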