#=============================================================================
# settings

options(scipen = 20)

#--------------------------------------------------
# packages

library(data.table)
library(ggplot2)
library(stringr)

#=============================================================================
# Helper methods

sigmoid <- function(x){
  # sigmoid function

  1 / (1 + exp(-x))
}

softmax <- function(m, offset_trick = TRUE){
  # softmax

  if(offset_trick){
    rowmaxs <- apply(m, 1, max)
    result <- exp(m - rowmaxs)/rowSums(exp(m - rowmaxs))
  } else{
    result <- exp(m)/rowSums(exp(m))
  }

  return(result)
}

#=============================================================================
# Datasets

# Read train & test data
train <- fread("./Data/train.csv")
test <- fread("./Data/test.csv")

# Insert Label for plots
train[, Label := ifelse(IsStairs, "Stairs", "Not Stairs")]
train[, Label := factor(Label, levels=c("Not Stairs", "Stairs"))]
test[, Label := ifelse(IsStairs, "Stairs", "Not Stairs")]
test[, Label := factor(Label, levels=c("Not Stairs", "Stairs"))]

# Reshape from wide to tall
trainTall <- melt(
  train,
  id.vars = c("ImageId", "IsStairs", "Label"),
  measure.vars = c("R1C1", "R1C2", "R2C1", "R2C2"),
  value.name = "Intensity",
  variable.name = "Pixel"
)

testTall <- melt(
  test,
  id.vars = c("ImageId", "IsStairs", "Label"),
  measure.vars = c("R1C1", "R1C2", "R2C1", "R2C2"),
  value.name = "Intensity",
  variable.name = "Pixel"
)

trainTall[, `:=`(Row = as.integer(str_extract(Pixel, "(?<=R)\\d")), Col = as.integer(str_extract(Pixel, "(?<=C)\\d")))]
testTall[, `:=`(Row = as.integer(str_extract(Pixel, "(?<=R)\\d")), Col = as.integer(str_extract(Pixel, "(?<=C)\\d")))]

#=============================================================================
# Figures

#--------------------------------------------------
# fig1: stairs vs not stairs

fig1data <- trainTall[ImageId %in% train$ImageId[1:12]]
fig1 <- ggplot(fig1data, aes(x = Col, y = 2-Row, fill = Intensity))+geom_tile(color="white")+
  scale_fill_gradient(low = "white", high = "black", limits=c(0, 255), guide=FALSE)+
  theme_void()+labs(x="", y="")+
  facet_wrap(~ImageId, nrow = 2, labeller = labeller(ImageId = setNames(as.character(fig1data$Label), fig1data$ImageId)))

#--------------------------------------------------
# fig2: image with pixels labeled x1, x2, x3, x4

image1 <- trainTall[ImageId == 1]
image1[Row == 1 & Col == 1, PixelLabel := paste0("x1 = ", Intensity)]
image1[Row == 1 & Col == 2, PixelLabel := paste0("x2 = ", Intensity)]
image1[Row == 2 & Col == 1, PixelLabel := paste0("x3 = ", Intensity)]
image1[Row == 2 & Col == 2, PixelLabel := paste0("x4 = ", Intensity)]

fig2lables <- image1
fig2lables[, `:=`(x = Col+0.45, y = 1.55 - Row)]

fig2 <- ggplot(image1, aes(x = Col, y = 2-Row, fill = Intensity))+geom_tile(color="white")+
  geom_text(data = fig2lables, aes(x = x, y = y, label = PixelLabel), color = "goldenrod3", size = 5, hjust = 1, vjust = 0)+
  scale_fill_gradient(low = "white", high = "black", limits=c(0, 255), guide=FALSE)+
  theme_void()+labs(x="", y="")+
  labs(title = image1$Label[1])+
  theme(plot.title = element_text(hjust = 0.5, vjust = -50))

#=============================================================================
# NNet from scratch

# Build a nnet with and input layer, hidden layer, and output layer
# input layer: 1st node = 1 (for bias), nodes 2-4 correspond to features R1C1, R1C2, R2C1, R2C2
# hidden layer: 3 nodes. 1st node = 1 (for bias), nodes 2-3 correspond to incoming signals
# output layer: 2 nodes
# Optimize categorical cross entropy error
# Apply sigmoid activation to the hidden layer, softmax to the output layer

#--------------------------------------------------
# Set Y, X1, stepsize

Y <- cbind(Stairs = 1 * train$IsStairs, NotStairs = 1 * !train$IsStairs)
X1 <- as.matrix(train[, list(R1C1, R1C2, R2C1, R2C2)])

# Prepend X1 with column of 1s for bias terms
X1 <- cbind(b = 1, X1)  # Include column of all 1s

# Set setpsize
stepsize = 0.1

#--------------------------------------------------
# Weight initialization

set.seed(1)
W1 <- matrix(runif(min = -0.01, max = 0.01, n = 5*2), nrow=5)
W2 <- matrix(runif(min = -0.01, max = 0.01, n = 3*2), nrow=3)

#--------------------------------------------------
# Forward Pass (first iteration)

Z1 <- X1 %*% W1
X2 <- cbind(b = 1, sigmoid(Z1))
Z2 <- X2 %*% W2
Yhat <- softmax(Z2)

# Calculate Categorical Cross Entropy Error
loss <- mean(-rowSums(Y * log(Yhat)))

# Measure loss and classification accuracy
predicted_class <- apply(Yhat, MARGIN = 1, FUN = which.max)
true_class <- apply(Y, MARGIN = 1, FUN = which.max)
accuracy <- mean(predicted_class == true_class)
print(paste("Loss:", loss, "| accuracy:", accuracy))

#--------------------------------------------------
# Backprop (first iteration)

# Backprop
delta1 <- Yhat - Y  # Partial CE/Partial Z2
delta2 <- delta1 %*% t(W2)  # Partial CE/Partial X2
delta3 <- delta2[, -1] * (X2[, -1] * (1 - X2[, -1]))  # Partial CE/Partial Z1

# Gradients
gradW2 <- t(X2) %*% delta1 / nrow(X1)
gradW1 <- t(X1) %*% delta3 / nrow(X1)

# Weight updates
W1 <- W1 - gradW1 * stepsize
W2 <- W2 - gradW2 * stepsize

# Measure loss and classification accuracy
predicted_class <- apply(Yhat, MARGIN = 1, FUN = which.max)
true_class <- apply(Y, MARGIN = 1, FUN = which.max)
accuracy <- mean(predicted_class == true_class)
print(paste("Loss:", loss, "| accuracy on training data:", accuracy))

#--------------------------------------------------
# Repeat

epochs <- 1500
for(epoch in 2:epochs){

  # Forward Pass
  Z1 <- X1 %*% W1
  X2 <- cbind(b = 1, sigmoid(Z1))
  Z2 <- X2 %*% W2
  Yhat <- softmax(Z2)

  # Calculate Categorical Cross Entropy Error
  loss <- mean(-rowSums(Y * log(Yhat)))

  # Backprop
  delta1 <- Yhat - Y  # Partial CE/Partial Z2
  delta2 <- delta1 %*% t(W2[-1, ])  # Partial CE/Partial X2[, -1]
  delta3 <- delta2 * (X2[, -1] * (1 - X2[, -1]))  # Partial CE/Partial Z1

  # Gradients
  gradW2 <- t(X2) %*% delta1 / 400
  gradW1 <- t(X1) %*% delta3 / 400

  # Weight updates
  W1 <- W1 - gradW1 * stepsize
  W2 <- W2 - gradW2 * stepsize

  # Print the loss and accuracy
  if((epoch - 2) %% 100 == 0){
    predicted_class <- apply(Yhat, MARGIN = 1, FUN = which.max)
    true_class <- apply(Y, MARGIN = 1, FUN = which.max)
    accuracy <- mean(predicted_class == true_class)

    print(paste("Epoch:", epoch, "Loss:", loss, "| accuracy on training data:", accuracy))
  }
}

#--------------------------------------------------
# Make predictions on the test data

Y_test <- cbind(Stairs = 1 * test$IsStairs, NotStairs = 1 * !test$IsStairs)
X1_test <- as.matrix(test[, list(R1C1, R1C2, R2C1, R2C2)])

# Prepend X1 with column of 1s for bias terms
X1_test <- cbind(b = 1, X1_test)  # Include column of all 1s

# Forward pass
Z1_test <- X1_test %*% W1
X2_test <- cbind(b = 1, sigmoid(Z1_test))
Z2_test <- X2_test %*% W2
Yhat_test <- softmax(Z2_test)

# Check results
predicted_class <- apply(Yhat_test, MARGIN = 1, FUN = which.max)
true_class <- apply(Y_test, MARGIN = 1, FUN = which.max)
accuracy <- mean(predicted_class == true_class)
print(paste("accuracy on test data:", accuracy))  # 0.87

##################################################################################################################################
# Check results using keras

library(keras)

Y <- cbind(Stairs = 1 * train$IsStairs, NotStairs = 1 * !train$IsStairs)
X1 <- as.matrix(train[, list(R1C1, R1C2, R2C1, R2C2)])

set.seed(1)
W1 <- matrix(runif(min = -0.01, max = 0.01, n = 5*2), nrow=5)
W2 <- matrix(runif(min = -0.01, max = 0.01, n = 3*2), nrow=3)

model <- keras_model_sequential()
model <- layer_dense(
  object = model,
  input_shape = 4L,
  use_bias = TRUE,
  units = 2L,
  activation = 'sigmoid',
  weights = list(W1[2:5,], array(W1[1,]))
)
model <- layer_dense(
  object = model,
  input_shape = 2L,
  use_bias = TRUE,
  units = 2L,
  activation = 'softmax',
  weights = list(W2[2:3,], as.array(W2[1,]))
)
model <- compile(
  object = model,
  loss = 'categorical_crossentropy',
  optimizer = optimizer_sgd(lr=0.1),
  metrics = c('accuracy')
)

# Check the weights
get_weights(model)

# [[1]]
# [,1]         [,2]
# [1,] -0.002557522  0.008893506
# [2,]  0.001457067  0.003215956
# [3,]  0.008164156  0.002582281
# [4,] -0.005966362 -0.008764274
#
# [[2]]
# [1] -0.004689827  0.007967793
#
# [[3]]
# [,1]           [,2]
# [1,] -0.006468865  0.00539682852
# [2,]  0.003740457 -0.00004601516
#
# [[4]]
# [1] -0.005880509 -0.002317926

# Check the first few predictions from the initial nnet before training
head(predict_on_batch(object = model, x = X1))
#           [,1]      [,2]
# [1,] 0.4986581 0.5013419
# [2,] 0.4982750 0.5017250
# [3,] 0.4974762 0.5025238
# [4,] 0.4982801 0.5017199
# [5,] 0.4977485 0.5022514
# [6,] 0.4986086 0.5013914

# Evaluate initial model (our model has loss = 0.6934985)
evaluate(object = model, x = X1, y = Y, batch_size = 400)  # 0.6934986

# Run one iteration of gradient descent
train_on_batch(object = model, x = X1, y = Y)

# Check the weights again
get_weights(model)

# [[1]]
#              [,1]          [,2]
# [1,] -0.002025512  0.0087154731
# [2,]  0.002760389  0.0032557009
# [3,]  0.003623125  0.0003690708
# [4,] -0.010618099 -0.0107602663
#
# [[2]]
# [1] -0.004693955  0.007963512
#
# [[3]]
#                [,1]        [,2]
# [1,] -0.00490462780 0.003832591
# [2,]  0.00003715511 0.003657288
#
# [[4]]
# [1] -0.004931618 -0.003266816