neural_style_transfer.Rmd
Neural style transfer with Keras.
It is preferable to run this script on a GPU, for speed.
Example result: https://twitter.com/fchollet/status/686631033085677568
Style transfer consists of generating an image with the same “content” as a base image, but with the “style” of a different picture (typically artistic).
This is achieved through the optimization of a loss function that has 3 components: “style loss”, “content loss”, and “total variation loss”:
The total variation loss imposes local spatial continuity between the pixels of the combination image, giving it visual coherence.
The style loss is where the deep learning comes in: it is defined using a deep convolutional neural network. More precisely, it is a sum of L2 distances between the Gram matrices of the representations of the style reference image and the generated image, extracted from different layers of a convnet (trained on ImageNet). The general idea is to capture color/texture information at different spatial scales (fairly large scales, determined by the depth of the layer considered).
The content loss is an L2 distance between the features of the base image (extracted from a deep layer) and the features of the combination image, which keeps the generated image close enough to the original one.
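In symbols (notation added here for clarity; it mirrors the loss functions defined in the code below), write $C$ for the base image, $S$ for the style reference image, and $X$ for the generated combination image. The objective being minimized is

$$
\mathcal{L}(X) = w_c \, \mathcal{L}_{\text{content}}(C, X) + \frac{w_s}{|L|} \sum_{l \in L} \mathcal{L}^{(l)}_{\text{style}}(S, X) + w_{tv} \, \mathcal{L}_{\text{tv}}(X)
$$

where $L$ is the set of style layers and

$$
\mathcal{L}_{\text{content}}(C, X) = \sum_i \big(F^X_i - F^C_i\big)^2, \qquad
\mathcal{L}^{(l)}_{\text{style}}(S, X) = \frac{1}{4 n^2 m^2} \sum_{i,j} \big(G^S_{ij} - G^X_{ij}\big)^2, \qquad
G_{ij} = \sum_k F_{ik} F_{jk},
$$

$$
\mathcal{L}_{\text{tv}}(X) = \sum_{i,j} \Big( (X_{i,j} - X_{i+1,j})^2 + (X_{i,j} - X_{i,j+1})^2 \Big)^{1.25}.
$$

Here $F$ denotes a layer's feature maps flattened into a channels-by-positions matrix, and $n$ and $m$ correspond to the `channels` and `size` constants used in `style_loss()`. The weights $w_c$, $w_s$ and $w_{tv}$ are the `content_weight`, `style_weight` and `total_variation_weight` parameters set below.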
library(keras)
library(purrr)
library(R6)
# Parameters --------------------------------------------------------------
base_image_path <- "neural-style-base-img.png"
style_reference_image_path <- "neural-style-style.jpg"
iterations <- 10
# these are the weights of the different loss components
total_variation_weight <- 1
style_weight <- 1
content_weight <- 0.025
# dimensions of the generated picture.
img <- image_load(base_image_path)
width <- img$size[[1]]
height <- img$size[[2]]
img_nrows <- 400
img_ncols <- as.integer(width * img_nrows / height)
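# e.g. for a hypothetical 800 x 600 (width x height) base image this would give
# img_ncols = as.integer(800 * 400 / 600) = 533, preserving the aspect ratio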
# Functions ---------------------------------------------------------------
# util function to open, resize and format pictures into appropriate tensors
preprocess_image <- function(path){
  img <- image_load(path, target_size = c(img_nrows, img_ncols)) %>%
    image_to_array() %>%
    array_reshape(c(1, dim(.)))
  imagenet_preprocess_input(img)
}
# util function to convert a tensor into a valid image
# also turn BGR into RGB.
deprocess_image <- function(x){
  x <- x[1,,,]
  # Remove zero-center by mean pixel
  x[,,1] <- x[,,1] + 103.939
  x[,,2] <- x[,,2] + 116.779
  x[,,3] <- x[,,3] + 123.68
  # BGR -> RGB
  x <- x[,,c(3,2,1)]
  # clip to interval 0, 255
  x[x > 255] <- 255
  x[x < 0] <- 0
  # rescale to [0, 1] so the result can be plotted with as.raster()
  x[] <- as.integer(x)/255
  x
}
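
# Optional sanity check (not part of the original script): round-trip the base
# image through preprocess_image() and deprocess_image() and plot it, to confirm
# the resizing, mean-centering and BGR/RGB handling before optimizing anything.
check <- preprocess_image(base_image_path)   # 1 x img_nrows x img_ncols x 3 array
plot(as.raster(deprocess_image(check)))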
# Defining the model ------------------------------------------------------
# get tensor representations of our images
base_image <- k_variable(preprocess_image(base_image_path))
style_reference_image <- k_variable(preprocess_image(style_reference_image_path))
# this will contain our generated image
combination_image <- k_placeholder(c(1, img_nrows, img_ncols, 3))
# combine the 3 images into a single Keras tensor
input_tensor <- k_concatenate(list(base_image, style_reference_image,
                                   combination_image), axis = 1)
# build the VGG16 network with our 3 images as input
# the model will be loaded with pre-trained ImageNet weights
model <- application_vgg16(input_tensor = input_tensor, weights = "imagenet",
                           include_top = FALSE)
print("Model loaded.")
nms <- map_chr(model$layers, ~.x$name)
output_dict <- map(model$layers, ~.x$output) %>% set_names(nms)
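# output_dict is a named list mapping each VGG16 layer name (e.g. "block4_conv2")
# to its symbolic output tensor, so individual layers can be looked up by name below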
# compute the neural style loss
# first we need to define 4 util functions
# the gram matrix of an image tensor (feature-wise outer product)
gram_matrix <- function(x){
  features <- x %>%
    k_permute_dimensions(pattern = c(3, 1, 2)) %>%
    k_batch_flatten()
  k_dot(features, k_transpose(features))
}
# the "style loss" is designed to maintain
# the style of the reference image in the generated image.
# It is based on the gram matrices (which capture style) of
# feature maps from the style reference image
# and from the generated image
style_loss <- function(style, combination){
  S <- gram_matrix(style)
  C <- gram_matrix(combination)
  channels <- 3
  size <- img_nrows * img_ncols
  k_sum(k_square(S - C)) / (4 * channels^2 * size^2)
}
# an auxiliary loss function
# designed to maintain the "content" of the
# base image in the generated image
content_loss <- function(base, combination){
  k_sum(k_square(combination - base))
}
# the 3rd loss function, total variation loss,
# designed to keep the generated image locally coherent
total_variation_loss <- function(x){
  y_ij  <- x[, 1:(img_nrows - 1L), 1:(img_ncols - 1L),]
  y_i1j <- x[, 2:(img_nrows), 1:(img_ncols - 1L),]
  y_ij1 <- x[, 1:(img_nrows - 1L), 2:(img_ncols),]
  a <- k_square(y_ij - y_i1j)
  b <- k_square(y_ij - y_ij1)
  k_sum(k_pow(a + b, 1.25))
}
# combine these loss functions into a single scalar
loss <- k_variable(0.0)
layer_features <- output_dict$block4_conv2
base_image_features <- layer_features[1,,,]
combination_features <- layer_features[3,,,]
loss <- loss + content_weight * content_loss(base_image_features,
                                             combination_features)
feature_layers <- c("block1_conv1", "block2_conv1",
                    "block3_conv1", "block4_conv1",
                    "block5_conv1")
for(layer_name in feature_layers){
  layer_features <- output_dict[[layer_name]]
  style_reference_features <- layer_features[2,,,]
  combination_features <- layer_features[3,,,]
  sl <- style_loss(style_reference_features, combination_features)
  loss <- loss + ((style_weight / length(feature_layers)) * sl)
}
loss <- loss + (total_variation_weight * total_variation_loss(combination_image))
# get the gradients of the loss wrt the generated image
grads <- k_gradients(loss, combination_image)[[1]]
f_outputs <- k_function(list(combination_image), list(loss, grads))
eval_loss_and_grads <- function(image){
  image <- array_reshape(image, c(1, img_nrows, img_ncols, 3))
  outs <- f_outputs(list(image))
  list(
    loss_value = outs[[1]],
    grad_values = array_reshape(outs[[2]], dim = length(outs[[2]]))
  )
}
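# Note: optim() operates on a flat numeric vector of parameters, which is why the
# gradient tensor is flattened above to a vector matching the length of the pixels.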
# Loss and gradients evaluator.
#
# This Evaluator class makes it possible
# to compute loss and gradients in one pass
# while retrieving them via two separate functions,
# "loss" and "grads". This is done because optim()
# requires separate functions for the loss and the gradients,
# but computing them separately would be inefficient.
Evaluator <- R6Class(
  "Evaluator",
  public = list(

    loss_value = NULL,
    grad_values = NULL,

    initialize = function() {
      self$loss_value <- NULL
      self$grad_values <- NULL
    },

    loss = function(x){
      loss_and_grad <- eval_loss_and_grads(x)
      self$loss_value <- loss_and_grad$loss_value
      self$grad_values <- loss_and_grad$grad_values
      self$loss_value
    },

    grads = function(x){
      grad_values <- self$grad_values
      self$loss_value <- NULL
      self$grad_values <- NULL
      grad_values
    }
  )
)
evaluator <- Evaluator$new()
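# Note (explanatory, not in the original comments): this caching works because
# optim() requests fn(x) and gr(x) at the same parameter values, so the gradients
# stored during evaluator$loss(x) are still valid when evaluator$grads(x) is called.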
# Run optimization ----------------------------------------------------------
# run L-BFGS over the pixels of the generated image
# so as to minimize the neural style loss

dms <- c(1, img_nrows, img_ncols, 3)

# start from a random image
x <- array(data = runif(prod(dms), min = 0, max = 255) - 128, dim = dms)
for(i in 1:iterations){

  # Run L-BFGS
  opt <- optim(
    array_reshape(x, dim = length(x)), fn = evaluator$loss, gr = evaluator$grads,
    method = "L-BFGS-B",
    control = list(maxit = 15)
  )

  # Print loss value
  print(opt$value)

  # decode the image
  image <- x <- opt$par
  image <- array_reshape(image, dms)

  # plot
  im <- deprocess_image(image)
  plot(as.raster(im))
}
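
# Optionally write the final frame to disk (not part of the original script;
# assumes the png package is installed). deprocess_image() already scales the
# pixel values to [0, 1], which is what png::writePNG() expects.
png::writePNG(im, "neural-style-output.png")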