--- title: "Generate Nietzsche's writing - LSTM" author: "Dr. Hua Zhou @ UCLA" date: "3/7/2019" output: html_document: toc: true toc_depth: 4 --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r} sessionInfo() ```

![](./nietzsche.jpg){width=200px}

Source: .

## Data preparation

Download Nietzsche's writing from <https://s3.amazonaws.com/text-datasets/nietzsche.txt>:

```{r}
library(keras)
library(readr)
library(stringr)
library(purrr)
library(tokenizers)

# Parameters --------------------------------------------------------------

maxlen <- 40

# Data Preparation --------------------------------------------------------

# Retrieve text
path <- get_file(
  'nietzsche.txt',
  origin = 'https://s3.amazonaws.com/text-datasets/nietzsche.txt'
)
read_lines(path) %>% head()
```

Parse the text into characters:

```{r}
# Load, collapse, and tokenize text
text <- read_lines(path) %>%
  str_to_lower() %>%
  str_c(collapse = "\n") %>%
  tokenize_characters(strip_non_alphanum = FALSE, simplify = TRUE)

print(sprintf("corpus length: %d", length(text)))
text[1:100]
```

Find the unique characters:

```{r}
chars <- text %>%
  unique() %>%
  sort()

print(sprintf("total chars: %d", length(chars)))
chars
```

Cut the text into semi-redundant (overlapping, stride 3) sequences of `maxlen` characters, each paired with the character that follows it:

```{r}
# Cut the text in semi-redundant sequences of maxlen characters
dataset <- map(
  seq(1, length(text) - maxlen - 1, by = 3),
  ~list(sentence = text[.x:(.x + maxlen - 1)], next_char = text[.x + maxlen])
)
dataset <- transpose(dataset)
dataset$sentence[[1]]
dataset$sentence[[2]]
dataset$sentence[[3]]
dataset$next_char[[1]]
dataset$next_char[[2]]
dataset$next_char[[3]]
```

Turn the characters into one-hot encodings. Each sentence is represented by a 40-by-57 binary matrix (`maxlen` positions by `length(chars)` distinct characters), and each target character by a length-57 indicator vector.

```{r}
# Vectorization
X <- array(0, dim = c(length(dataset$sentence), maxlen, length(chars)))
y <- array(0, dim = c(length(dataset$sentence), length(chars)))

for (i in 1:length(dataset$sentence)) {
  X[i, , ] <- sapply(chars, function(x) {
    as.integer(x == dataset$sentence[[i]])
  })
  y[i, ] <- as.integer(chars == dataset$next_char[[i]])
}

X[1, , ]
y[1, ]
```

## Model specification

A single LSTM layer with 128 units feeds a dense layer with a softmax activation over the character set.

```{r}
# Model Definition --------------------------------------------------------

model <- keras_model_sequential()

model %>%
  layer_lstm(units = 128, input_shape = c(maxlen, length(chars))) %>%
  layer_dense(length(chars)) %>%
  layer_activation("softmax")

optimizer <- optimizer_rmsprop(lr = 0.01)

model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer
)

summary(model)
```

## Training and evaluation

Train for 20 passes over the data. After each pass, sample 400 characters at several temperatures (diversities) from a randomly chosen seed sequence. The `sample_mod()` helper draws the next character from the temperature-scaled predictive distribution; see the note at the end of this document for how temperature reshapes that distribution.

```{r}
# Training & Results -------------------------------------------------------

# Sample the index of the next character from the predicted distribution.
# A higher temperature flattens the distribution toward uniform over the
# character set; a lower temperature concentrates it on the likeliest characters.
sample_mod <- function(preds, temperature = 1) {
  preds <- log(preds) / temperature
  exp_preds <- exp(preds)
  preds <- exp_preds / sum(exp_preds)

  rmultinom(1, 1, preds) %>%
    as.integer() %>%
    which.max()
}

system.time({
for (iteration in 1:20) {

  cat(sprintf("iteration: %02d ---------------\n\n", iteration))

  model %>% fit(
    X, y,
    batch_size = 128,
    epochs = 1
  )

  for (diversity in c(0.2, 0.5, 1, 1.2)) {

    cat(sprintf("diversity: %f ---------------\n\n", diversity))

    # Random seed sequence from the corpus
    start_index <- sample(1:(length(text) - maxlen), size = 1)
    sentence <- text[start_index:(start_index + maxlen - 1)]
    generated <- ""

    for (i in 1:400) {

      # One-hot encode the current seed and predict the next character
      x <- sapply(chars, function(x) {
        as.integer(x == sentence)
      })
      x <- array_reshape(x, c(1, dim(x)))

      preds <- predict(model, x)
      next_index <- sample_mod(preds, diversity)
      next_char <- chars[next_index]

      # Append the sampled character and slide the window forward
      generated <- str_c(generated, next_char, collapse = "")
      sentence <- c(sentence[-1], next_char)

    }

    cat(generated)
    cat("\n\n")

  }
}
})
```
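
A note on the temperature (diversity) parameter used above. `sample_mod()` rescales the predicted probabilities as $p_i^{(T)} \propto \exp(\log p_i / T)$ before sampling, so small $T$ approaches greedy (argmax) decoding while large $T$ approaches uniform sampling over the character set. The chunk below is a minimal, self-contained illustration on a made-up probability vector (it is not output from the trained model; the numbers are chosen only for demonstration).

```{r}
# Illustrate temperature scaling on a toy predictive distribution.
# (Hypothetical probabilities; not produced by the trained model.)
rescale_temp <- function(preds, temperature) {
  scaled <- exp(log(preds) / temperature)
  scaled / sum(scaled)
}

p <- c(a = 0.70, b = 0.20, c = 0.07, d = 0.03)

round(rescale_temp(p, 0.2), 3)  # sharper: nearly all mass on "a"
round(rescale_temp(p, 1.0), 3)  # unchanged: identical to p
round(rescale_temp(p, 1.2), 3)  # flatter: closer to uniform
```

This is why the low-diversity samples printed during training look repetitive but coherent, while the high-diversity samples are more varied but noisier.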
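
Optionally, the fitted model can be saved so the generation loop can be rerun later without retraining. This is a minimal sketch using keras's `save_model_hdf5()` and `load_model_hdf5()`; the file name `nietzsche_lstm.h5` is an arbitrary choice, not something used elsewhere in this document.

```{r, eval=FALSE}
# Save the fitted model to disk and reload it later (file name is arbitrary).
save_model_hdf5(model, "nietzsche_lstm.h5")
model <- load_model_hdf5("nietzsche_lstm.h5")
```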