imdb_fasttext.Rmd
This example demonstrates the use of fasttext for text classification
Based on Joulin et al’s paper: “Bags of Tricks for Efficient Text Classification” https://arxiv.org/abs/1607.01759
Results on IMDB datasets with uni and bi-gram embeddings: Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 CPU Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTx 980M GPU
library(keras)
library(purrr)
# Function Definitions ----------------------------------------------------
create_ngram_set <- function(input_list, ngram_value = 2){
indices <- map(0:(length(input_list) - ngram_value), ~1:ngram_value + .x)
indices %>%
map_chr(~input_list[.x] %>% paste(collapse = "|")) %>%
unique()
}
add_ngram <- function(sequences, token_indice, ngram_range = 2){
ngrams <- map(
sequences,
create_ngram_set, ngram_value = ngram_range
)
seqs <- map2(sequences, ngrams, function(x, y){
tokens <- token_indice$token[token_indice$ngrams %in% y]
c(x, tokens)
})
seqs
}
# Parameters --------------------------------------------------------------
# ngram_range = 2 will add bi-grams features
ngram_range <- 2
max_features <- 20000
maxlen <- 400
batch_size <- 32
embedding_dims <- 50
epochs <- 5
# Data Preparation --------------------------------------------------------
# Load data
imdb_data <- dataset_imdb(num_words = max_features)
# Train sequences
print(length(imdb_data$train$x))
print(sprintf("Average train sequence length: %f", mean(map_int(imdb_data$train$x, length))))
# Test sequences
print(length(imdb_data$test$x))
print(sprintf("Average test sequence length: %f", mean(map_int(imdb_data$test$x, length))))
if(ngram_range > 1) {
# Create set of unique n-gram from the training set.
ngrams <- imdb_data$train$x %>%
map(create_ngram_set) %>%
unlist() %>%
unique()
# Dictionary mapping n-gram token to a unique integer
# Integer values are greater than max_features in order
# to avoid collision with existing features
token_indice <- data.frame(
ngrams = ngrams,
token = 1:length(ngrams) + (max_features),
stringsAsFactors = FALSE
)
# max_features is the highest integer that could be found in the dataset
max_features <- max(token_indice$token) + 1
# Augmenting x_train and x_test with n-grams features
imdb_data$train$x <- add_ngram(imdb_data$train$x, token_indice, ngram_range)
imdb_data$test$x <- add_ngram(imdb_data$test$x, token_indice, ngram_range)
}
# Pad sequences
imdb_data$train$x <- pad_sequences(imdb_data$train$x, maxlen = maxlen)
imdb_data$test$x <- pad_sequences(imdb_data$test$x, maxlen = maxlen)
# Model Definition --------------------------------------------------------
model <- keras_model_sequential()
model %>%
layer_embedding(
input_dim = max_features, output_dim = embedding_dims,
input_length = maxlen
) %>%
layer_global_average_pooling_1d() %>%
layer_dense(1, activation = "sigmoid")
model %>% compile(
loss = "binary_crossentropy",
optimizer = "adam",
metrics = "accuracy"
)
# Fitting -----------------------------------------------------------------
model %>% fit(
imdb_data$train$x, imdb_data$train$y,
batch_size = batch_size,
epochs = epochs,
validation_data = list(imdb_data$test$x, imdb_data$test$y)
)