# Split BX

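This notebook splits the BookCrossing (BX) ratings data into training and test sets for a train-test sweep. For both the explicit and the implicit ratings, it samples 5000 test users who have at least 10 ratings, holds out 5 randomly chosen ratings per test user as test data, and writes all remaining ratings as training data.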

In [None]:
library(readr)
library(dplyr)

In [None]:
# Set the `repr.matrix.max.rows` display option to 20 (presumably the cap on
# rows shown when a table is rendered in the notebook).
options(list(repr.matrix.max.rows = 20))

## Explicit Ratings

In [None]:
# Load the explicit BX ratings (integer user ID, integer book ID, numeric
# rating) and switch to the shorter `user` / `item` column names.
ratings <- rename(
  read_csv("build/bx-ratings.csv", col_names = TRUE, col_types = "iid"),
  user = userID,
  item = bookID
)

In [None]:
# Show the size of the explicit ratings table as (rows, columns).
ratings %>% dim()

In [None]:
# Choose the explicit-feedback test users: count ratings per user, keep the
# users with at least 10 ratings, and draw 5000 of them at random.
explicit.test.users <- ratings %>%
  count(user) %>%
  rename(nratings = n) %>%
  filter(nratings >= 10) %>%
  sample_n(5000)
# Display the sampled users and confirm how many were drawn.
explicit.test.users
dim(explicit.test.users)

In [None]:
# Give every rating a random within-user rank: for each user, `urid` is a
# random permutation of 1..(number of ratings by that user).
ratings.group <- ratings %>%
  group_by(user) %>%
  mutate(urid = sample.int(n())) %>%
  ungroup()

In [None]:
# Build the explicit test set: for each sampled test user, hold out the 5
# ratings whose random within-user rank (`urid`) is 1..5.
test.ratings <- explicit.test.users %>%
  select(user) %>%
  # Join on `user` explicitly rather than relying on a natural join over
  # whatever columns the two tables happen to share.
  inner_join(ratings.group, by = "user") %>%
  filter(urid <= 5) %>%
  select(user, item, rating)
dim(test.ratings)

In [None]:
# Build the explicit training set: every rating that was not held out.
# The right join keeps all rows of `ratings.group`; `nratings` is NA for users
# that were not sampled as test users, so those users keep all of their
# ratings, while test users keep only the ratings ranked after the first 5.
train.ratings <- explicit.test.users %>%
  # Join on `user` explicitly rather than relying on a natural join over
  # whatever columns the two tables happen to share.
  right_join(ratings.group, by = "user") %>%
  filter(urid > 5 | is.na(nratings)) %>%
  select(user, item, rating)
dim(train.ratings)

In [None]:
# Save the explicit test and training splits as CSV files.
test.ratings %>% write_csv("build/bx-ratings-test.csv")
train.ratings %>% write_csv("build/bx-ratings-train.csv")

## Implicit Ratings

In [None]:
# Load the implicit BX data (integer user ID, integer book ID, numeric rating)
# into `ratings`, replacing whatever that name was bound to before, and
# switch to the shorter `user` / `item` column names.
ratings <- rename(
  read_csv("build/bx-implicit.csv", col_names = TRUE, col_types = "iid"),
  user = userID,
  item = bookID
)
# Show the table size as (rows, columns).
ratings %>% dim()

In [None]:
# Choose the implicit-feedback test users: count interactions per user, keep
# the users with at least 10, and draw 5000 of them at random.
implicit.test.users <- ratings %>%
  count(user) %>%
  rename(nratings = n) %>%
  filter(nratings >= 10) %>%
  sample_n(5000)
# Display the sampled users and confirm how many were drawn.
implicit.test.users
dim(implicit.test.users)

In [None]:
# Give every row a random within-user rank: for each user, `urid` is a random
# permutation of 1..(number of rows for that user).
ratings.group <- ratings %>%
  group_by(user) %>%
  mutate(urid = sample.int(n())) %>%
  ungroup()

In [None]:
# Build the implicit test set: for each sampled test user, hold out the 5 rows
# whose random within-user rank (`urid`) is 1..5, and set their `rating` to 1.
test.ratings <- implicit.test.users %>%
  select(user) %>%
  # Join on `user` explicitly rather than relying on a natural join over
  # whatever columns the two tables happen to share.
  inner_join(ratings.group, by = "user") %>%
  filter(urid <= 5) %>%
  select(user, item, rating) %>%
  mutate(rating = 1)
dim(test.ratings)

In [None]:
# Build the implicit training set: every row that was not held out.
# The right join keeps all rows of `ratings.group`; `nratings` is NA for users
# that were not sampled as test users, so those users keep all of their rows,
# while test users keep only the rows ranked after the first 5.
# NOTE(review): the implicit test set overwrites `rating` with 1, but this
# training set keeps the `rating` values read from the input file. Confirm
# that the difference is intended.
train.ratings <- implicit.test.users %>%
  # Join on `user` explicitly rather than relying on a natural join over
  # whatever columns the two tables happen to share.
  right_join(ratings.group, by = "user") %>%
  filter(urid > 5 | is.na(nratings)) %>%
  select(user, item, rating)
dim(train.ratings)

In [None]:
# Save the implicit test and training splits as CSV files.
test.ratings %>% write_csv("build/bx-implicit-test.csv")
train.ratings %>% write_csv("build/bx-implicit-train.csv")