{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split BX\n",
    "\n",
    "This notebook splits the BookCrossing data into training and test sets for a train-test sweep.\n",
    "\n",
    "For each rating set (explicit and implicit) it:\n",
    "\n",
    "1. samples `n.test.users` test users from the users with at least `min.user.ratings` ratings;\n",
    "2. holds out `n.test.ratings` randomly chosen ratings of each test user as test data;\n",
    "3. writes every remaining rating (of test and non-test users alike) as training data.\n",
    "\n",
    "The random seed is fixed in the configuration cell, so running the notebook top to bottom on a fresh kernel reproduces the same split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "library(readr)\n",
    "library(dplyr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "options(repr.matrix.max.rows = 20)\n",
    "\n",
    "# Split parameters, shared by the explicit and implicit sections.\n",
    "n.test.users = 5000     # number of users sampled as test users\n",
    "min.user.ratings = 10   # a user needs at least this many ratings to be eligible\n",
    "n.test.ratings = 5      # ratings held out per test user\n",
    "\n",
    "# Fix the RNG state so that Restart & Run All reproduces the same split.\n",
    "set.seed(42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explicit Ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "explicit.ratings = read_csv(\"build/bx-ratings.csv\", col_names = TRUE, col_types = 'iid') %>%\n",
    "    rename(user=userID, item=bookID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dim(explicit.ratings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample the test users from the users that have enough ratings.\n",
    "explicit.test.users = explicit.ratings %>%\n",
    "    group_by(user) %>%\n",
    "    summarize(nratings=n()) %>%\n",
    "    filter(nratings >= min.user.ratings) %>%\n",
    "    sample_n(n.test.users)\n",
    "explicit.test.users\n",
    "dim(explicit.test.users)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number each user's ratings in random order: urid is a random permutation of 1..n per user.\n",
    "explicit.group = explicit.ratings %>%\n",
    "    group_by(user) %>%\n",
    "    mutate(urid = sample(n())) %>%\n",
    "    ungroup()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test data: the ratings numbered 1..n.test.ratings of each test user.\n",
    "explicit.test = explicit.test.users %>%\n",
    "    select(user) %>%\n",
    "    inner_join(explicit.group, by = \"user\") %>%\n",
    "    filter(urid <= n.test.ratings) %>%\n",
    "    select(user, item, rating)\n",
    "dim(explicit.test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training data: every rating of the non-test users (nratings is NA for them after the\n",
    "# right join) plus the ratings of the test users that were not held out.\n",
    "explicit.train = explicit.test.users %>%\n",
    "    right_join(explicit.group, by = \"user\") %>%\n",
    "    filter(urid > n.test.ratings | is.na(nratings)) %>%\n",
    "    select(user, item, rating)\n",
    "dim(explicit.train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Every rating must land in exactly one of the two splits before anything is written.\n",
    "stopifnot(nrow(explicit.test) + nrow(explicit.train) == nrow(explicit.ratings))\n",
    "write_csv(explicit.test, \"build/bx-ratings-test.csv\")\n",
    "write_csv(explicit.train, \"build/bx-ratings-train.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Implicit Ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "implicit.ratings = read_csv(\"build/bx-implicit.csv\", col_names = TRUE, col_types = 'iid') %>%\n",
    "    rename(user=userID, item=bookID)\n",
    "dim(implicit.ratings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample the test users from the users that have enough ratings.\n",
    "implicit.test.users = implicit.ratings %>%\n",
    "    group_by(user) %>%\n",
    "    summarize(nratings=n()) %>%\n",
    "    filter(nratings >= min.user.ratings) %>%\n",
    "    sample_n(n.test.users)\n",
    "implicit.test.users\n",
    "dim(implicit.test.users)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Number each user's ratings in random order: urid is a random permutation of 1..n per user.\n",
    "implicit.group = implicit.ratings %>%\n",
    "    group_by(user) %>%\n",
    "    mutate(urid = sample(n())) %>%\n",
    "    ungroup()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test data: the ratings numbered 1..n.test.ratings of each test user.\n",
    "# NOTE(review): only the test split is set to rating = 1; the training split below keeps\n",
    "# the rating values read from build/bx-implicit.csv. Confirm this asymmetry is intended.\n",
    "implicit.test = implicit.test.users %>%\n",
    "    select(user) %>%\n",
    "    inner_join(implicit.group, by = \"user\") %>%\n",
    "    filter(urid <= n.test.ratings) %>%\n",
    "    select(user, item, rating) %>%\n",
    "    mutate(rating=1)\n",
    "dim(implicit.test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training data: every rating of the non-test users (nratings is NA for them after the\n",
    "# right join) plus the ratings of the test users that were not held out.\n",
    "implicit.train = implicit.test.users %>%\n",
    "    right_join(implicit.group, by = \"user\") %>%\n",
    "    filter(urid > n.test.ratings | is.na(nratings)) %>%\n",
    "    select(user, item, rating)\n",
    "dim(implicit.train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Every rating must land in exactly one of the two splits before anything is written.\n",
    "stopifnot(nrow(implicit.test) + nrow(implicit.train) == nrow(implicit.ratings))\n",
    "write_csv(implicit.test, \"build/bx-implicit-test.csv\")\n",
    "write_csv(implicit.train, \"build/bx-implicit-train.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.4.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}