{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from nb_007a import *\n", "from pandas import Series,DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Movie Lens" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Data available from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PATH = Path('data/ml-latest-small/')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Table user/movie -> rating" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ratings = pd.read_csv(PATH/'ratings.csv')\n", "ratings.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Table to get the titles of the movies." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "movies = pd.read_csv(PATH/'movies.csv')\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ratings.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def series2cat(df:DataFrame, *col_names):\n", " \"Categorifies the columns in df.\"\n", " for c in listify(col_names): df[c] = df[c].astype('category').cat.as_ordered()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "series2cat(ratings, 'userId','movieId')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ratings.userId.dtype" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "@dataclass\n", "class ColabFilteringDataset(DatasetBase):\n", " \"Base dataset for collaborative filtering\"\n", " user:Series\n", " item:Series\n", " ratings:DataFrame\n", " def __post_init__(self):\n", " self.user_ids = np.array(self.user.cat.codes, dtype=np.int64)\n", " self.item_ids = np.array(self.item.cat.codes, dtype=np.int64)\n", " \n", " def __len__(self)->int: return len(self.ratings)\n", "\n", " def __getitem__(self, idx:int)->Tuple[Tuple[int,int],float]:\n", " return (self.user_ids[idx],self.item_ids[idx]), self.ratings[idx]\n", " \n", " @property\n", " def c(self) -> int: return 1\n", " \n", " @property\n", " def n_user(self)->int: return len(self.user.cat.categories)\n", " \n", " @property\n", " def n_item(self)->int: return len(self.item.cat.categories)\n", " \n", " @classmethod\n", " def from_df(cls, rating_df:DataFrame, pct_val:float=0.2, user_name:Optional[str]=None, item_name:Optional[str]=None, \n", " rating_name:Optional[str]=None) -> Tuple['ColabFilteringDataset','ColabFilteringDataset']:\n", " \"Splits a given dataframe in a training and validation set\"\n", " if user_name is None: user_name = rating_df.columns[0]\n", " if item_name is None: item_name = rating_df.columns[1]\n", " if rating_name is None: rating_name = rating_df.columns[2]\n", " user = rating_df[user_name]\n", " item = rating_df[item_name]\n", " ratings = np.array(rating_df[rating_name], dtype=np.float32)\n", " idx = np.random.permutation(len(ratings))\n", " cut = int(pct_val * len(ratings))\n", " return (cls(user[idx[cut:]], item[idx[cut:]], ratings[idx[cut:]]),\n", " cls(user[idx[:cut]], item[idx[:cut]], ratings[idx[:cut]]))\n", " \n", " @classmethod\n", " def from_csv(cls, csv_name:str, **kwargs) -> Tuple['ColabFilteringDataset','ColabFilteringDataset']:\n", " \"Splits a given table in a csv in a training and validation set\"\n", " df = pd.read_csv(csv_name)\n", " return cls.from_df(df, **kwargs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_ds, valid_ds = ColabFilteringDataset.from_df(ratings)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(ratings), len(train_ds), len(valid_ds)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bs = 64\n", "data = DataBunch.create(train_ds, valid_ds, bs=bs, num_workers=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def trunc_normal_(x:Tensor, mean:float=0., std:float=1.) -> Tensor:\n", " \"Truncated normal initialization\"\n", " # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12\n", " return x.normal_().fmod_(2).mul_(std).add_(mean)\n", "\n", "def get_embedding(ni:int,nf:int) -> Model:\n", " \"Creates an embedding layer\"\n", " emb = nn.Embedding(ni, nf)\n", " # See https://arxiv.org/abs/1711.09160\n", " with torch.no_grad(): trunc_normal_(emb.weight, std=0.01)\n", " return emb\n", "\n", "class EmbeddingDotBias(nn.Module):\n", " \"Base model for callaborative filtering\"\n", " def __init__(self, n_factors:int, n_users:int, n_items:int, min_score:float=None, max_score:float=None):\n", " super().__init__()\n", " self.min_score,self.max_score = min_score,max_score\n", " (self.u_weight, self.i_weight, self.u_bias, self.i_bias) = [get_embedding(*o) for o in [\n", " (n_users, n_factors), (n_items, n_factors), (n_users,1), (n_items,1)\n", " ]]\n", "\n", " def forward(self, users:LongTensor, items:LongTensor) -> Tensor:\n", " dot = self.u_weight(users)* self.i_weight(items)\n", " res = dot.sum(1) + self.u_bias(users).squeeze() + self.i_bias(items).squeeze()\n", " if self.min_score is None: return res\n", " return torch.sigmoid(res) * (self.max_score-self.min_score) + self.min_score" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def get_collab_learner(n_factors:int, data:DataBunch, min_score:float=None, max_score:float=None, \n", " loss_fn:LossFunction=F.mse_loss, **kwargs) -> Learner:\n", " \"Creates a Learner for collaborative filtering\"\n", " ds = data.train_ds\n", " model = EmbeddingDotBias(n_factors, ds.n_user, ds.n_item, min_score, max_score) \n", " return Learner(data, model, loss_fn=loss_fn, **kwargs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_factors = 50\n", "learn = get_collab_learner(n_factors, data, 0, 5, wd=1e-1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.lr_find()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.recorder.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.fit_one_cycle(5, 5e-3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "math.sqrt(0.77)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }