import numpy as np
import surprise  # run 'pip install scikit-surprise' to install surprise


class MatrixFacto(surprise.AlgoBase):
    '''A basic rating prediction algorithm based on matrix factorization.

    A rating is estimated as the dot product of a user factor vector p_u and
    an item factor vector q_i. Both sets of vectors are learned with plain
    (unregularized) stochastic gradient descent on the squared error.

    Args:
        learning_rate: Step size used for every SGD update.
        n_epochs: Number of full passes over the training ratings.
        n_factors: Length of each user / item factor vector.
        random_state: Optional seed for the random initialization of the
            factors. ``None`` (the default) draws from numpy's global RNG,
            which is what happens when no seed is given.
    '''

    def __init__(self, learning_rate, n_epochs, n_factors, random_state=None):

        # Initialize the base class before setting our own attributes.
        surprise.AlgoBase.__init__(self)

        self.lr = learning_rate  # learning rate for SGD
        self.n_epochs = n_epochs  # number of iterations of SGD
        self.n_factors = n_factors  # number of factors
        self.random_state = random_state  # seed for factor initialization

    def fit(self, trainset):
        '''Learn the vectors p_u and q_i with SGD.

        Args:
            trainset: Training set exposing ``n_users``, ``n_items`` and
                ``all_ratings()`` (yielding ``(u, i, r_ui)`` triples).

        Returns:
            self, so that calls can be chained.
        '''

        # Let the base class record the trainset (sets self.trainset).
        surprise.AlgoBase.fit(self, trainset)

        print('Fitting data with SGD...')

        # Use a dedicated RNG when a seed was given, else numpy's global one.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Randomly initialize the user and item factors.
        p = rng.normal(0, .1, (trainset.n_users, self.n_factors))
        q = rng.normal(0, .1, (trainset.n_items, self.n_factors))

        # SGD procedure
        for _ in range(self.n_epochs):
            for u, i, r_ui in trainset.all_ratings():
                err = r_ui - np.dot(p[u], q[i])
                # Compute both steps from the *current* vectors before
                # applying either one, so that the update of q_i uses the
                # previous (non-updated) value of p_u.
                p_step = self.lr * err * q[i]
                q_step = self.lr * err * p[u]
                p[u] += p_step
                q[i] += q_step

        self.p, self.q = p, q
        self.trainset = trainset

        return self

    def estimate(self, u, i):
        '''Return the estimated rating of user u for item i.

        Args:
            u: User id, used as a row index into the user factors.
            i: Item id, used as a row index into the item factors.

        Returns:
            The scalar product between p_u and q_i if both the user and the
            item are known to the trainset, else the mean of all ratings.
        '''

        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            return np.dot(self.p[u], self.q[i])
        else:
            return self.trainset.global_mean
# Data loading. We'll use the MovieLens 100k dataset
# (https://grouplens.org/datasets/movielens/100k/).
# It will be downloaded automatically.
data = surprise.Dataset.load_builtin('ml-100k')

# `data.split(2)` and `surprise.evaluate()` are deprecated Surprise APIs and
# are not available in recent releases. Use a cross-validation iterator with
# `cross_validate` instead. A single seeded KFold is shared by every run below
# so that all algorithms are scored on the same 2 folds.
N_FOLDS = 2
FOLDS_SEED = 0
folds = surprise.model_selection.KFold(n_splits=N_FOLDS, random_state=FOLDS_SEED)

# Our own matrix factorization algorithm.
algo = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)
surprise.model_selection.cross_validate(algo, data, measures=['RMSE'], cv=folds, verbose=True)

# try a neighborhood-based algorithm (on the same data)
algo = surprise.KNNBasic()
surprise.model_selection.cross_validate(algo, data, measures=['RMSE'], cv=folds, verbose=True)

# try a more sophisticated matrix factorization algorithm (on the same data)
algo = surprise.SVD()
surprise.model_selection.cross_validate(algo, data, measures=['RMSE'], cv=folds, verbose=True)